Path: blob/21.2-virgl/src/panfrost/bifrost/bi_schedule.c
/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <[email protected]>
 */

#include "compiler.h"
#include "bi_builder.h"

/* Arguments common to worklist, passed by value for convenience */

struct bi_worklist {
        /* # of instructions in the block */
        unsigned count;

        /* Instructions in the block */
        bi_instr **instructions;

        /* Bitset of instructions in the block ready for scheduling */
        BITSET_WORD *worklist;

        /* The backwards dependency graph. nr_dependencies is the number of
         * unscheduled instructions that must still be scheduled after (before)
         * this instruction. dependents are which instructions need to be
         * scheduled before (after) this instruction. */
        unsigned *dep_counts;
        BITSET_WORD **dependents;
};

/* State of a single tuple and clause under construction */

struct bi_reg_state {
        /* Number of register writes */
        unsigned nr_writes;

        /* Register reads, expressed as (equivalence classes of)
         * sources. Only 3 reads are allowed, but up to 2 may spill as
         * "forced" for the next scheduled tuple, provided such a tuple
         * can be constructed */
        bi_index reads[5];
        unsigned nr_reads;

        /* The previous tuple scheduled (= the next tuple executed in the
         * program) may require certain writes, in order to bypass the register
         * file and use a temporary passthrough for the value. Up to 2 such
         * constraints are architecturally satisfiable */
        unsigned forced_count;
        bi_index forceds[2];
};

struct bi_tuple_state {
        /* Is this the last tuple in the clause */
        bool last;

        /* Scheduled ADD instruction, or null if none */
        bi_instr *add;

        /* Reads for previous (succeeding) tuple */
        bi_index prev_reads[5];
        unsigned nr_prev_reads;
        bi_tuple *prev;

        /* Register slot state for current tuple */
        struct bi_reg_state reg;

        /* Constants are shared in the tuple. If constant_count is nonzero, it
         * is a size for constant count. Otherwise, fau is the slot read from
         * FAU, or zero if none is assigned.
         * Ordinarily FAU slot 0 reads zero,
         * but within a tuple, that should be encoded as constant_count != 0
         * and constants[0] = constants[1] = 0 */
        unsigned constant_count;

        union {
                uint32_t constants[2];
                enum bir_fau fau;
        };

        unsigned pcrel_idx;
};

struct bi_const_state {
        unsigned constant_count;
        bool pcrel; /* applies to first const */
        uint32_t constants[2];

        /* Index of the constant into the clause */
        unsigned word_idx;
};

struct bi_clause_state {
        /* Has a message-passing instruction already been assigned? */
        bool message;

        /* Indices already accessed, this needs to be tracked to avoid hazards
         * around message-passing instructions */
        unsigned access_count;
        bi_index accesses[(BI_MAX_SRCS + 2) * 16];

        unsigned tuple_count;
        struct bi_const_state consts[8];
};

/* Determines message type by checking the table and a few special cases. Only
 * case missing is tilebuffer instructions that access depth/stencil, which
 * require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */

static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
        enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
        bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);

        if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
                return BIFROST_MESSAGE_Z_STENCIL;

        if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
                return BIFROST_MESSAGE_ATTRIBUTE;

        return msg;
}

/* Attribute, texture, and UBO load (attribute message) instructions support
 * bindless, so just check the message type */

ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
        switch (bi_message_type_for_instr(ins)) {
        case BIFROST_MESSAGE_ATTRIBUTE:
                return ins->op != BI_OPCODE_LD_GCLK_U64;
        case BIFROST_MESSAGE_TEX:
                return true;
        default:
                return false;
        }
}

/* Adds an edge to the dependency graph */

static void
bi_push_dependency(unsigned parent, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        if (!BITSET_TEST(dependents[parent], child)) {
                BITSET_SET(dependents[parent], child);
                dep_counts[child]++;
        }
}

static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        assert(index < 64);
        util_dynarray_foreach(table + index, unsigned, parent)
                bi_push_dependency(*parent, child, dependents, dep_counts);
}

static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
        assert(index < 64);
        util_dynarray_append(&table[index], unsigned, parent);
}

static bool
bi_is_sched_barrier(bi_instr *I)
{
        switch (I->op) {
        case BI_OPCODE_BARRIER:
        case BI_OPCODE_DISCARD_F32:
                return true;
        default:
                return false;
        }
}

static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder)
{
        struct util_dynarray last_read[64], last_write[64];

        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_init(&last_read[i], NULL);
                util_dynarray_init(&last_write[i], NULL);
        }

        /* Initialize dependency graph */
        for (unsigned i = 0; i < st.count; ++i) {
                st.dependents[i] =
                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

                st.dep_counts[i] = 0;
        }

        unsigned prev_msg = ~0;

        /* Populate dependency graph */
        for (signed i = st.count - 1; i >= 0; --i) {
                bi_instr *ins = st.instructions[i];

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type !=
                            BI_INDEX_REGISTER) continue;
                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
                }

                /* Keep message-passing ops in order. (This pass only cares
                 * about bundling; reordering of message-passing instructions
                 * happens during earlier scheduling.) */

                if (bi_message_type_for_instr(ins)) {
                        if (prev_msg != ~0)
                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);

                        prev_msg = i;
                }

                /* Handle schedule barriers, adding all the deps */
                if (inorder || bi_is_sched_barrier(ins)) {
                        for (unsigned j = 0; j < st.count; ++j) {
                                if (i == j) continue;

                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
                                                st.dependents, st.dep_counts);
                        }
                }

                bi_foreach_dest(ins, d) {
                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
                        unsigned dest = ins->dest[d].value;

                        unsigned count = bi_count_write_registers(ins, d);

                        for (unsigned c = 0; c < count; ++c) {
                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, dest + c, i);
                        }
                }

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;

                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                mark_access(last_read, ins->src[s].value + c, i);
                }
        }

        /* If there is a branch, all instructions depend on it, as interblock
         * execution must be purely in-order */

        bi_instr *last = st.instructions[st.count - 1];
        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
                for (signed i = st.count - 2; i >= 0; --i)
                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
        }

        /* Free the intermediate structures */
        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_fini(&last_read[i]);
                util_dynarray_fini(&last_write[i]);
        }
}

/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
 */

static bi_instr *
bi_lower_cubeface(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
                        pinstr->src[0], pinstr->src[1], pinstr->src[2]);

        pinstr->op = BI_OPCODE_CUBEFACE2;
        pinstr->dest[0] = pinstr->dest[1];
        pinstr->dest[1] = bi_null();
        pinstr->src[0] = cubeface1->dest[0];
        pinstr->src[1] = bi_null();
        pinstr->src[2] = bi_null();

        return cubeface1;
}

/* Pseudo arguments are (rbase, address lo, address hi).
 * We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */

static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c_return_i32(&b,
                        pinstr->src[1], pinstr->src[2], pinstr->src[0],
                        pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[3] = atom_c->src[2];

        return atom_c;
}

static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c1_return_i32(&b,
                        pinstr->src[0], pinstr->src[1], pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C1_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[2] = pinstr->src[1];
        pinstr->src[1] = pinstr->src[0];
        pinstr->src[3] = bi_dontcare();
        pinstr->src[0] = bi_null();

        return atom_c;
}

static bi_instr *
bi_lower_seg_add(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));

        bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
                        pinstr->preserve_null, pinstr->seg);

        pinstr->op = BI_OPCODE_SEG_ADD;
        pinstr->src[0] = pinstr->src[1];
        pinstr->src[1] = bi_null();

        assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
        pinstr->dest[0].value += 1;

        return fma;
}

static bi_instr *
bi_lower_dtsel(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *add = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(add));

        bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
                        add->src[0], add->table);
        add->src[0] = dtsel->dest[0];

        assert(bi_supports_dtsel(add));
        return dtsel;
}

/* Flatten linked list to array for O(1) indexing */

static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
        if (list_is_empty(&block->base.instructions))
                return NULL;

        *len = list_length(&block->base.instructions);
        bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));

        unsigned i = 0;

        bi_foreach_instr_in_block(block, ins)
                instructions[i++] = ins;

        return instructions;
}

/* The worklist would track instructions without outstanding dependencies.
 * For
 * debug, force in-order scheduling (no dependency graph is constructed).
 */

static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder)
{
        struct bi_worklist st = { };
        st.instructions = bi_flatten_block(block, &st.count);

        if (!st.count)
                return st;

        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));

        bi_create_dependency_graph(st, inorder);
        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

        for (unsigned i = 0; i < st.count; ++i) {
                if (st.dep_counts[i] == 0)
                        BITSET_SET(st.worklist, i);
        }

        return st;
}

static void
bi_free_worklist(struct bi_worklist st)
{
        free(st.dep_counts);
        free(st.dependents);
        free(st.instructions);
        free(st.worklist);
}

static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
{
        assert(st.dep_counts[idx] == 0);

        if (!st.dependents[idx])
                return;

        /* Iterate each dependent to remove one dependency (`done`),
         * adding dependents to the worklist where possible. */

        unsigned i;
        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
                assert(st.dep_counts[i] != 0);
                unsigned new_deps = --st.dep_counts[i];

                if (new_deps == 0)
                        BITSET_SET(st.worklist, i);
        }

        free(st.dependents[idx]);
}

/* To work out the back-to-back flag, we need to detect branches and
 * "fallthrough" branches, implied in the last clause of a block that falls
 * through to another block with *multiple predecessors*. */

static bool
bi_back_to_back(bi_block *block)
{
        /* Last block of a program */
        if (!block->base.successors[0]) {
                assert(!block->base.successors[1]);
                return false;
        }

        /* Multiple successors? We're branching */
        if (block->base.successors[1])
                return false;

        struct pan_block *succ = block->base.successors[0];
        assert(succ->predecessors);
        unsigned count = succ->predecessors->entries;

        /* Back to back only if the successor has only a single predecessor */
        return (count == 1);
}

/* Scheduler predicates */

/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
static bool
bi_can_iaddc(bi_instr *ins)
{
        return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
                        ins->src[0].swizzle == BI_SWIZZLE_H01 &&
                        ins->src[1].swizzle == BI_SWIZZLE_H01);
}

ASSERTED static bool
bi_can_fma(bi_instr *ins)
{
        /* +IADD.i32 -> *IADDC.i32 */
        if (bi_can_iaddc(ins))
                return true;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].fma;
}

static bool
bi_impacted_fadd_widens(bi_instr *I)
{
        enum bi_swizzle swz0 = I->src[0].swizzle;
        enum bi_swizzle swz1 = I->src[1].swizzle;

        return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
                (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
}

ASSERTED static bool
bi_can_add(bi_instr *ins)
{
        /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
                return false;

        /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
        if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
                return false;

        /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return
                bi_opcode_props[ins->op].add;
}

ASSERTED static bool
bi_must_last(bi_instr *ins)
{
        return bi_opcode_props[ins->op].last;
}

/* Architecturally, no single instruction has a "not last" constraint. However,
 * pseudoinstructions writing multiple destinations (expanding to multiple
 * paired instructions) can run afoul of the "no two writes on the last clause"
 * constraint, so we check for that here.
 */

static bool
bi_must_not_last(bi_instr *ins)
{
        return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
}

/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite not passing a logical message. Otherwise invalid encoding faults may
 * be raised for unknown reasons (possibly an errata).
 */

ASSERTED static bool
bi_must_message(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
                (ins->op == BI_OPCODE_DISCARD_F32);
}

static bool
bi_fma_atomic(enum bi_opcode op)
{
        switch (op) {
        case BI_OPCODE_ATOM_C_I32:
        case BI_OPCODE_ATOM_C_I64:
        case BI_OPCODE_ATOM_C1_I32:
        case BI_OPCODE_ATOM_C1_I64:
        case BI_OPCODE_ATOM_C1_RETURN_I32:
        case BI_OPCODE_ATOM_C1_RETURN_I64:
        case BI_OPCODE_ATOM_C_RETURN_I32:
        case BI_OPCODE_ATOM_C_RETURN_I64:
        case BI_OPCODE_ATOM_POST_I32:
        case BI_OPCODE_ATOM_POST_I64:
        case BI_OPCODE_ATOM_PRE_I64:
                return true;
        default:
                return false;
        }
}

ASSERTED static bool
bi_reads_zero(bi_instr *ins)
{
        return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
}

static bool
bi_reads_temps(bi_instr *ins, unsigned src)
{
        switch (ins->op) {
        /* Cannot permute a temporary */
        case BI_OPCODE_CLPER_V6_I32:
        case BI_OPCODE_CLPER_V7_I32:
                return src != 0;
        case BI_OPCODE_IMULD:
                return false;
        default:
                return true;
        }
}

static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
{
        enum bi_swizzle swizzle = I->src[src].swizzle;

        switch (I->op) {
        case BI_OPCODE_F16_TO_F32:
        case BI_OPCODE_F16_TO_S32:
        case BI_OPCODE_F16_TO_U32:
        case BI_OPCODE_MKVEC_V2I16:
        case BI_OPCODE_S16_TO_F32:
        case BI_OPCODE_S16_TO_S32:
        case BI_OPCODE_U16_TO_F32:
        case BI_OPCODE_U16_TO_U32:
                return (swizzle != BI_SWIZZLE_H00);

        case BI_OPCODE_BRANCH_F32:
        case BI_OPCODE_LOGB_F32:
        case BI_OPCODE_ILOGB_F32:
        case BI_OPCODE_FADD_F32:
        case BI_OPCODE_FCMP_F32:
        case BI_OPCODE_FREXPE_F32:
        case BI_OPCODE_FREXPM_F32:
        case BI_OPCODE_FROUND_F32:
                return (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_IADD_S32:
        case BI_OPCODE_IADD_U32:
        case BI_OPCODE_ISUB_S32:
        case BI_OPCODE_ISUB_U32:
        case BI_OPCODE_IADD_V4S8:
        case BI_OPCODE_IADD_V4U8:
        case BI_OPCODE_ISUB_V4S8:
        case BI_OPCODE_ISUB_V4U8:
                return (src == 1) && (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_S8_TO_F32:
        case BI_OPCODE_S8_TO_S32:
        case BI_OPCODE_U8_TO_F32:
        case BI_OPCODE_U8_TO_U32:
                return (swizzle != BI_SWIZZLE_B0000);

        case BI_OPCODE_V2S8_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2S16:
        case BI_OPCODE_V2U8_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2U16:
                return (swizzle != BI_SWIZZLE_B0022);

        case BI_OPCODE_IADD_V2S16:
        case BI_OPCODE_IADD_V2U16:
        case BI_OPCODE_ISUB_V2S16:
        case BI_OPCODE_ISUB_V2U16:
                return (src == 1) && (swizzle >= BI_SWIZZLE_H11);

#if 0
        /* Restriction on IADD in 64-bit clauses on G72 */
        case BI_OPCODE_IADD_S64:
        case BI_OPCODE_IADD_U64:
                return (src == 1) && (swizzle !=
                        BI_SWIZZLE_D0);
#endif

        default:
                return false;
        }
}

ASSERTED static bool
bi_reads_t(bi_instr *ins, unsigned src)
{
        /* Branch offset cannot come from passthrough */
        if (bi_opcode_props[ins->op].branch)
                return src != 2;

        /* Table can never read passthrough */
        if (bi_opcode_props[ins->op].table)
                return false;

        /* Staging register reads may happen before the succeeding register
         * block encodes a write, so effectively there is no passthrough */
        if (src == 0 && bi_opcode_props[ins->op].sr_read)
                return false;

        /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
         * same-cycle temporaries. Check the list for these hazards. */
        if (bi_impacted_t_modifiers(ins, src))
                return false;

        /* Descriptor must not come from a passthrough */
        switch (ins->op) {
        case BI_OPCODE_LD_CVT:
        case BI_OPCODE_LD_TILE:
        case BI_OPCODE_ST_CVT:
        case BI_OPCODE_ST_TILE:
        case BI_OPCODE_TEXC:
                return src != 2;
        case BI_OPCODE_BLEND:
                return src != 2 && src != 3;

        /* Else, just check if we can read any temps */
        default:
                return bi_reads_temps(ins, src);
        }
}

/* Counts the number of 64-bit constants required by a clause. TODO: We
 * might want to account for merging, right now we overestimate, but
 * that's probably fine most of the time */

static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
        unsigned count_32 = 0;

        for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
                count_32 += clause->consts[i].constant_count;

        return DIV_ROUND_UP(count_32, 2);
}

/* Would there be space for constants if we added one tuple? */

static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
        return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}

/* Updates the FAU assignment for a tuple.
 * A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */

static bool
bi_update_fau(struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                bi_instr *instr, bool fma, bool destructive)
{
        /* Maintain our own constants, for nondestructive mode */
        uint32_t copied_constants[2], copied_count;
        unsigned *constant_count = &tuple->constant_count;
        uint32_t *constants = tuple->constants;
        enum bir_fau fau = tuple->fau;

        if (!destructive) {
                memcpy(copied_constants, tuple->constants,
                                (*constant_count) * sizeof(constants[0]));
                copied_count = tuple->constant_count;

                constant_count = &copied_count;
                constants = copied_constants;
        }

        bi_foreach_src(instr, s) {
                bi_index src = instr->src[s];

                if (src.type == BI_INDEX_FAU) {
                        bool no_constants = *constant_count == 0;
                        bool no_other_fau = (fau == src.value) || !fau;
                        bool mergable = no_constants && no_other_fau;

                        if (destructive) {
                                assert(mergable);
                                tuple->fau = src.value;
                        } else if (!mergable) {
                                return false;
                        }

                        fau = src.value;
                } else if (src.type == BI_INDEX_CONSTANT) {
                        /* No need to reserve space if we have a fast 0 */
                        if (src.value == 0 && fma && bi_reads_zero(instr))
                                continue;

                        /* If there is a branch target, #0 by convention is the
                         * PC-relative offset to the target */
                        bool pcrel = instr->branch_target && src.value == 0;
                        bool found = false;

                        for (unsigned i = 0; i < *constant_count; ++i) {
                                found |= (constants[i] == src.value) &&
                                        (i != tuple->pcrel_idx);
                        }

                        /* pcrel constants are unique, so don't match */
                        if (found && !pcrel)
                                continue;

                        bool no_fau = (*constant_count > 0) || !fau;
                        bool mergable = no_fau && ((*constant_count) < 2);

                        if (destructive) {
                                assert(mergable);

                                if (pcrel)
                                        tuple->pcrel_idx = *constant_count;
                        } else if (!mergable)
                                return false;

                        constants[(*constant_count)++] = src.value;
                }
        }

        /* Constants per clause may be limited by tuple count */
        bool room_for_constants = (*constant_count == 0) ||
                bi_space_for_more_constants(clause);

        if (destructive)
                assert(room_for_constants);
        else if (!room_for_constants)
                return false;

        return true;
}

/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
 * and a source (index) from that candidate, determine whether this source is
 * "new", in the sense of requiring an additional read slot.
 * That is, checks
 * whether the specified source reads from the register file via a read slot
 * (determined by its type and placement) and whether the source was already
 * specified by a prior read slot (to avoid double counting) */

static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
{
        bi_index src = instr->src[src_idx];

        /* Only consider sources which come from the register file */
        if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
                return false;

        /* Staging register reads bypass the usual register file mechanism */
        if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
                return false;

        /* If a source is already read in the tuple, it is already counted */
        for (unsigned t = 0; t < reg->nr_reads; ++t)
                if (bi_is_word_equiv(src, reg->reads[t]))
                        return false;

        /* If a source is read in _this instruction_, it is already counted */
        for (unsigned t = 0; t < src_idx; ++t)
                if (bi_is_word_equiv(src, instr->src[t]))
                        return false;

        return true;
}

/* Given two tuples in source order, count the number of register reads of the
 * successor, determined as the number of unique words accessed that aren't
 * written by the predecessor (since those are tempable).
 */

static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
                bi_index *succ_reads, unsigned nr_succ_reads)
{
        unsigned reads = 0;

        for (unsigned i = 0; i < nr_succ_reads; ++i) {
                bool unique = true;

                for (unsigned j = 0; j < i; ++j)
                        if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
                                unique = false;

                if (!unique)
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t0))
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t1))
                        continue;

                reads++;
        }

        return reads;
}

/* Not all instructions can read from the staging passthrough (as determined by
 * reads_t), check if a given pair of instructions has such a restriction. Note
 * we also use this mechanism to prevent data races around staging register
 * reads, so we allow the input source to potentially be vector-valued */

static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
        bi_foreach_src(add, s) {
                bi_index src = add->src[s];

                if (src.type != BI_INDEX_REGISTER)
                        continue;

                unsigned count = bi_count_read_registers(add, s);
                bool read = false;

                for (unsigned d = 0; d < count; ++d)
                        read |= bi_is_equiv(fma, bi_register(src.value + d));

                if (read && !bi_reads_t(add, s))
                        return true;
        }

        return false;
}

/* Likewise for cross-tuple passthrough (reads_temps) */

static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
        bi_foreach_instr_in_tuple(succ, pins) {
                bi_foreach_src(pins, s) {
                        if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
                                        !bi_reads_temps(pins, s))
                                return true;
                }
        }

        return false;
}

/* Is a register written other than the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple.
 */

static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
                return 1;

        unsigned count = 0;

        bi_foreach_dest(instr, d) {
                if (d == 0 && bi_opcode_props[instr->op].sr_write)
                        continue;

                if (bi_is_null(instr->dest[d]))
                        continue;

                assert(instr->dest[0].type == BI_INDEX_REGISTER);
                if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
                        count++;
        }

        return count;
}

/* Instruction placement entails two questions: what subset of instructions in
 * the block can legally be scheduled? and of those which is the best? That is,
 * we seek to maximize a cost function on a subset of the worklist satisfying a
 * particular predicate. The necessary predicate is determined entirely by
 * Bifrost's architectural limitations and is described in the accompanying
 * whitepaper. The cost function is a heuristic. */

static bool
bi_instr_schedulable(bi_instr *instr,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        /* The units must match */
        if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
                return false;

        /* There can only be one message-passing instruction per clause */
        if (bi_must_message(instr) && clause->message)
                return false;

        /* Some instructions have placement requirements */
        if (bi_must_last(instr) && !tuple->last)
                return false;

        if (bi_must_not_last(instr) && tuple->last)
                return false;

        /* Message-passing instructions are not guaranteed to write within the
         * same clause (most likely they will not), so if a later instruction
         * in the clause accesses the destination, the message-passing
         * instruction can't be scheduled */
        if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
                unsigned nr = bi_count_write_registers(instr, 0);
                assert(instr->dest[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->dest[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
                unsigned nr = bi_count_read_registers(instr, 0);
                assert(instr->src[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->src[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        /* If FAU is already assigned, we may not disrupt that.
         * Do a
         * non-disruptive test update */
        if (!bi_update_fau(clause, tuple, instr, fma, false))
                return false;

        /* If this choice of FMA would force a staging passthrough, the ADD
         * instruction must support such a passthrough */
        if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
                return false;

        /* If this choice of destination would force a cross-tuple passthrough,
         * the next tuple must support that */
        if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                return false;

        /* Register file writes are limited */
        unsigned total_writes = tuple->reg.nr_writes;
        total_writes += bi_write_count(instr, live_after_temp);

        /* Last tuple in a clause can only write a single value */
        if (tuple->last && total_writes > 1)
                return false;

        /* Register file reads are limited, so count unique */

        unsigned unique_new_srcs = 0;

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        unique_new_srcs++;
        }

        unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;

        bool can_spill_to_moves = (!tuple->add);
        can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
        can_spill_to_moves &= (clause->tuple_count < 7);

        /* However, we can get an extra 1 or 2 sources by inserting moves */
        if (total_srcs > (can_spill_to_moves ? 4 : 3))
                return false;

        /* Count effective reads for the successor */
        unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
                        tuple->add ? tuple->add->dest[0] : bi_null(),
                        tuple->prev_reads, tuple->nr_prev_reads);

        /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
        if ((signed) total_writes > (4 - (signed) succ_reads))
                return false;

        return true;
}

static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
{
        signed cost = 0;

        /* Instructions that can schedule to either FMA or to ADD should be
         * deprioritized since they're easier to reschedule elsewhere */
        if (bi_can_fma(instr) && bi_can_add(instr))
                cost++;

        /* Message-passing instructions impose constraints on the registers
         * later in the clause, so schedule them as late within a clause as
         * possible (<==> prioritize them since we're backwards <==> decrease
         * cost) */
        if (bi_must_message(instr))
                cost--;

        /* Last instructions are big constraints (XXX: no effect on shader-db) */
        if (bi_must_last(instr))
                cost -= 2;

        return cost;
}

static unsigned
bi_choose_index(struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        unsigned i, best_idx = ~0;
        signed best_cost = INT_MAX;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_instr *instr = st.instructions[i];

                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                        continue;

                signed cost = bi_instr_cost(instr, tuple);

                /* Tie break in favour of later instructions, under the
                 * assumption this promotes temporary usage (reducing pressure
                 * on the register file). This is a side effect of a prepass
                 * scheduling for pressure.
                 */

                if (cost <= best_cost) {
                        best_idx = i;
                        best_cost = cost;
                }
        }

        return best_idx;
}

static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                bi_instr *instr, uint64_t live_after_temp, bool fma)
{
        bi_update_fau(clause, tuple, instr, fma, true);

        /* TODO: maybe opt a bit? or maybe doesn't matter */
        assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
        memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
        clause->access_count += BI_MAX_SRCS;
        memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
        clause->access_count += BI_MAX_DESTS;
        tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
        }
}

/* Choose the best instruction and pop it off the worklist. Returns NULL if no
 * instruction is available. This function is destructive. */

static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
                return bi_lower_cubeface(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
                return bi_lower_atom_c(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
                return bi_lower_atom_c1(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
                return bi_lower_seg_add(ctx, clause, tuple);
        else if (tuple->add && tuple->add->table)
                return bi_lower_dtsel(ctx, clause, tuple);

        /* TODO: Optimize these moves */
        if (!fma && tuple->nr_prev_reads > 3) {
                /* Only spill by one source for now */
                assert(tuple->nr_prev_reads == 4);

                /* Pick a source to spill */
                bi_index src = tuple->prev_reads[0];

                /* Schedule the spill */
                bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
                bi_instr *mov = bi_mov_i32_to(&b, src, src);
                bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
                return mov;
        }

#ifndef NDEBUG
        /* Don't pair instructions if debugging */
        if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
                return NULL;
#endif

        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);

        if (idx >= st.count)
                return NULL;

        /* Update state to reflect taking the instruction */
        bi_instr *instr = st.instructions[idx];

        BITSET_CLEAR(st.worklist, idx);
        bi_update_worklist(st, idx);
        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);

        /* Fixups */
        if (instr->op == BI_OPCODE_IADD_U32 && fma) {
                assert(bi_can_iaddc(instr));
                instr->op = BI_OPCODE_IADDC_I32;
                instr->src[2] = bi_zero();
        }

        return instr;
}

/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
 * to a passthrough register.
 * If except_zero is true, the zeroth (first) source
 * is skipped, so staging register reads are not accidentally encoded as
 * passthrough (which is impossible) */

static void
bi_use_passthrough(bi_instr *ins, bi_index old,
                enum bifrost_packed_src new,
                bool except_zero)
{
        /* Optional for convenience */
        if (!ins || bi_is_null(old))
                return;

        bi_foreach_src(ins, i) {
                if (i == 0 && except_zero)
                        continue;

                if (bi_is_word_equiv(ins->src[i], old)) {
                        ins->src[i].type = BI_INDEX_PASS;
                        ins->src[i].value = new;
                        ins->src[i].reg = false;
                        ins->src[i].offset = 0;
                }
        }
}

/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
 * intertuple passthroughs where necessary. Passthroughs are allowed as a
 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
 * opposite the order of execution. This is deliberate -- if both FMA and ADD
 * write to the same logical register, the next executed tuple will get the
 * latter result. There's no interference issue under the assumption of correct
 * register allocation. */

static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
{
        bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;

        if (prec.add) {
                bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
                bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
        }

        if (prec.fma) {
                bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
                bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
        }
}

static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_FAU) continue;

                bi_index pass = bi_passthrough(ins->src[s].offset ?
                                BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);

                ins->src[s] = bi_replace_index(ins->src[s], pass);
        }
}

static void
bi_rewrite_zero(bi_instr *ins, bool fma)
{
        bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);

        bi_foreach_src(ins, s) {
                bi_index src = ins->src[s];

                if (src.type == BI_INDEX_CONSTANT && src.value == 0)
                        ins->src[s] = bi_replace_index(src, zero);
        }
}

/* Assumes #0 to {T, FAU} rewrite has already occurred */

static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_CONSTANT) continue;

                uint32_t cons = ins->src[s].value;

                ASSERTED bool lo = (cons == (constant & 0xffffffff));
                bool hi = (cons == (constant >> 32ull));

                /* PC offsets always live in the upper half, set to zero by
                 * convention before pack time. (This is safe, since if you
                 * wanted to compare against zero, you would use a BRANCHZ
                 * instruction instead.) */
                if (cons == 0 && ins->branch_target != NULL) {
                        assert(pcrel);
                        hi = true;
                        lo = false;
                } else if (pcrel) {
                        hi = false;
                }

                assert(lo || hi);

                ins->src[s] = bi_replace_index(ins->src[s],
                                bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
                                        BIFROST_SRC_FAU_LO));
        }
}

/* Constructs a constant state given a tuple state.
 * This has the
 * postcondition that pcrel applies to the first constant by convention,
 * and PC-relative constants will be #0 by convention here, so swap to
 * match if needed */

static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
        struct bi_const_state consts = {
                .constant_count = tuple->constant_count,
                .constants[0] = tuple->constants[0],
                .constants[1] = tuple->constants[1],
                .pcrel = tuple->add && tuple->add->branch_target,
        };

        /* pcrel applies to the first constant by convention, and
         * PC-relative constants will be #0 by convention here, so swap
         * to match if needed */
        if (consts.pcrel && consts.constants[0]) {
                assert(consts.constant_count == 2);
                assert(consts.constants[1] == 0);

                consts.constants[1] = consts.constants[0];
                consts.constants[0] = 0;
        }

        return consts;
}

/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles
 * modification-related swapping, to satisfy format 12 and the lack of
 * modification for tuple count 5/8 in EC0.
 */

static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
        /* At this point in the constant merge algorithm, pcrel constants are
         * treated as zero, so pcrel implies at least one constant is zero */
        assert(!pcrel || (c0 == 0 || c1 == 0));

        /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
        uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
        uint32_t lo = (c0 == hi) ?
                c1 : c0;

        /* Merge in the selected order */
        return lo | (((uint64_t) hi) << 32ull);
}

static unsigned
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *merged, unsigned *pcrel_pair)
{
        unsigned merge_count = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 2) continue;

                unsigned idx = ~0;
                uint64_t val = bi_merge_u32(consts[t].constants[0],
                                consts[t].constants[1], consts[t].pcrel);

                /* Skip the pcrel pair if assigned, because if one is assigned,
                 * this one is not pcrel by uniqueness so it's a mismatch */
                for (unsigned s = 0; s < merge_count; ++s) {
                        if (merged[s] == val && (*pcrel_pair) != s) {
                                idx = s;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = merge_count++;
                        merged[idx] = val;

                        if (consts[t].pcrel)
                                (*pcrel_pair) = idx;
                }

                consts[t].word_idx = idx;
        }

        return merge_count;
}

static unsigned
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
{
        bool pending = false, pending_pcrel = false;
        uint32_t pending_single = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 1) continue;

                uint32_t val = consts[t].constants[0];
                unsigned idx = ~0;

                /* Try to match, but don't match pcrel with non-pcrel, even
                 * though we can merge a pcrel with a non-pcrel single */
                for (unsigned i = 0; i < pair_count; ++i) {
                        bool lo = ((pairs[i] & 0xffffffff) == val);
                        bool hi = ((pairs[i] >> 32) == val);
                        bool match = (lo || hi);
                        match &= ((*pcrel_pair) != i);
                        if (match && !consts[t].pcrel) {
                                idx = i;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = pair_count;

                        if (pending && pending_single != val) {
                                assert(!(pending_pcrel && consts[t].pcrel));
                                bool pcrel = pending_pcrel || consts[t].pcrel;

                                if (pcrel)
                                        *pcrel_pair = idx;

                                pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);

                                pending = pending_pcrel = false;
                        } else {
                                pending = true;
                                pending_pcrel = consts[t].pcrel;
                                pending_single = val;
                        }
                }

                consts[t].word_idx = idx;
        }

        /* Shift so it works whether pending_pcrel is set or not */
        if (pending) {
                if (pending_pcrel)
                        *pcrel_pair = pair_count;

                pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
        }

        return pair_count;
}

static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
{
        unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
        return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
}

/* Swap two constants at word i and i+1 by swapping their actual positions and
 * swapping all references so the meaning of the clause is preserved */

static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
{
        uint64_t tmp_pair = pairs[i + 0];
        pairs[i + 0] = pairs[i + 1];
        pairs[i + 1] = tmp_pair;

        for (unsigned t = 0; t < 8; ++t) {
                if (consts[t].word_idx == i)
                        consts[t].word_idx = (i + 1);
                else if (consts[t].word_idx == (i + 1))
                        consts[t].word_idx = i;
        }
}

/* Given merged constants, one of which might be PC-relative, fix up the M
 * values so the PC-relative constant (if it exists) has the M1=4 modification
 * and other constants are used as-is (which might require swapping) */

static
unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                uint64_t *pairs, unsigned *pcrel_idx,
                unsigned tuple_count, unsigned constant_count)
{
        unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;

        /* Clauses with these tuple counts lack an M field for the packed EC0,
         * so EC0 cannot be PC-relative, which might require swapping (and
         * possibly adding an unused constant) to fit */

        if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
                constant_count = MAX2(constant_count, 2);
                *pcrel_idx = 1;
                bi_swap_constants(consts, pairs, 0);
        }

        /* EC0 might be packed free, after that constants are packed in pairs
         * (with clause format 12), with M1 values computed from the pair */

        for (unsigned i = start; i < constant_count; i += 2) {
                bool swap = false;
                bool last = (i + 1) == constant_count;

                unsigned A1 = (pairs[i] >> 60);
                unsigned B1 = (pairs[i + 1] >> 60);

                if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
                        /* PC-relative constant must be E0, not E1 */
                        swap = (*pcrel_idx == (i + 1));

                        /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
                         * equivalent to A = (B + 4) mod 16 and that we can
                         * control A */
                        unsigned B = swap ? A1 : B1;
                        unsigned A = (B + 4) & 0xF;
                        pairs[*pcrel_idx] |= ((uint64_t) A) << 60;

                        /* Swapped if swap set, identity if swap not set */
                        *pcrel_idx = i;
                } else {
                        /* Compute M1 value if we don't swap */
                        unsigned M1 = (16 + A1 - B1) & 0xF;

                        /* For M1 = 0 or M1 >= 8, the constants are unchanged.
                         * Otherwise we have 0 < (A1 - B1) % 16 < 8, which
                         * implies (B1 - A1) % 16 >= 8, so swapping will let
                         * them be used unchanged */
                        swap = (M1 != 0) && (M1 < 8);

                        /* However, we can't swap the last constant, so we
                         * force M1 = 0 instead for this case */
                        if (last && swap) {
                                pairs[i + 1] |= pairs[i] & (0xfull << 60);
                                swap = false;
                        }
                }

                if (swap) {
                        assert(!last);
                        bi_swap_constants(consts, pairs, i);
                }
        }

        return constant_count;
}

/* Schedule a single clause. If no instructions remain, return NULL. */

static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
{
        struct bi_clause_state clause_state = { 0 };
        bi_clause *clause = rzalloc(ctx, bi_clause);
        bi_tuple *tuple = NULL;

        const unsigned max_tuples = ARRAY_SIZE(clause->tuples);

        /* TODO: Decide flow control better */
        clause->flow_control = BIFROST_FLOW_NBTB;

        /* The last clause can only write one instruction, so initialize that */
        struct bi_reg_state reg_state = {};
        bi_index prev_reads[5] = { bi_null() };
        unsigned nr_prev_reads = 0;

        /* We need to track future liveness. The main *live set tracks what is
         * live at the current point in the program we are scheduling, but to
         * determine temp eligibility, we instead want what will be live after
         * the next tuple in the program. If you scheduled forwards, you'd need
         * a crystal ball for this. Luckily we schedule backwards, so we just
         * delay updates to the live_after_temp by an extra tuple.
         */
        uint64_t live_after_temp = *live;
        uint64_t live_next_tuple = live_after_temp;

        do {
                struct bi_tuple_state tuple_state = {
                        .last = (clause->tuple_count == 0),
                        .reg = reg_state,
                        .nr_prev_reads = nr_prev_reads,
                        .prev = tuple,
                        .pcrel_idx = ~0,
                };

                assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
                memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));

                unsigned idx = max_tuples - clause->tuple_count - 1;

                tuple = &clause->tuples[idx];

                if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
                        unsigned nr = bi_count_read_registers(clause->message, 0);
                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
                }

                /* Since we schedule backwards, we schedule ADD first */
                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                tuple->add = tuple_state.add;

                /* Update liveness from the new instructions */
                if (tuple->add)
                        *live = bi_postra_liveness_ins(*live, tuple->add);

                if (tuple->fma)
                        *live = bi_postra_liveness_ins(*live, tuple->fma);

                /* Rotate in the new per-tuple liveness */
                live_after_temp = live_next_tuple;
                live_next_tuple = *live;

                /* We may have a message, but only one per clause */
                if (tuple->add && bi_must_message(tuple->add)) {
                        assert(!clause_state.message);
                        clause_state.message = true;

                        clause->message_type =
                                bi_message_type_for_instr(tuple->add);
                        clause->message = tuple->add;

                        switch (tuple->add->op) {
                        case BI_OPCODE_ATEST:
                                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                break;
                        case BI_OPCODE_LD_TILE:
                                if (!ctx->inputs->is_blend)
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                break;
                        case BI_OPCODE_BLEND:
                                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                break;
                        default:
                                break;
                        }
                }

                clause_state.consts[idx] = bi_get_const_state(&tuple_state);

                /* Before merging constants, eliminate zeroes, otherwise the
                 * merging will fight over the #0 that never gets read (and is
                 * never marked as read by update_fau) */
                if (tuple->fma && bi_reads_zero(tuple->fma))
                        bi_rewrite_zero(tuple->fma, true);

                /* Rewrite away FAU, constant write is deferred */
                if (!tuple_state.constant_count) {
                        tuple->fau_idx = tuple_state.fau;
                        bi_rewrite_fau_to_pass(tuple);
                }

                /* Use passthrough register for cross-stage accesses.
                 * Since
                 * there are just FMA and ADD stages, that means we rewrite to
                 * passthrough the sources of the ADD that read from the
                 * destination of the FMA */

                if (tuple->fma) {
                        bi_use_passthrough(tuple->add, tuple->fma->dest[0],
                                        BIFROST_SRC_STAGE, false);
                }

                /* Don't add an empty tuple, unless the worklist has nothing
                 * but a (pseudo)instruction failing to schedule due to a "not
                 * last instruction" constraint */

                int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
                bool not_last = (some_instruction > 0) &&
                        bi_must_not_last(st.instructions[some_instruction - 1]);

                bool insert_empty = tuple_state.last && not_last;

                if (!(tuple->fma || tuple->add || insert_empty))
                        break;

                clause->tuple_count++;

                /* Adding enough tuples might overflow constants */
                if (!bi_space_for_more_constants(&clause_state))
                        break;

#ifndef NDEBUG
                /* Don't schedule more than 1 tuple if debugging */
                if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
                        break;
#endif

                /* Link through the register state */
                STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
                memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
                nr_prev_reads = tuple_state.reg.nr_reads;
                clause_state.tuple_count++;
        } while(clause->tuple_count < 8);

        /* Don't schedule an empty clause */
        if (!clause->tuple_count)
                return NULL;

        /* Before merging, rewrite away any tuples that read only zero */
        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];
                struct bi_const_state *st = &clause_state.consts[i];

                if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
                        continue;

                bi_foreach_instr_in_tuple(tuple, ins)
                        bi_rewrite_zero(ins, false);

                /* Constant has been demoted to FAU, so don't pack it separately */
                st->constant_count = 0;

                /* Default */
                assert(tuple->fau_idx == BIR_FAU_ZERO);
        }

        uint64_t constant_pairs[8] = { 0 };
        unsigned pcrel_idx = ~0;
        unsigned constant_words =
                bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);

        constant_words = bi_apply_constant_modifiers(clause_state.consts,
                        constant_pairs, &pcrel_idx, clause->tuple_count,
                        constant_words);

        clause->pcrel_idx = pcrel_idx;

        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];

                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
                if (clause_state.consts[i].constant_count == 0)
                        continue;

                /* FAU is already handled */
                assert(!tuple->fau_idx);

                unsigned word_idx = clause_state.consts[i].word_idx;
                assert(word_idx <= 8);

                /* We could try to merge regardless of bottom bits as well, but
                 * that's probably diminishing returns */
                uint64_t pair = constant_pairs[word_idx];
                unsigned lo = pair & 0xF;

                tuple->fau_idx = bi_constant_field(word_idx) | lo;
                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
        }

        clause->constant_count = constant_words;
        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));

        /* Branches must be last, so this can be factored out */
        bi_instr *last = clause->tuples[max_tuples - 1].add;
        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
        clause->block = block;

        /* TODO: scoreboard assignment post-sched */
        clause->dependencies |= (1
                        << 0);

        /* We emit in reverse and emitted to the back of the tuples array, so
         * move it up front for easy indexing */
        memmove(clause->tuples,
                        clause->tuples + (max_tuples - clause->tuple_count),
                        clause->tuple_count * sizeof(clause->tuples[0]));

        /* Use passthrough register for cross-tuple accesses. Note this is
         * after the memmove, so this is forwards. Skip the first tuple since
         * there is nothing before it to passthrough */

        for (unsigned t = 1; t < clause->tuple_count; ++t)
                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);

        return clause;
}

static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* Back-to-back bit affects only the last clause of a block,
         * the rest are implicitly true */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (!bi_back_to_back(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */

        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->base.instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}

static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (s == 0 && bi_opcode_props[ins->op].sr_read)
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return
                true;
}

void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}

/* On v6, ATEST cannot be the first clause of a shader, add a NOP if needed */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        pan_block *block = list_first_entry(&ctx->blocks, pan_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || clause->message_type != BIFROST_MESSAGE_ATEST)
                return;

        /* Add a NOP so we can wait for the dependencies required for ATEST to
         * execute */

        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP_I32;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}

void
bi_schedule(bi_context *ctx)
{
        /* Fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_block *bblock = (bi_block *) block;
                bi_schedule_block(ctx, bblock);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}

#ifndef NDEBUG

static bi_builder *
bit_builder(void *memctx)
{
        bi_context *ctx = rzalloc(memctx, bi_context);
        list_inithead(&ctx->blocks);

        bi_block *blk = rzalloc(ctx, bi_block);

        blk->base.predecessors = _mesa_set_create(blk,
                        _mesa_hash_pointer,
                        _mesa_key_pointer_equal);

        list_addtail(&blk->base.link, &ctx->blocks);
        list_inithead(&blk->base.instructions);

        bi_builder *b = rzalloc(memctx, bi_builder);
        b->shader = ctx;
        b->cursor = bi_after_block(blk);
        return b;
}

#define TMP() bi_temp(b->shader)

static void
bi_test_units(bi_builder *b)
{
        bi_instr *mov = bi_mov_i32_to(b, TMP(), TMP());
        assert(bi_can_fma(mov));
        assert(bi_can_add(mov));
        assert(!bi_must_last(mov));
        assert(!bi_must_message(mov));
        assert(bi_reads_zero(mov));
        assert(bi_reads_temps(mov, 0));
        assert(bi_reads_t(mov, 0));

        bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero(), BI_ROUND_NONE);
        assert(bi_can_fma(fma));
        assert(!bi_can_add(fma));
        assert(!bi_must_last(fma));
        assert(!bi_must_message(fma));
        assert(bi_reads_zero(fma));
        for (unsigned i = 0; i < 3; ++i) {
                assert(bi_reads_temps(fma, i));
                assert(bi_reads_t(fma, i));
        }

        bi_instr *load = bi_load_i128_to(b, TMP(), TMP(), TMP(), BI_SEG_UBO);
        assert(!bi_can_fma(load));
        assert(bi_can_add(load));
        assert(!bi_must_last(load));
        assert(bi_must_message(load));
        for (unsigned i = 0; i < 2; ++i) {
                assert(bi_reads_temps(load, i));
                assert(bi_reads_t(load,
                        i));
        }

        bi_instr *blend = bi_blend_to(b, TMP(), TMP(), TMP(), TMP(), TMP(), 4);
        assert(!bi_can_fma(load));
        assert(bi_can_add(load));
        assert(bi_must_last(blend));
        assert(bi_must_message(blend));
        for (unsigned i = 0; i < 4; ++i)
                assert(bi_reads_temps(blend, i));
        assert(!bi_reads_t(blend, 0));
        assert(bi_reads_t(blend, 1));
        assert(!bi_reads_t(blend, 2));
        assert(!bi_reads_t(blend, 3));
}

int bi_test_scheduler(void)
{
        void *memctx = NULL;

        bi_test_units(bit_builder(memctx));

        return 0;
}
#endif