Path: blob/21.2-virgl/src/broadcom/compiler/qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
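
/* Worked example of the two-pass dependency walk (the sequence is
 * illustrative, not taken from a real shader): for
 *
 *     A: mov  rf3, unif
 *     B: add  rf4, rf3, rf0
 *     C: mov  rf3, unif
 *
 * the forward pass adds a read-after-write edge A -> B (B reads
 * last_rf[3], which is A at that point) and a write-after-write edge
 * A -> C. The reverse pass walks C, B, A with dir == R and adds the
 * write-after-read edge B -> C that the forward pass cannot see,
 * tagging it as WAR through add_dep()'s edge_data.
 */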
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->sig.ldtlb || inst->sig.ldtlbu)
                return true;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                if (!n->inst->qpu.sig.small_imm) {
                        add_read_dep(state,
                                     state->last_rf[n->inst->qpu.raddr_b], n);
                }
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}

static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        switch (waddr) {
        case V3D_QPU_WADDR_TMUS:
        case V3D_QPU_WADDR_TMUSCM:
        case V3D_QPU_WADDR_TMUSF:
        case V3D_QPU_WADDR_TMUSLOD:
        case V3D_QPU_WADDR_TMUA:
        case V3D_QPU_WADDR_TMUAU:
                return true;
        default:
                return false;
        }
}

static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        if (devinfo->ver < 40)
                return false;

        if (tmu_write_is_sequence_terminator(waddr))
                return false;

        if (waddr == V3D_QPU_WADDR_TMUD)
                return false;

        return true;
}
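
/* For context, a sketch of a TMU access sequence (the register names are
 * real, the values illustrative): a lookup is a series of writes to TMU
 * registers that is fired by a write to one of the sequence terminators
 * listed in tmu_write_is_sequence_terminator(), e.g.
 *
 *     mov tmud, data   ; data write, never reorderable
 *     mov tmua, addr   ; terminator: issues the lookup
 *
 * so can_reorder_tmu_write() only lets non-TMUD, non-terminator writes
 * from independent sequences be interleaved, and only on V3D 4.x.
 */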
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses. There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        if (state->devinfo->ver >= 40)
                                add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes. We handle this by
         * serializing all VPM operations for now.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0. We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}
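
/* The scoreboard below tracks, per scheduler tick, the recent history
 * that the hazard checks in choose_instruction_to_schedule() need: the
 * ticks of the last SFU/ldvary/unifa/setmsf/thrsw/branch events, and
 * whether this shader has already locked the TLB scoreboard.
 */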
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool tlb_locked;
        bool fixup_ldvary;
        int ldvary_count;
};

static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}

static bool
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
                        uint32_t waddr)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
            inst->raddr_a == waddr)
                return true;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
            !inst->sig.small_imm && (inst->raddr_b == waddr))
                return true;

        return false;
}

static bool
mux_read_stalls(struct choose_scoreboard *scoreboard,
                const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
                qpu_instruction_uses_rf(inst,
                                        scoreboard->last_stallable_sfu_reg);
}
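
/* Illustration of the stall tracked above (the registers are made up):
 * with last_stallable_sfu_reg == 10 after
 *
 *     recip rf10, rf0
 *     add   rf11, rf10, rf1   ; reads rf10 on the very next tick
 *
 * the add would stall waiting for the SFU result. mux_read_stalls()
 * reports this so the scheduler can penalize, rather than forbid, such
 * instructions (see MAX_SCHEDULE_PRIORITY below).
 */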
/* We define a max schedule priority to allow negative priorities as a
 * result of subtracting this max when an instruction stalls. So
 * instructions that stall have lower priority than regular instructions.
 */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return next_score;
        next_score++;

        /* We should increase the maximum if we assert here */
        assert(next_score < MAX_SCHEDULE_PRIORITY);

        return baseline_score;
}
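
/* With the scores above, and all else being equal, ready instructions
 * drain in roughly this order (illustrative):
 *
 *     mov tmu0_s, x    ; 3: TMU setup goes early to hide latency
 *     fadd rf2, ...    ; 2: plain ALU work in the middle
 *     ldtmu r4         ; 1: TMU result collection late
 *     mov tlb, color   ; 0: TLB access last, to delay locking
 */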
static bool
qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
                          enum v3d_qpu_waddr waddr)
{
        return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
                v3d_qpu_magic_waddr_is_sfu(waddr) ||
                v3d_qpu_magic_waddr_is_tlb(waddr) ||
                v3d_qpu_magic_waddr_is_vpm(waddr) ||
                v3d_qpu_magic_waddr_is_tsy(waddr));
}

static bool
qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_uses_vpm(inst))
                return true;
        if (v3d_qpu_uses_sfu(inst))
                return true;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
                        return true;
                }

                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return true;

                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                    inst->alu.mul.magic_write &&
                    qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
                        return true;
                }
        }

        return (inst->sig.ldvpm ||
                inst->sig.ldtmu ||
                inst->sig.ldtlb ||
                inst->sig.ldtlbu ||
                inst->sig.wrtmuc);
}

static bool
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                 const struct v3d_qpu_instr *a,
                                 const struct v3d_qpu_instr *b)
{
        const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
        const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);

        /* We can always do one peripheral access per instruction. */
        if (!a_uses_peripheral || !b_uses_peripheral)
                return true;

        if (devinfo->ver < 41)
                return false;

        /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
         * WRTMUC with a TMU magic register write (other than tmuc).
         */
        if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
            (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
                return true;
        }

        if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
            (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
                return true;
        }

        return false;
}

/* Compute a bitmask of which rf registers are used between
 * the two instructions.
 */
static uint64_t
qpu_raddrs_used(const struct v3d_qpu_instr *a,
                const struct v3d_qpu_instr *b)
{
        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);

        uint64_t raddrs_used = 0;
        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << a->raddr_a);
        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << a->raddr_b);
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << b->raddr_a);
        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << b->raddr_b);

        return raddrs_used;
}
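
/* Example of the port limit this enforces (register numbers made up):
 * pairing "add ... rf10, rf11" with "mul ... rf11, rf12" needs
 * {rf10, rf11, rf12}, three registers for two raddr ports, so
 * qpu_merge_raddrs() below fails; pairing it with "mul ... rf10, rf11"
 * needs only two and succeeds, with raddr_a = 10 and raddr_b = 11.
 */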
/* Take two instructions and attempt to merge their raddr fields
 * into one merged instruction. Returns false if the two instructions
 * access more than two different rf registers between them, or more
 * than one rf register and one small immediate.
 */
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr)
{
        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
                if (naddrs > 1)
                        return false;

                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm = true;
                result->raddr_b = add_instr->sig.small_imm ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(1ll << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
                                result->alu.add.a = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
                                result->alu.mul.a = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
                        result->alu.add.a = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
                        result->alu.mul.a = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b = V3D_QPU_MUX_B;
                }
        }

        return true;
}

static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

static void
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;
}
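
/* Sketch of what the conversion above enables: "add rf2, rf0, rf1" from
 * one instruction and "add rf3, rf0, rf1" from another can still be
 * paired by rewriting the second as V3D_QPU_M_ADD on the mul unit,
 * yielding "add rf2, rf0, rf1 ; add rf3, rf0, rf1" in a single QPU
 * instruction (the mul-unit encoding is then validated by the
 * v3d_qpu_instr_pack() call in qpu_merge_inst()).
 */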
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        if (!qpu_compatible_peripheral_access(devinfo, a, b))
                return false;

        struct v3d_qpu_instr merge = *a;
        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;

        struct v3d_qpu_instr mul_inst;
        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op == V3D_QPU_A_NOP) {
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = a;
                }
                /* If a's add op is used but its mul op is not, then see if we
                 * can convert either a's add op or b's add op to a mul op
                 * so we can merge.
                 */
                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                         can_do_add_as_mul(b->alu.add.op)) {
                        mul_inst = *b;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge.alu.mul = mul_inst.alu.mul;

                        merge.flags.mc = b->flags.ac;
                        merge.flags.mpf = b->flags.apf;
                        merge.flags.muf = b->flags.auf;

                        add_instr = a;
                        mul_instr = &mul_inst;
                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                           can_do_add_as_mul(a->alu.add.op)) {
                        mul_inst = *a;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge = mul_inst;
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = &mul_inst;
                } else {
                        return false;
                }
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;

                mul_instr = b;
                add_instr = a;
        }

        if (add_instr && mul_instr &&
            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
                return false;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

static inline bool
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
{
        return inst->sig.ldunif || inst->sig.ldunifrf;
}

static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst);
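
/**
 * Picks the next DAG head to schedule, subject to the hazard checks
 * below. With prev_inst == NULL this selects the first half of a new
 * QPU instruction; with prev_inst set it only returns candidates that
 * qpu_merge_inst() can pack into the same instruction as prev_inst.
 */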
static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                                 scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left. We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner. If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of
                         * another branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded
                         * in the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr,
                                  const struct v3d_device_info *devinfo)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
                scoreboard->last_unifa_write_tick = scoreboard->tick;
}

static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
                                      const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_instr_is_sfu(inst)) {
                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
        }
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst,
                             const struct v3d_device_info *devinfo)
{
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr,
                                                          devinfo);
                } else {
                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                              inst);
                }

                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
                        scoreboard->last_setmsf_tick = scoreboard->tick;
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr,
                                                          devinfo);
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}
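
/* The ticks recorded above feed the distance checks during selection:
 * for example, r4 must not be read within 2 ticks of a magic SFU write
 * (mux_reads_too_soon()) and an ldunifa must stay more than 3 ticks
 * away from the last unifa write.
 */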
static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "  t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "   - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
                                    enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus. If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
            v3d_qpu_waits_on_tmu(after)) {
                return 100;
        }

        /* Assume that anything depending on us is consuming the SFU result. */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}

static uint32_t
instruction_latency(const struct v3d_device_info *devinfo,
                    struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(devinfo,
                                                   before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        if (v3d_qpu_instr_is_sfu(before_inst))
                return 2;

        return latency;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;
        struct v3d_compile *c = (struct v3d_compile *) state;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(c->devinfo, n,
                                                               child)));
        }
}
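
/* Worked example (illustrative): for the chain
 *
 *     mov tmu0_s, x  ->  ldtmu r4  ->  fadd rf2, r4, rf0
 *
 * the bottom-up traversal assigns the fadd a delay of 1, the ldtmu
 * 1 + 1 = 2, and the tmu0_s write 2 + 100 = 102 through
 * magic_waddr_latency(), so the head of a texturing chain wins delay
 * tie-breaks in choose_instruction_to_schedule().
 */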
/* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
 * should be called on it later to finish pruning the other edges.)
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

static void
mark_instruction_scheduled(const struct v3d_device_info *devinfo,
                           struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(devinfo, node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

static void
insert_scheduled_instruction(struct v3d_compile *c,
                             struct qblock *block,
                             struct choose_scoreboard *scoreboard,
                             struct qinst *inst)
{
        list_addtail(&inst->link, &block->instructions);

        update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
        c->qpu_inst_count++;
        scoreboard->tick++;
}

static struct qinst *
vir_nop()
{
        struct qreg undef = vir_nop_reg();
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

static bool
qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
                              const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* GFXH-1625: TMUWT not allowed in the final instruction. */
                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return false;

                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}
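
/* In the thrend checks above, slot 2 is the final instruction of the
 * program, which is why the TLB and TMUWT restrictions are keyed to
 * slot == 2 while the uniform-stream restriction covers slots 1 and 2.
 */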
/**
 * This is called when trying to merge a thrsw back into the instruction stream
 * of instructions that were scheduled *before* the thrsw signal to fill its
 * delay slots. Because the actual execution of the thrsw happens after the
 * delay slots, it is usually safe to do this, but there are some cases that
 * need special care.
 */
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                          const struct qinst *qinst,
                                          uint32_t slot)
{
        /* No scheduling SFU when the result would land in the other
         * thread. The simulator complains for safety, though it
         * would only occur for dead code in our case.
         */
        if (slot > 0 &&
            qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
             v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                return false;
        }

        if (slot > 0 && qinst->qpu.sig.ldvary)
                return false;

        /* unifa and the following 3 instructions can't overlap a
         * thread switch/end. The docs further clarify that this means
         * the cycle at which the actual thread switch/end happens
         * and not when the thrsw instruction is processed, which would
         * be after the 2 delay slots following the thrsw instruction.
         * This means that we can move a thrsw up to the instruction
         * right after unifa:
         *
         * unifa, r5
         * thrsw
         * delay slot 1
         * delay slot 2
         * Thread switch happens here, 4 instructions away from unifa
         */
        if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
                return false;

        return true;
}
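
/* Timeline sketch of why moving instructions up is usually safe:
 *
 *     thrsw          <- signal processed here
 *     delay slot 1
 *     delay slot 2
 *                    <- the switch actually happens here
 *
 * Everything placed in the delay slots still executes on the current
 * thread, so the checks above and below only need to reject state that
 * must not straddle the switch (SFU results, ldvary, unifa reads in
 * flight).
 */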
/**
 * This is called for instructions scheduled *after* a thrsw signal that may
 * land in the delay slots of the thrsw. Because these instructions were
 * scheduled after the thrsw, we need to be careful when placing them into
 * the delay slots, since that means that we are moving them ahead of the
 * thread switch and we need to ensure that is not a problem.
 */
static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst)
{
        const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
        assert(slot <= 2);

        /* We merge thrsw instructions back into the instruction stream
         * manually, so any instructions scheduled after a thrsw should be
         * in the actual delay slots and not in the same slot as the thrsw.
         */
        assert(slot >= 1);

        /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
        if (qinst->qpu.sig.thrsw)
                return false;

        /* The restrictions for instructions scheduled before the thrsw
         * also apply to instructions scheduled after the thrsw that we want
         * to place in its delay slots.
         */
        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                return false;

        /* TLB access is disallowed until scoreboard wait is executed, which
         * we do on the last thread switch.
         */
        if (qpu_inst_is_tlb(&qinst->qpu))
                return false;

        /* Instruction sequence restrictions: Branch is not allowed in delay
         * slots of a thrsw.
         */
        if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        /* Miscellaneous restrictions: At the point of a thrsw we need to have
         * at least one outstanding lookup or TSY wait.
         *
         * So avoid placing TMU instructions scheduled after the thrsw into
         * its delay slots or we may be compromising the integrity of our TMU
         * sequences. Also, notice that if we moved these instructions into
         * the delay slots of a previous thrsw we could overflow our TMU output
         * fifo, since we could be effectively pipelining a lookup scheduled
         * after the thrsw into the sequence before the thrsw.
         */
        if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
            qinst->qpu.sig.wrtmuc) {
                return false;
        }

        /* Don't move instructions that wait on the TMU before the thread switch
         * happens since that would make the current thread stall before the
         * switch, which is exactly what we want to avoid with the thrsw
         * instruction.
         */
        if (v3d_qpu_waits_on_tmu(&qinst->qpu))
                return false;

        /* A thread switch invalidates all accumulators, so don't place any
         * instructions that write accumulators into the delay slots.
         */
        if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
                return false;

        /* Multop has an implicit write to the rtop register which is a
         * specialized accumulator that is only used with this instruction.
         */
        if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
                return false;

        /* Flags are invalidated across a thread switch, so don't place
         * instructions that write flags into delay slots.
         */
        if (v3d_qpu_writes_flags(&qinst->qpu))
                return false;

        return true;
}

static bool
valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
        if (scoreboard->last_thrsw_tick + 3 >
            scoreboard->tick - instructions_in_sequence) {
                return false;
        }

        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
                        return false;

                if (is_thrend &&
                    !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}
/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Don't try to emit a thrsw in the delay slots of a previous thrsw
         * or branch.
         */
        while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }
        while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* Find how far back into previous instructions we can put the THRSW. */
        int slots_filled = 0;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;

                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, scoreboard,
                                          prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                merge_inst = prev_inst;
                if (++slots_filled == 3)
                        break;
        }

        bool needs_free = false;
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
        } else {
                scoreboard->last_thrsw_tick = scoreboard->tick;
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                if (slots_filled <= 1) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
        }

        /* Make sure the thread end executes within the program lifespan */
        if (is_thrend) {
                for (int i = 0; i < 3 - slots_filled; i++) {
                        emit_nop(c, block, scoreboard);
                        time++;
                }
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}

static bool
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        if (inst->qpu.sig.thrsw)
                return false;

        if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
                return false;

        if (vir_has_uniform(inst))
                return false;

        return true;
}
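
/* Branches, like thread switches, have three delay slots that execute
 * whether or not the branch is taken. emit_branch() below therefore
 * schedules the branch first and then tries to migrate up to three of
 * the previously scheduled instructions down into those slots.
 */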
static void
emit_branch(struct v3d_compile *c,
            struct qblock *block,
            struct choose_scoreboard *scoreboard,
            struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

        /* We shouldn't have picked up a branch for the delay slots of a
         * previous thrsw, branch or unifa write instruction.
         */
        int branch_tick = scoreboard->tick;
        assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
        assert(scoreboard->last_branch_tick + 3 < branch_tick);
        assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);

        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
         * setmsf.
         */
        bool is_safe_msf_branch =
                inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
                inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
        assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
               is_safe_msf_branch);

        /* Insert the branch instruction */
        insert_scheduled_instruction(c, block, scoreboard, inst);

        /* Now see if we can move the branch instruction back into the
         * instruction stream to fill its delay slots
         */
        int slots_filled = 0;
        while (slots_filled < 3 && block->instructions.next != &inst->link) {
                struct qinst *prev_inst = (struct qinst *) inst->link.prev;
                assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);

                /* Can't move the branch instruction if that would place it
                 * in the delay slots of other instructions.
                 */
                if (scoreboard->last_branch_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_thrsw_tick + 2 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                if (scoreboard->last_unifa_write_tick + 3 >=
                    branch_tick - slots_filled - 1) {
                        break;
                }

                /* Can't move a conditional branch before the instruction
                 * that writes the flags for its condition.
                 */
                if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
                    inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
                        break;
                }

                if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
                        break;

                if (!is_safe_msf_branch) {
                        struct qinst *prev_prev_inst =
                                (struct qinst *) prev_inst->link.prev;
                        if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                            prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
                                break;
                        }
                }

                list_del(&prev_inst->link);
                list_add(&prev_inst->link, &inst->link);
                slots_filled++;
        }

        block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
        scoreboard->last_branch_tick = branch_tick - slots_filled;

        /* Fill any remaining delay slots.
         *
         * For unconditional branches we'll try to fill these with the
         * first instructions in the successor block after scheduling
         * all blocks when setting up branch targets.
         */
        for (int i = 0; i < 3 - slots_filled; i++)
                emit_nop(c, block, scoreboard);
}

static bool
alu_reads_register(struct v3d_qpu_instr *inst,
                   bool add, bool magic, uint32_t index)
{
        uint32_t num_src;
        enum v3d_qpu_mux mux_a, mux_b;

        if (add) {
                num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
                mux_a = inst->alu.add.a;
                mux_b = inst->alu.add.b;
        } else {
                num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
                mux_a = inst->alu.mul.a;
                mux_b = inst->alu.mul.b;
        }

        for (int i = 0; i < num_src; i++) {
                if (magic) {
                        if (i == 0 && mux_a == index)
                                return true;
                        if (i == 1 && mux_b == index)
                                return true;
                } else {
                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
                            inst->raddr_a == index) {
                                return true;
                        }
                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
                            inst->raddr_b == index) {
                                return true;
                        }
                }
        }

        return false;
}
/**
 * This takes an ldvary signal merged into 'inst' and tries to move it up to
 * the previous instruction to get good pipelining of ldvary sequences,
 * transforming this:
 *
 * nop                ; nop               ; ldvary.r4
 * nop                ; fmul r0, r4, rf0  ;
 * fadd rf13, r0, r5  ; nop               ; ldvary.r1  <-- inst
 *
 * into:
 *
 * nop                ; nop               ; ldvary.r4
 * nop                ; fmul r0, r4, rf0  ; ldvary.r1
 * fadd rf13, r0, r5  ; nop               ;            <-- inst
 *
 * If we manage to do this successfully (we return true here), then flagging
 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
 * we will be able to pick up to merge into 'inst', leading to code like this:
 *
 * nop                ; nop               ; ldvary.r4
 * nop                ; fmul r0, r4, rf0  ; ldvary.r1
 * fadd rf13, r0, r5  ; fmul r2, r1, rf0  ;            <-- inst
 */
static bool
fixup_pipelined_ldvary(struct v3d_compile *c,
                       struct choose_scoreboard *scoreboard,
                       struct qblock *block,
                       struct v3d_qpu_instr *inst)
{
        /* We only call this if we have successfully merged an ldvary into a
         * previous instruction.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->sig.ldvary);
        uint32_t ldvary_magic = inst->sig_magic;
        uint32_t ldvary_index = inst->sig_addr;

        /* The instruction in which we merged the ldvary cannot read
         * the ldvary destination, if it does, then moving the ldvary before
         * it would overwrite it.
         */
        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
                return false;
        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
                return false;

        /* The previous instruction can't write to the same destination as the
         * ldvary.
         */
        struct qinst *prev = (struct qinst *) block->instructions.prev;
        if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
                if (prev->qpu.alu.add.magic_write == ldvary_magic &&
                    prev->qpu.alu.add.waddr == ldvary_index) {
                        return false;
                }
        }

        if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
                if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
                    prev->qpu.alu.mul.waddr == ldvary_index) {
                        return false;
                }
        }

        /* The previous instruction cannot have a conflicting signal */
        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
                return false;

        /* The previous instruction cannot use flags since ldvary uses the
         * 'cond' instruction field to store the destination.
         */
        if (v3d_qpu_writes_flags(&prev->qpu))
                return false;
        if (v3d_qpu_reads_flags(&prev->qpu))
                return false;

        /* We can't put an ldvary in the delay slots of a thrsw. We should've
         * prevented this when pairing up the ldvary with another instruction
         * and flagging it for a fixup.
         */
        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);

        /* Move the ldvary to the previous instruction and remove it from the
         * current one.
         */
        prev->qpu.sig.ldvary = true;
        prev->qpu.sig_magic = ldvary_magic;
        prev->qpu.sig_addr = ldvary_index;
        scoreboard->last_ldvary_tick = scoreboard->tick - 1;

        inst->sig.ldvary = false;
        inst->sig_magic = false;
        inst->sig_addr = 0;

        /* By moving ldvary to the previous instruction we make it update
         * r5 in the current one, so nothing else in it should write r5.
         * This should've been prevented by our dependency tracking, which
         * would not allow ldvary to be paired up with an instruction that
         * writes r5 (since our dependency tracking doesn't know that the
         * ldvary's r5 write happens in the next instruction).
         */
        assert(!v3d_qpu_writes_r5(c->devinfo, inst));

        return true;
}
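
/**
 * Main scheduling loop for a block: repeatedly picks a DAG head, tries
 * to merge a second instruction into it, updates the scoreboard and the
 * rewritten uniform stream, and emits the result (including thrsw and
 * branch delay-slot handling). Returns the estimated cycle count.
 */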
static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(c, scoreboard, NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* We can't mark_instruction_scheduled() the chosen inst until
                 * we're done identifying instructions to merge, so put the
                 * merged instructions on a list for a moment.
                 */
                struct list_head merged_list;
                list_inithead(&merged_list);

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);

                        while ((merge =
                                choose_instruction_to_schedule(c, scoreboard,
                                                               chosen))) {
                                time = MAX2(merge->unblocked_time, time);
                                pre_remove_head(scoreboard->dag, merge);
                                list_addtail(&merge->link, &merged_list);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "         result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }

                                if (scoreboard->fixup_ldvary) {
                                        scoreboard->fixup_ldvary = false;
                                        if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
                                                /* Flag the ldvary as scheduled
                                                 * now so we can try to merge
                                                 * the follow-up instruction in
                                                 * the ldvary sequence into the
                                                 * current instruction.
                                                 */
                                                mark_instruction_scheduled(
                                                        devinfo, scoreboard->dag,
                                                        time, merge);
                                        }
                                }
                        }
                        if (mux_read_stalls(scoreboard, inst))
                                c->qpu_inst_stalled_count++;
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
                list_for_each_entry(struct schedule_node, merge, &merged_list,
                                    link) {
                        mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);

                        /* The merged VIR instruction doesn't get re-added to
                         * the block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        emit_branch(c, block, scoreboard, qinst);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);
                }
        }

        return time;
}
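/* Worked example (illustrative, not from the original source) of the uniform
 * rewrite in schedule_instructions(): uniforms are consumed by the QPU in
 * instruction order, so if instructions A, B, C originally referenced
 * uniform slots 0, 1, 2 but get emitted in the order C, A, B, the loop
 * copies orig_uniform_data[2], [0], [1] into c->uniform_data[0], [1], [2]
 * and patches each qinst->uniform to its new index, keeping the uniform
 * stream in emission order.
 */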
static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        scoreboard->dag = dag_create(mem_ctx);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_is_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);
        scoreboard->dag = NULL;

        return cycles;
}
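/* Flow summary (illustrative, spelled out here rather than in the original
 * source): qpu_schedule_instructions_block() empties block->instructions
 * into schedule_nodes on setup_list, builds dependency edges in both walk
 * directions, computes per-node delay estimates bottom-up over the DAG, and
 * then schedule_instructions() re-emits the (possibly paired) instructions
 * into the block. The ralloc context scopes every schedule_node allocation
 * to this one block, so a single ralloc_free() releases them all.
 */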
static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct qinst *branch = NULL;
                struct list_head *entry = block->instructions.prev;
                int32_t delay_slot_count = -1;
                struct qinst *delay_slots_start = NULL;
                for (int i = 0; i < 3; i++) {
                        entry = entry->prev;
                        struct qinst *inst =
                                container_of(entry, struct qinst, link);

                        if (delay_slot_count == -1) {
                                if (!v3d_qpu_is_nop(&inst->qpu))
                                        delay_slot_count = i;
                                else
                                        delay_slots_start = inst;
                        }

                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                branch = inst;
                                break;
                        }
                }
                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
                assert(delay_slot_count == 0 || delay_slots_start != NULL);

                /* Make sure that the if-we-don't-jump successor was
                 * scheduled just after the delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the uniform stream.
                 *
                 * Use a temporary here, because uniform_data[inst->uniform]
                 * may be shared between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;

                /* If this is an unconditional branch, try to fill any
                 * remaining delay slots with the initial instructions of the
                 * successor block.
                 *
                 * FIXME: we can do the same for conditional branches if we
                 * predicate the instructions to match the branch condition.
                 */
                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
                        struct list_head *successor_insts =
                                &block->successors[0]->instructions;
                        delay_slot_count = MIN2(delay_slot_count,
                                                list_length(successor_insts));
                        struct qinst *s_inst =
                                (struct qinst *) successor_insts->next;
                        struct qinst *slot = delay_slots_start;
                        int slots_filled = 0;
                        while (slots_filled < delay_slot_count &&
                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
                                memcpy(&slot->qpu, &s_inst->qpu,
                                       sizeof(slot->qpu));
                                s_inst = (struct qinst *) s_inst->link.next;
                                slot = (struct qinst *) slot->link.next;
                                slots_filled++;
                        }
                        branch->qpu.branch.offset +=
                                slots_filled * sizeof(uint64_t);
                }
        }
}
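/* Worked example (illustrative, not from the original source) of the branch
 * offset math in qpu_set_branch_targets(): with a branch at qpu_ip 10 and a
 * successor block starting at qpu_ip 20, the offset is
 *
 *   (20 - (10 + 4)) * sizeof(uint64_t) = 48 bytes
 *
 * i.e. relative to the instruction right after the branch's three delay
 * slots. Each delay slot later filled from the successor block advances the
 * offset by another 8 bytes so the copied instructions aren't executed
 * twice.
 */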
uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        /* Start all the "last event" ticks far enough in the past that the
         * hazard checks against them can't fire for the first instructions
         * of the program.
         */
        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_unifa_write_tick = -10;
        scoreboard.last_magic_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;
        scoreboard.last_thrsw_tick = -10;
        scoreboard.last_branch_tick = -10;
        scoreboard.last_setmsf_tick = -10;
        scoreboard.last_stallable_sfu_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}