Path: blob/21.2-virgl/src/freedreno/ir3/ir3_lower_subgroups.c
4565 views
/*1* Copyright (C) 2021 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,19* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE20* SOFTWARE.21*/2223#include "ir3.h"2425/* Lower several macro-instructions needed for shader subgroup support that26* must be turned into if statements. We do this after RA and post-RA27* scheduling to give the scheduler a chance to rearrange them, because RA28* may need to insert OPC_META_READ_FIRST to handle splitting live ranges, and29* also because some (e.g. BALLOT and READ_FIRST) must produce a shared30* register that cannot be spilled to a normal register until after the if,31* which makes implementing spilling more complicated if they are already32* lowered.33*/3435static void36replace_pred(struct ir3_block *block, struct ir3_block *old_pred,37struct ir3_block *new_pred)38{39for (unsigned i = 0; i < block->predecessors_count; i++) {40if (block->predecessors[i] == old_pred) {41block->predecessors[i] = new_pred;42return;43}44}45}4647static void48replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,49struct ir3_block *new_pred)50{51for (unsigned i = 0; i < block->physical_predecessors_count; i++) {52if (block->physical_predecessors[i] == old_pred) {53block->physical_predecessors[i] = new_pred;54return;55}56}57}5859static void60mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)61{62struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);63struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);64mov_dst->wrmask = dst->wrmask;65struct ir3_register *src = ir3_src_create(66mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);67src->uim_val = immed;68mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;69mov->cat1.src_type = mov->cat1.dst_type;70mov->repeat = util_last_bit(mov_dst->wrmask) - 1;71}7273static struct ir3_block *74split_block(struct ir3 *ir, struct ir3_block *before_block,75struct ir3_instruction *instr, struct ir3_block **then)76{77struct ir3_block *then_block = ir3_block_create(ir);78struct ir3_block *after_block = ir3_block_create(ir);79list_add(&then_block->node, &before_block->node);80list_add(&after_block->node, &then_block->node);8182for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {83after_block->successors[i] = before_block->successors[i];84if (after_block->successors[i])85replace_pred(after_block->successors[i], before_block, after_block);86}8788for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);89i++) {90after_block->physical_successors[i] =91before_block->physical_successors[i];92if (after_block->physical_successors[i]) {93replace_physical_pred(after_block->physical_successors[i],94before_block, after_block);95}96}9798before_block->successors[0] = then_block;99before_block->successors[1] = after_block;100before_block->physical_successors[0] = then_block;101before_block->physical_successors[1] = after_block;102ir3_block_add_predecessor(then_block, before_block);103ir3_block_add_predecessor(after_block, before_block);104ir3_block_add_physical_predecessor(then_block, before_block);105ir3_block_add_physical_predecessor(after_block, before_block);106107then_block->successors[0] = after_block;108then_block->physical_successors[0] = after_block;109ir3_block_add_predecessor(after_block, then_block);110ir3_block_add_physical_predecessor(after_block, then_block);111112foreach_instr_from_safe (rem_instr, &instr->node,113&before_block->instr_list) {114list_del(&rem_instr->node);115list_addtail(&rem_instr->node, &after_block->instr_list);116rem_instr->block = after_block;117}118119after_block->brtype = before_block->brtype;120after_block->condition = before_block->condition;121122*then = then_block;123return after_block;124}125126static bool127lower_block(struct ir3 *ir, struct ir3_block **block)128{129bool progress = false;130131foreach_instr_safe (instr, &(*block)->instr_list) {132switch (instr->opc) {133case OPC_BALLOT_MACRO:134case OPC_ANY_MACRO:135case OPC_ALL_MACRO:136case OPC_ELECT_MACRO:137case OPC_READ_COND_MACRO:138case OPC_READ_FIRST_MACRO:139case OPC_SWZ_SHARED_MACRO:140break;141default:142continue;143}144145struct ir3_block *before_block = *block;146struct ir3_block *then_block;147struct ir3_block *after_block =148split_block(ir, before_block, instr, &then_block);149150/* For ballot, the destination must be initialized to 0 before we do151* the movmsk because the condition may be 0 and then the movmsk will152* be skipped. Because it's a shared register we have to wrap the153* initialization in a getone block.154*/155if (instr->opc == OPC_BALLOT_MACRO) {156before_block->brtype = IR3_BRANCH_GETONE;157before_block->condition = NULL;158mov_immed(instr->dsts[0], then_block, 0);159before_block = after_block;160after_block = split_block(ir, before_block, instr, &then_block);161}162163switch (instr->opc) {164case OPC_BALLOT_MACRO:165case OPC_READ_COND_MACRO:166case OPC_ANY_MACRO:167case OPC_ALL_MACRO:168before_block->condition = instr->srcs[0]->def->instr;169break;170default:171before_block->condition = NULL;172break;173}174175switch (instr->opc) {176case OPC_BALLOT_MACRO:177case OPC_READ_COND_MACRO:178before_block->brtype = IR3_BRANCH_COND;179break;180case OPC_ANY_MACRO:181before_block->brtype = IR3_BRANCH_ANY;182break;183case OPC_ALL_MACRO:184before_block->brtype = IR3_BRANCH_ALL;185break;186case OPC_ELECT_MACRO:187case OPC_READ_FIRST_MACRO:188case OPC_SWZ_SHARED_MACRO:189before_block->brtype = IR3_BRANCH_GETONE;190break;191default:192unreachable("bad opcode");193}194195switch (instr->opc) {196case OPC_ALL_MACRO:197case OPC_ANY_MACRO:198case OPC_ELECT_MACRO:199mov_immed(instr->dsts[0], then_block, 1);200mov_immed(instr->dsts[0], before_block, 0);201break;202203case OPC_BALLOT_MACRO: {204unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);205struct ir3_instruction *movmsk =206ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);207ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);208movmsk->repeat = comp_count - 1;209break;210}211212case OPC_READ_COND_MACRO:213case OPC_READ_FIRST_MACRO: {214struct ir3_instruction *mov =215ir3_instr_create(then_block, OPC_MOV, 1, 1);216unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;217ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);218struct ir3_register *new_src = ir3_src_create(mov, 0, 0);219*new_src = *instr->srcs[src];220mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;221break;222}223224case OPC_SWZ_SHARED_MACRO: {225struct ir3_instruction *swz =226ir3_instr_create(then_block, OPC_SWZ, 2, 2);227ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);228ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);229ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);230ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);231swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;232swz->repeat = 1;233break;234}235236default:237unreachable("bad opcode");238}239240*block = after_block;241list_delinit(&instr->node);242progress = true;243}244245return progress;246}247248bool249ir3_lower_subgroups(struct ir3 *ir)250{251bool progress = false;252253foreach_block (block, &ir->block_list)254progress |= lower_block(ir, &block);255256return progress;257}258259260