Path: blob/21.2-virgl/src/amd/compiler/aco_ssa_elimination.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_ir.h"2526#include <algorithm>27#include <map>28#include <vector>2930namespace aco {31namespace {3233struct phi_info_item {34Definition def;35Operand op;36};3738struct ssa_elimination_ctx {39/* The outer vectors should be indexed by block index. The inner vectors store phi information40* for each block. */41std::vector<std::vector<phi_info_item>> logical_phi_info;42std::vector<std::vector<phi_info_item>> linear_phi_info;43std::vector<bool> empty_blocks;44std::vector<bool> blocks_incoming_exec_used;45Program* program;4647ssa_elimination_ctx(Program* program_)48: logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),49empty_blocks(program_->blocks.size(), true),50blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)51{}52};5354void55collect_phi_info(ssa_elimination_ctx& ctx)56{57for (Block& block : ctx.program->blocks) {58for (aco_ptr<Instruction>& phi : block.instructions) {59if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)60break;6162for (unsigned i = 0; i < phi->operands.size(); i++) {63if (phi->operands[i].isUndefined())64continue;65if (phi->operands[i].physReg() == phi->definitions[0].physReg())66continue;6768assert(phi->definitions[0].size() == phi->operands[i].size());6970std::vector<unsigned>& preds =71phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;72uint32_t pred_idx = preds[i];73auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]74: ctx.linear_phi_info[pred_idx];75info_vec.push_back({phi->definitions[0], phi->operands[i]});76ctx.empty_blocks[pred_idx] = false;77}78}79}80}8182void83insert_parallelcopies(ssa_elimination_ctx& ctx)84{85/* insert the parallelcopies from logical phis before p_logical_end */86for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {87auto& logical_phi_info = ctx.logical_phi_info[block_idx];88if (logical_phi_info.empty())89continue;9091Block& block = ctx.program->blocks[block_idx];92unsigned idx = block.instructions.size() - 1;93while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {94assert(idx > 0);95idx--;96}9798std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);99aco_ptr<Pseudo_instruction> pc{100create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,101logical_phi_info.size(), logical_phi_info.size())};102unsigned i = 0;103for (auto& phi_info : logical_phi_info) {104pc->definitions[i] = phi_info.def;105pc->operands[i] = phi_info.op;106i++;107}108/* this shouldn't be needed since we're only copying vgprs */109pc->tmp_in_scc = false;110block.instructions.insert(it, std::move(pc));111}112113/* insert parallelcopies for the linear phis at the end of blocks just before the branch */114for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {115auto& linear_phi_info = ctx.linear_phi_info[block_idx];116if (linear_phi_info.empty())117continue;118119Block& block = ctx.program->blocks[block_idx];120std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();121--it;122assert((*it)->isBranch());123aco_ptr<Pseudo_instruction> pc{124create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,125linear_phi_info.size(), linear_phi_info.size())};126unsigned i = 0;127for (auto& phi_info : linear_phi_info) {128pc->definitions[i] = phi_info.def;129pc->operands[i] = phi_info.op;130i++;131}132pc->tmp_in_scc = block.scc_live_out;133pc->scratch_sgpr = block.scratch_sgpr;134block.instructions.insert(it, std::move(pc));135}136}137138bool139is_empty_block(Block* block, bool ignore_exec_writes)140{141/* check if this block is empty and the exec mask is not needed */142for (aco_ptr<Instruction>& instr : block->instructions) {143switch (instr->opcode) {144case aco_opcode::p_linear_phi:145case aco_opcode::p_phi:146case aco_opcode::p_logical_start:147case aco_opcode::p_logical_end:148case aco_opcode::p_branch: break;149case aco_opcode::p_parallelcopy:150for (unsigned i = 0; i < instr->definitions.size(); i++) {151if (ignore_exec_writes && instr->definitions[i].physReg() == exec)152continue;153if (instr->definitions[i].physReg() != instr->operands[i].physReg())154return false;155}156break;157case aco_opcode::s_andn2_b64:158case aco_opcode::s_andn2_b32:159if (ignore_exec_writes && instr->definitions[0].physReg() == exec)160break;161return false;162default: return false;163}164}165return true;166}167168void169try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)170{171/* check if the successor is another merge block which restores exec */172// TODO: divergent loops also restore exec173if (block->linear_succs.size() != 1 ||174!(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))175return;176177/* check if this block is empty */178if (!is_empty_block(block, true))179return;180181/* keep the branch instruction and remove the rest */182aco_ptr<Instruction> branch = std::move(block->instructions.back());183block->instructions.clear();184block->instructions.emplace_back(std::move(branch));185}186187void188try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)189{190assert(block->linear_succs.size() == 2);191/* only remove this block if the successor got removed as well */192if (block->linear_succs[0] != block->linear_succs[1])193return;194195/* check if block is otherwise empty */196if (!is_empty_block(block, true))197return;198199unsigned succ_idx = block->linear_succs[0];200assert(block->linear_preds.size() == 2);201for (unsigned i = 0; i < 2; i++) {202Block* pred = &ctx.program->blocks[block->linear_preds[i]];203pred->linear_succs[0] = succ_idx;204ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;205206Pseudo_branch_instruction& branch = pred->instructions.back()->branch();207assert(branch.isBranch());208branch.target[0] = succ_idx;209branch.target[1] = succ_idx;210}211212block->instructions.clear();213block->linear_preds.clear();214block->linear_succs.clear();215}216217void218try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)219{220if (!is_empty_block(block, false))221return;222223Block& pred = ctx.program->blocks[block->linear_preds[0]];224Block& succ = ctx.program->blocks[block->linear_succs[0]];225Pseudo_branch_instruction& branch = pred.instructions.back()->branch();226if (branch.opcode == aco_opcode::p_branch) {227branch.target[0] = succ.index;228branch.target[1] = succ.index;229} else if (branch.target[0] == block->index) {230branch.target[0] = succ.index;231} else if (branch.target[0] == succ.index) {232assert(branch.target[1] == block->index);233branch.target[1] = succ.index;234branch.opcode = aco_opcode::p_branch;235} else if (branch.target[1] == block->index) {236/* check if there is a fall-through path from block to succ */237bool falls_through = block->index < succ.index;238for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {239assert(ctx.program->blocks[j].index == j);240if (!ctx.program->blocks[j].instructions.empty())241falls_through = false;242}243if (falls_through) {244branch.target[1] = succ.index;245} else {246/* check if there is a fall-through path for the alternative target */247if (block->index >= branch.target[0])248return;249for (unsigned j = block->index + 1; j < branch.target[0]; j++) {250if (!ctx.program->blocks[j].instructions.empty())251return;252}253254/* This is a (uniform) break or continue block. The branch condition has to be inverted. */255if (branch.opcode == aco_opcode::p_cbranch_z)256branch.opcode = aco_opcode::p_cbranch_nz;257else if (branch.opcode == aco_opcode::p_cbranch_nz)258branch.opcode = aco_opcode::p_cbranch_z;259else260assert(false);261/* also invert the linear successors */262pred.linear_succs[0] = pred.linear_succs[1];263pred.linear_succs[1] = succ.index;264branch.target[1] = branch.target[0];265branch.target[0] = succ.index;266}267} else {268assert(false);269}270271if (branch.target[0] == branch.target[1])272branch.opcode = aco_opcode::p_branch;273274for (unsigned i = 0; i < pred.linear_succs.size(); i++)275if (pred.linear_succs[i] == block->index)276pred.linear_succs[i] = succ.index;277278for (unsigned i = 0; i < succ.linear_preds.size(); i++)279if (succ.linear_preds[i] == block->index)280succ.linear_preds[i] = pred.index;281282block->instructions.clear();283block->linear_preds.clear();284block->linear_succs.clear();285}286287bool288instr_writes_exec(Instruction* instr)289{290for (Definition& def : instr->definitions)291if (def.physReg() == exec || def.physReg() == exec_hi)292return true;293294return false;295}296297void298eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)299{300/* Check if any successor needs the outgoing exec mask from the current block. */301302bool exec_write_used;303304if (!ctx.logical_phi_info[block.index].empty()) {305exec_write_used = true;306} else {307bool copy_to_exec = false;308bool copy_from_exec = false;309310for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) {311copy_to_exec |= successor_phi_info.def.physReg() == exec;312copy_from_exec |= successor_phi_info.op.physReg() == exec;313}314315if (copy_from_exec)316exec_write_used = true;317else if (copy_to_exec)318exec_write_used = false;319else320/* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */321exec_write_used =322std::any_of(block.linear_succs.begin(), block.linear_succs.end(),323[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });324}325326/* Go through all instructions and eliminate useless exec writes. */327328for (int i = block.instructions.size() - 1; i >= 0; --i) {329aco_ptr<Instruction>& instr = block.instructions[i];330331/* We already take information from phis into account before the loop, so let's just break on332* phis. */333if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)334break;335336/* See if the current instruction needs or writes exec. */337bool needs_exec = needs_exec_mask(instr.get());338bool writes_exec = instr_writes_exec(instr.get());339340/* See if we found an unused exec write. */341if (writes_exec && !exec_write_used) {342instr.reset();343continue;344}345346/* For a newly encountered exec write, clear the used flag. */347if (writes_exec)348exec_write_used = false;349350/* If the current instruction needs exec, mark it as used. */351exec_write_used |= needs_exec;352}353354/* Remember if the current block needs an incoming exec mask from its predecessors. */355ctx.blocks_incoming_exec_used[block.index] = exec_write_used;356357/* Cleanup: remove deleted instructions from the vector. */358auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);359block.instructions.resize(new_end - block.instructions.begin());360}361362void363jump_threading(ssa_elimination_ctx& ctx)364{365for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {366Block* block = &ctx.program->blocks[i];367eliminate_useless_exec_writes_in_block(ctx, *block);368369if (!ctx.empty_blocks[i])370continue;371372if (block->kind & block_kind_invert) {373try_remove_invert_block(ctx, block);374continue;375}376377if (block->linear_succs.size() > 1)378continue;379380if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)381try_remove_merge_block(ctx, block);382383if (block->linear_preds.size() == 1)384try_remove_simple_block(ctx, block);385}386}387388} /* end namespace */389390void391ssa_elimination(Program* program)392{393ssa_elimination_ctx ctx(program);394395/* Collect information about every phi-instruction */396collect_phi_info(ctx);397398/* eliminate empty blocks */399jump_threading(ctx);400401/* insert parallelcopies from SSA elimination */402insert_parallelcopies(ctx);403}404} // namespace aco405406407