Path: blob/21.2-virgl/src/amd/compiler/aco_optimizer.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_ir.h"2526#include "util/half_float.h"27#include "util/memstream.h"2829#include <algorithm>30#include <array>31#include <vector>3233namespace aco {3435#ifndef NDEBUG36void37perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)38{39if (cond) {40char* out;41size_t outsize;42struct u_memstream mem;43u_memstream_open(&mem, &out, &outsize);44FILE* const memf = u_memstream_get(&mem);4546fprintf(memf, "%s: ", msg);47aco_print_instr(instr, memf);48u_memstream_close(&mem);4950aco_perfwarn(program, out);51free(out);5253if (debug_flags & DEBUG_PERFWARN)54exit(1);55}56}57#endif5859/**60* The optimizer works in 4 phases:61* (1) The first pass collects information for each ssa-def,62* propagates reg->reg operands of the same type, inline constants63* and neg/abs input modifiers.64* (2) The second pass combines 
instructions like mad, omod, clamp and65* propagates sgpr's on VALU instructions.66* This pass depends on information collected in the first pass.67* (3) The third pass goes backwards, and selects instructions,68* i.e. decides if a mad instruction is profitable and eliminates dead code.69* (4) The fourth pass cleans up the sequence: literals get applied and dead70* instructions are removed from the sequence.71*/7273struct mad_info {74aco_ptr<Instruction> add_instr;75uint32_t mul_temp_id;76uint16_t literal_idx;77bool check_literal;7879mad_info(aco_ptr<Instruction> instr, uint32_t id)80: add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false)81{}82};8384enum Label {85label_vec = 1 << 0,86label_constant_32bit = 1 << 1,87/* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and88* 32-bit operations but this shouldn't cause any issues because we don't89* look through any conversions */90label_abs = 1 << 2,91label_neg = 1 << 3,92label_mul = 1 << 4,93label_temp = 1 << 5,94label_literal = 1 << 6,95label_mad = 1 << 7,96label_omod2 = 1 << 8,97label_omod4 = 1 << 9,98label_omod5 = 1 << 10,99label_clamp = 1 << 12,100label_undefined = 1 << 14,101label_vcc = 1 << 15,102label_b2f = 1 << 16,103label_add_sub = 1 << 17,104label_bitwise = 1 << 18,105label_minmax = 1 << 19,106label_vopc = 1 << 20,107label_uniform_bool = 1 << 21,108label_constant_64bit = 1 << 22,109label_uniform_bitwise = 1 << 23,110label_scc_invert = 1 << 24,111label_vcc_hint = 1 << 25,112label_scc_needed = 1 << 26,113label_b2i = 1 << 27,114label_fcanonicalize = 1 << 28,115label_constant_16bit = 1 << 29,116label_usedef = 1 << 30, /* generic label */117label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */118label_canonicalized = 1ull << 32,119label_extract = 1ull << 33,120label_insert = 1ull << 34,121};122123static constexpr uint64_t instr_usedef_labels =124label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise |125label_uniform_bitwise | 
label_minmax | label_vopc | label_usedef | label_extract;126static constexpr uint64_t instr_mod_labels =127label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert;128129static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;130static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |131label_uniform_bool | label_scc_invert | label_b2i |132label_fcanonicalize;133static constexpr uint32_t val_labels =134label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;135136static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");137static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");138static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");139140struct ssa_info {141uint64_t label;142union {143uint32_t val;144Temp temp;145Instruction* instr;146};147148ssa_info() : label(0) {}149150void add_label(Label new_label)151{152/* Since all the instr_usedef_labels use instr for the same thing153* (indicating the defining instruction), there is usually no need to154* clear any other instr labels. 
*/155if (new_label & instr_usedef_labels)156label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */157158if (new_label & instr_mod_labels) {159label &= ~instr_labels;160label &= ~(temp_labels | val_labels); /* instr, temp and val alias */161}162163if (new_label & temp_labels) {164label &= ~temp_labels;165label &= ~(instr_labels | val_labels); /* instr, temp and val alias */166}167168uint32_t const_labels =169label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;170if (new_label & const_labels) {171label &= ~val_labels | const_labels;172label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */173} else if (new_label & val_labels) {174label &= ~val_labels;175label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */176}177178label |= new_label;179}180181void set_vec(Instruction* vec)182{183add_label(label_vec);184instr = vec;185}186187bool is_vec() { return label & label_vec; }188189void set_constant(chip_class chip, uint64_t constant)190{191Operand op16 = Operand::c16(constant);192Operand op32 = Operand::get_const(chip, constant, 4);193add_label(label_literal);194val = constant;195196/* check that no upper bits are lost in case of packed 16bit constants */197if (chip >= GFX8 && !op16.isLiteral() && op16.constantValue64() == constant)198add_label(label_constant_16bit);199200if (!op32.isLiteral())201add_label(label_constant_32bit);202203if (Operand::is_constant_representable(constant, 8))204add_label(label_constant_64bit);205206if (label & label_constant_64bit) {207val = Operand::c64(constant).constantValue();208if (val != constant)209label &= ~(label_literal | label_constant_16bit | label_constant_32bit);210}211}212213bool is_constant(unsigned bits)214{215switch (bits) {216case 8: return label & label_literal;217case 16: return label & label_constant_16bit;218case 32: return label & label_constant_32bit;219case 64: return label & label_constant_64bit;220}221return 
false;222}223224bool is_literal(unsigned bits)225{226bool is_lit = label & label_literal;227switch (bits) {228case 8: return false;229case 16: return is_lit && ~(label & label_constant_16bit);230case 32: return is_lit && ~(label & label_constant_32bit);231case 64: return false;232}233return false;234}235236bool is_constant_or_literal(unsigned bits)237{238if (bits == 64)239return label & label_constant_64bit;240else241return label & label_literal;242}243244void set_abs(Temp abs_temp)245{246add_label(label_abs);247temp = abs_temp;248}249250bool is_abs() { return label & label_abs; }251252void set_neg(Temp neg_temp)253{254add_label(label_neg);255temp = neg_temp;256}257258bool is_neg() { return label & label_neg; }259260void set_neg_abs(Temp neg_abs_temp)261{262add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));263temp = neg_abs_temp;264}265266void set_mul(Instruction* mul)267{268add_label(label_mul);269instr = mul;270}271272bool is_mul() { return label & label_mul; }273274void set_temp(Temp tmp)275{276add_label(label_temp);277temp = tmp;278}279280bool is_temp() { return label & label_temp; }281282void set_mad(Instruction* mad, uint32_t mad_info_idx)283{284add_label(label_mad);285mad->pass_flags = mad_info_idx;286instr = mad;287}288289bool is_mad() { return label & label_mad; }290291void set_omod2(Instruction* mul)292{293add_label(label_omod2);294instr = mul;295}296297bool is_omod2() { return label & label_omod2; }298299void set_omod4(Instruction* mul)300{301add_label(label_omod4);302instr = mul;303}304305bool is_omod4() { return label & label_omod4; }306307void set_omod5(Instruction* mul)308{309add_label(label_omod5);310instr = mul;311}312313bool is_omod5() { return label & label_omod5; }314315void set_clamp(Instruction* med3)316{317add_label(label_clamp);318instr = med3;319}320321bool is_clamp() { return label & label_clamp; }322323void set_undefined() { add_label(label_undefined); }324325bool is_undefined() { return label & label_undefined; }326327void 
set_vcc(Temp vcc_val)328{329add_label(label_vcc);330temp = vcc_val;331}332333bool is_vcc() { return label & label_vcc; }334335void set_b2f(Temp b2f_val)336{337add_label(label_b2f);338temp = b2f_val;339}340341bool is_b2f() { return label & label_b2f; }342343void set_add_sub(Instruction* add_sub_instr)344{345add_label(label_add_sub);346instr = add_sub_instr;347}348349bool is_add_sub() { return label & label_add_sub; }350351void set_bitwise(Instruction* bitwise_instr)352{353add_label(label_bitwise);354instr = bitwise_instr;355}356357bool is_bitwise() { return label & label_bitwise; }358359void set_uniform_bitwise() { add_label(label_uniform_bitwise); }360361bool is_uniform_bitwise() { return label & label_uniform_bitwise; }362363void set_minmax(Instruction* minmax_instr)364{365add_label(label_minmax);366instr = minmax_instr;367}368369bool is_minmax() { return label & label_minmax; }370371void set_vopc(Instruction* vopc_instr)372{373add_label(label_vopc);374instr = vopc_instr;375}376377bool is_vopc() { return label & label_vopc; }378379void set_scc_needed() { add_label(label_scc_needed); }380381bool is_scc_needed() { return label & label_scc_needed; }382383void set_scc_invert(Temp scc_inv)384{385add_label(label_scc_invert);386temp = scc_inv;387}388389bool is_scc_invert() { return label & label_scc_invert; }390391void set_uniform_bool(Temp uniform_bool)392{393add_label(label_uniform_bool);394temp = uniform_bool;395}396397bool is_uniform_bool() { return label & label_uniform_bool; }398399void set_vcc_hint() { add_label(label_vcc_hint); }400401bool is_vcc_hint() { return label & label_vcc_hint; }402403void set_b2i(Temp b2i_val)404{405add_label(label_b2i);406temp = b2i_val;407}408409bool is_b2i() { return label & label_b2i; }410411void set_usedef(Instruction* label_instr)412{413add_label(label_usedef);414instr = label_instr;415}416417bool is_usedef() { return label & label_usedef; }418419void set_vop3p(Instruction* vop3p_instr)420{421add_label(label_vop3p);422instr = 
vop3p_instr;423}424425bool is_vop3p() { return label & label_vop3p; }426427void set_fcanonicalize(Temp tmp)428{429add_label(label_fcanonicalize);430temp = tmp;431}432433bool is_fcanonicalize() { return label & label_fcanonicalize; }434435void set_canonicalized() { add_label(label_canonicalized); }436437bool is_canonicalized() { return label & label_canonicalized; }438439void set_extract(Instruction* extract)440{441add_label(label_extract);442instr = extract;443}444445bool is_extract() { return label & label_extract; }446447void set_insert(Instruction* insert)448{449add_label(label_insert);450instr = insert;451}452453bool is_insert() { return label & label_insert; }454};455456struct opt_ctx {457Program* program;458float_mode fp_mode;459std::vector<aco_ptr<Instruction>> instructions;460ssa_info* info;461std::pair<uint32_t, Temp> last_literal;462std::vector<mad_info> mad_infos;463std::vector<uint16_t> uses;464};465466struct CmpInfo {467aco_opcode ordered;468aco_opcode unordered;469aco_opcode ordered_swapped;470aco_opcode unordered_swapped;471aco_opcode inverse;472aco_opcode f32;473unsigned size;474};475476ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo* info);477478bool479can_swap_operands(aco_ptr<Instruction>& instr)480{481if (instr->operands[0].isConstant() ||482(instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))483return false;484485switch (instr->opcode) {486case aco_opcode::v_add_u32:487case aco_opcode::v_add_co_u32:488case aco_opcode::v_add_co_u32_e64:489case aco_opcode::v_add_i32:490case aco_opcode::v_add_f16:491case aco_opcode::v_add_f32:492case aco_opcode::v_mul_f16:493case aco_opcode::v_mul_f32:494case aco_opcode::v_or_b32:495case aco_opcode::v_and_b32:496case aco_opcode::v_xor_b32:497case aco_opcode::v_max_f16:498case aco_opcode::v_max_f32:499case aco_opcode::v_min_f16:500case aco_opcode::v_min_f32:501case aco_opcode::v_max_i32:502case aco_opcode::v_min_i32:503case aco_opcode::v_max_u32:504case 
aco_opcode::v_min_u32:505case aco_opcode::v_max_i16:506case aco_opcode::v_min_i16:507case aco_opcode::v_max_u16:508case aco_opcode::v_min_u16:509case aco_opcode::v_max_i16_e64:510case aco_opcode::v_min_i16_e64:511case aco_opcode::v_max_u16_e64:512case aco_opcode::v_min_u16_e64: return true;513case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; return true;514case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true;515case aco_opcode::v_sub_co_u32: instr->opcode = aco_opcode::v_subrev_co_u32; return true;516case aco_opcode::v_sub_u16: instr->opcode = aco_opcode::v_subrev_u16; return true;517case aco_opcode::v_sub_u32: instr->opcode = aco_opcode::v_subrev_u32; return true;518default: {519CmpInfo info;520get_cmp_info(instr->opcode, &info);521if (info.ordered == instr->opcode) {522instr->opcode = info.ordered_swapped;523return true;524}525if (info.unordered == instr->opcode) {526instr->opcode = info.unordered_swapped;527return true;528}529return false;530}531}532}533534bool535can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)536{537if (instr->isVOP3())538return true;539540if (instr->isVOP3P())541return false;542543if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)544return false;545546if (instr->isDPP() || instr->isSDWA())547return false;548549return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&550instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&551instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&552instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&553instr->opcode != aco_opcode::v_readlane_b32 &&554instr->opcode != aco_opcode::v_writelane_b32 &&555instr->opcode != aco_opcode::v_readfirstlane_b32;556}557558bool559pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)560{561if 
(instr->definitions.empty())562return false;563564const bool vgpr =565instr->opcode == aco_opcode::p_as_uniform ||566std::all_of(instr->definitions.begin(), instr->definitions.end(),567[](const Definition& def) { return def.regClass().type() == RegType::vgpr; });568569/* don't propagate VGPRs into SGPR instructions */570if (temp.type() == RegType::vgpr && !vgpr)571return false;572573bool can_accept_sgpr =574ctx.program->chip_class >= GFX9 ||575std::none_of(instr->definitions.begin(), instr->definitions.end(),576[](const Definition& def) { return def.regClass().is_subdword(); });577578switch (instr->opcode) {579case aco_opcode::p_phi:580case aco_opcode::p_linear_phi:581case aco_opcode::p_parallelcopy:582case aco_opcode::p_create_vector:583if (temp.bytes() != instr->operands[index].bytes())584return false;585break;586case aco_opcode::p_extract_vector:587if (temp.type() == RegType::sgpr && !can_accept_sgpr)588return false;589break;590case aco_opcode::p_split_vector: {591if (temp.type() == RegType::sgpr && !can_accept_sgpr)592return false;593/* don't increase the vector size */594if (temp.bytes() > instr->operands[index].bytes())595return false;596/* We can decrease the vector size as smaller temporaries are only597* propagated by p_as_uniform instructions.598* If this propagation leads to invalid IR or hits the assertion below,599* it means that some undefined bytes within a dword are begin accessed600* and a bug in instruction_selection is likely. 
*/601int decrease = instr->operands[index].bytes() - temp.bytes();602while (decrease > 0) {603decrease -= instr->definitions.back().bytes();604instr->definitions.pop_back();605}606assert(decrease == 0);607break;608}609case aco_opcode::p_as_uniform:610if (temp.regClass() == instr->definitions[0].regClass())611instr->opcode = aco_opcode::p_parallelcopy;612break;613default: return false;614}615616instr->operands[index].setTemp(temp);617return true;618}619620bool621can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)622{623if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP())624return false;625return instr->opcode != aco_opcode::v_readfirstlane_b32 &&626instr->opcode != aco_opcode::v_readlane_b32 &&627instr->opcode != aco_opcode::v_readlane_b32_e64 &&628instr->opcode != aco_opcode::v_writelane_b32 &&629instr->opcode != aco_opcode::v_writelane_b32_e64 &&630instr->opcode != aco_opcode::v_permlane16_b32 &&631instr->opcode != aco_opcode::v_permlanex16_b32;632}633634void635to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)636{637if (instr->isVOP3())638return;639640aco_ptr<Instruction> tmp = std::move(instr);641Format format = asVOP3(tmp->format);642instr.reset(create_instruction<VOP3_instruction>(tmp->opcode, format, tmp->operands.size(),643tmp->definitions.size()));644std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());645for (unsigned i = 0; i < instr->definitions.size(); i++) {646instr->definitions[i] = tmp->definitions[i];647if (instr->definitions[i].isTemp()) {648ssa_info& info = ctx.info[instr->definitions[i].tempId()];649if (info.label & instr_usedef_labels && info.instr == tmp.get())650info.instr = instr.get();651}652}653/* we don't need to update any instr_mod_labels because they either haven't654* been applied yet or this instruction isn't dead and so they've been ignored */655}656657bool658is_operand_vgpr(Operand op)659{660return op.isTemp() && op.getTemp().type() == 
RegType::vgpr;661}662663void664to_SDWA(opt_ctx& ctx, aco_ptr<Instruction>& instr)665{666aco_ptr<Instruction> tmp = convert_to_SDWA(ctx.program->chip_class, instr);667if (!tmp)668return;669670for (unsigned i = 0; i < instr->definitions.size(); i++) {671ssa_info& info = ctx.info[instr->definitions[i].tempId()];672if (info.label & instr_labels && info.instr == tmp.get())673info.instr = instr.get();674}675}676677/* only covers special cases */678bool679alu_can_accept_constant(aco_opcode opcode, unsigned operand)680{681switch (opcode) {682case aco_opcode::v_interp_p2_f32:683case aco_opcode::v_mac_f32:684case aco_opcode::v_writelane_b32:685case aco_opcode::v_writelane_b32_e64:686case aco_opcode::v_cndmask_b32: return operand != 2;687case aco_opcode::s_addk_i32:688case aco_opcode::s_mulk_i32:689case aco_opcode::p_wqm:690case aco_opcode::p_extract_vector:691case aco_opcode::p_split_vector:692case aco_opcode::v_readlane_b32:693case aco_opcode::v_readlane_b32_e64:694case aco_opcode::v_readfirstlane_b32:695case aco_opcode::p_extract:696case aco_opcode::p_insert: return operand != 0;697default: return true;698}699}700701bool702valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)703{704if (instr->opcode == aco_opcode::v_readlane_b32 ||705instr->opcode == aco_opcode::v_readlane_b32_e64 ||706instr->opcode == aco_opcode::v_writelane_b32 ||707instr->opcode == aco_opcode::v_writelane_b32_e64)708return operand != 1;709if (instr->opcode == aco_opcode::v_permlane16_b32 ||710instr->opcode == aco_opcode::v_permlanex16_b32)711return operand == 0;712return true;713}714715/* check constant bus and literal limitations */716bool717check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)718{719int limit = ctx.program->chip_class >= GFX10 ? 
2 : 1;720Operand literal32(s1);721Operand literal64(s2);722unsigned num_sgprs = 0;723unsigned sgpr[] = {0, 0};724725for (unsigned i = 0; i < num_operands; i++) {726Operand op = operands[i];727728if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {729/* two reads of the same SGPR count as 1 to the limit */730if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {731if (num_sgprs < 2)732sgpr[num_sgprs++] = op.tempId();733limit--;734if (limit < 0)735return false;736}737} else if (op.isLiteral()) {738if (ctx.program->chip_class < GFX10)739return false;740741if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())742return false;743if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())744return false;745746/* Any number of 32-bit literals counts as only 1 to the limit. Same747* (but separately) for 64-bit literals. */748if (op.size() == 1 && literal32.isUndefined()) {749limit--;750literal32 = op;751} else if (op.size() == 2 && literal64.isUndefined()) {752limit--;753literal64 = op;754}755756if (limit < 0)757return false;758}759}760761return true;762}763764bool765parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,766bool prevent_overflow)767{768Operand op = instr->operands[op_index];769770if (!op.isTemp())771return false;772Temp tmp = op.getTemp();773if (!ctx.info[tmp.id()].is_add_sub())774return false;775776Instruction* add_instr = ctx.info[tmp.id()].instr;777778switch (add_instr->opcode) {779case aco_opcode::v_add_u32:780case aco_opcode::v_add_co_u32:781case aco_opcode::v_add_co_u32_e64:782case aco_opcode::s_add_i32:783case aco_opcode::s_add_u32: break;784default: return false;785}786if (prevent_overflow && !add_instr->definitions[0].isNUW())787return false;788789if (add_instr->usesModifiers())790return false;791792for (unsigned i = 0; i < 2; i++) {793if (add_instr->operands[i].isConstant()) {794*offset = add_instr->operands[i].constantValue();795} else if 
(add_instr->operands[i].isTemp() &&796ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {797*offset = ctx.info[add_instr->operands[i].tempId()].val;798} else {799continue;800}801if (!add_instr->operands[!i].isTemp())802continue;803804uint32_t offset2 = 0;805if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {806*offset += offset2;807} else {808*base = add_instr->operands[!i].getTemp();809}810return true;811}812813return false;814}815816unsigned817get_operand_size(aco_ptr<Instruction>& instr, unsigned index)818{819if (instr->isPseudo())820return instr->operands[index].bytes() * 8u;821else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||822instr->opcode == aco_opcode::v_mad_i64_i32)823return index == 2 ? 64 : 32;824else if (instr->isVALU() || instr->isSALU())825return instr_info.operand_size[(int)instr->opcode];826else827return 0;828}829830Operand831get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)832{833if (bits == 64)834return Operand::c32_or_c64(info.val, true);835return Operand::get_const(ctx.program->chip_class, info.val, bits / 8u);836}837838bool839fixed_to_exec(Operand op)840{841return op.isFixed() && op.physReg() == exec;842}843844int845parse_extract(Instruction* instr)846{847if (instr->opcode == aco_opcode::p_extract) {848bool is_byte = instr->operands[2].constantEquals(8);849unsigned index = instr->operands[1].constantValue();850unsigned sel = (is_byte ? sdwa_ubyte0 : sdwa_uword0) + index;851if (!instr->operands[3].constantEquals(0))852sel |= sdwa_sext;853return sel;854} else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {855return instr->operands[2].constantEquals(8) ? sdwa_ubyte0 : sdwa_uword0;856} else {857return -1;858}859}860861int862parse_insert(Instruction* instr)863{864if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&865instr->operands[1].constantEquals(0)) {866return instr->operands[2].constantEquals(8) ? 
sdwa_ubyte0 : sdwa_uword0;867} else if (instr->opcode == aco_opcode::p_insert) {868bool is_byte = instr->operands[2].constantEquals(8);869unsigned index = instr->operands[1].constantValue();870unsigned sel = (is_byte ? sdwa_ubyte0 : sdwa_uword0) + index;871return sel;872} else {873return -1;874}875}876877bool878can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)879{880if (idx >= 2)881return false;882883Temp tmp = info.instr->operands[0].getTemp();884unsigned sel = parse_extract(info.instr);885886if (sel == sdwa_udword || sel == sdwa_sdword) {887return true;888} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) {889return true;890} else if (can_use_SDWA(ctx.program->chip_class, instr, true) &&891(tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {892if (instr->isSDWA() &&893(static_cast<SDWA_instruction*>(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword)894return false;895return true;896} else if (instr->isVOP3() && (sel & sdwa_isword) &&897can_use_opsel(ctx.program->chip_class, instr->opcode, idx, (sel & sdwa_wordnum)) &&898!(instr->vop3().opsel & (1 << idx))) {899return true;900} else {901return false;902}903}904905/* Combine an p_extract (or p_insert, in some cases) instruction with instr.906* instr(p_extract(...)) -> instr()907*/908void909apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)910{911Temp tmp = info.instr->operands[0].getTemp();912unsigned sel = parse_extract(info.instr);913914if (sel == sdwa_udword || sel == sdwa_sdword) {915} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) {916switch (sel) {917case sdwa_ubyte0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;918case sdwa_ubyte1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;919case sdwa_ubyte2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;920case sdwa_ubyte3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;921}922} else if 
(can_use_SDWA(ctx.program->chip_class, instr, true) &&923(tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {924to_SDWA(ctx, instr);925static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;926} else if (instr->isVOP3()) {927if (sel & sdwa_wordnum)928instr->vop3().opsel |= 1 << idx;929}930931ctx.info[tmp.id()].label &= ~label_insert;932/* label_vopc seems to be the only one worth keeping at the moment */933for (Definition& def : instr->definitions)934ctx.info[def.tempId()].label &= label_vopc;935}936937void938check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)939{940/* only VALU can use SDWA */941if (!instr->isVALU())942return;943944for (unsigned i = 0; i < instr->operands.size(); i++) {945Operand op = instr->operands[i];946if (!op.isTemp())947continue;948ssa_info& info = ctx.info[op.tempId()];949if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||950op.getTemp().type() == RegType::sgpr)) {951if (!can_apply_extract(ctx, instr, i, info))952info.label &= ~label_extract;953}954}955}956957bool958does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)959{960if (ctx.program->chip_class <= GFX8) {961switch (op) {962case aco_opcode::v_min_f32:963case aco_opcode::v_max_f32:964case aco_opcode::v_med3_f32:965case aco_opcode::v_min3_f32:966case aco_opcode::v_max3_f32:967case aco_opcode::v_min_f16:968case aco_opcode::v_max_f16: return false;969default: break;970}971}972return op != aco_opcode::v_cndmask_b32;973}974975bool976can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp)977{978float_mode* fp = &ctx.fp_mode;979if (ctx.info[tmp.id()].is_canonicalized() ||980(tmp.bytes() == 4 ? 
fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)981return true;982983aco_opcode op = instr->opcode;984return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op);985}986987bool988is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info)989{990return info.is_temp() ||991(info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp));992}993994bool995is_op_canonicalized(opt_ctx& ctx, Operand op)996{997float_mode* fp = &ctx.fp_mode;998if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||999(op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)1000return true;10011002if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {1003uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();1004if (op.bytes() == 2)1005return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;1006else if (op.bytes() == 4)1007return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;1008}1009return false;1010}10111012void1013label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)1014{1015if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {1016ASSERTED bool all_const = false;1017for (Operand& op : instr->operands)1018all_const =1019all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));1020perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());10211022ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||1023instr->opcode == aco_opcode::s_mov_b64 ||1024instr->opcode == aco_opcode::v_mov_b32;1025perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",1026instr.get());1027}10281029for (unsigned i = 0; i < instr->operands.size(); i++) {1030if (!instr->operands[i].isTemp())1031continue;10321033ssa_info info = ctx.info[instr->operands[i].tempId()];1034/* propagate undef */1035if (info.is_undefined() && is_phi(instr))1036instr->operands[i] = 
Operand(instr->operands[i].regClass());1037/* propagate reg->reg of same type */1038while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {1039instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);1040info = ctx.info[info.temp.id()];1041}10421043/* PSEUDO: propagate temporaries */1044if (instr->isPseudo()) {1045while (info.is_temp()) {1046pseudo_propagate_temp(ctx, instr, info.temp, i);1047info = ctx.info[info.temp.id()];1048}1049}10501051/* SALU / PSEUDO: propagate inline constants */1052if (instr->isSALU() || instr->isPseudo()) {1053unsigned bits = get_operand_size(instr, i);1054if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&1055!instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {1056instr->operands[i] = get_constant_op(ctx, info, bits);1057continue;1058}1059}10601061/* VALU: propagate neg, abs & inline constants */1062else if (instr->isVALU()) {1063if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr &&1064valu_can_accept_vgpr(instr, i)) {1065instr->operands[i].setTemp(info.temp);1066info = ctx.info[info.temp.id()];1067}1068/* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */1069if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&1070instr->operands.size() == 1) {1071instr->operands[i].setTemp(info.temp);1072info = ctx.info[info.temp.id()];1073}10741075/* for instructions other than v_cndmask_b32, the size of the instruction should match the1076* operand size */1077unsigned can_use_mod =1078instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;1079can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];10801081if (instr->isSDWA())1082can_use_mod = can_use_mod && (instr->sdwa().sel[i] & sdwa_asuint) == sdwa_udword;1083else1084can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, 
instr));10851086if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {1087instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;1088instr->operands[i].setTemp(info.temp);1089} else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {1090instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;1091instr->operands[i].setTemp(info.temp);1092} else if (info.is_neg() && can_use_mod &&1093can_eliminate_fcanonicalize(ctx, instr, info.temp)) {1094if (!instr->isDPP() && !instr->isSDWA())1095to_VOP3(ctx, instr);1096instr->operands[i].setTemp(info.temp);1097if (instr->isDPP() && !instr->dpp().abs[i])1098instr->dpp().neg[i] = true;1099else if (instr->isSDWA() && !instr->sdwa().abs[i])1100instr->sdwa().neg[i] = true;1101else if (instr->isVOP3() && !instr->vop3().abs[i])1102instr->vop3().neg[i] = true;1103}1104if (info.is_abs() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) {1105if (!instr->isDPP() && !instr->isSDWA())1106to_VOP3(ctx, instr);1107instr->operands[i] = Operand(info.temp);1108if (instr->isDPP())1109instr->dpp().abs[i] = true;1110else if (instr->isSDWA())1111instr->sdwa().abs[i] = true;1112else1113instr->vop3().abs[i] = true;1114continue;1115}1116unsigned bits = get_operand_size(instr, i);1117if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&1118(!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {1119Operand op = get_constant_op(ctx, info, bits);1120perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,1121"v_cndmask_b32 with a constant selector", instr.get());1122if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||1123instr->opcode == aco_opcode::v_readlane_b32 ||1124instr->opcode == aco_opcode::v_writelane_b32) {1125instr->operands[i] = op;1126continue;1127} else if (!instr->isVOP3() && can_swap_operands(instr)) {1128instr->operands[i] = instr->operands[0];1129instr->operands[0] = op;1130continue;1131} else if (can_use_VOP3(ctx, instr)) 
{1132to_VOP3(ctx, instr);1133instr->operands[i] = op;1134continue;1135}1136}1137}11381139/* MUBUF: propagate constants and combine additions */1140else if (instr->isMUBUF()) {1141MUBUF_instruction& mubuf = instr->mubuf();1142Temp base;1143uint32_t offset;1144while (info.is_temp())1145info = ctx.info[info.temp.id()];11461147/* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr1148* overflow for scratch accesses works only on GFX9+ and saddr overflow1149* never works. Since swizzling is the only thing that separates1150* scratch accesses and other accesses and swizzling changing how1151* addressing works significantly, this probably applies to swizzled1152* MUBUF accesses. */1153bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->chip_class < GFX9;1154bool saddr_prevent_overflow = mubuf.swizzled;11551156if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&1157mubuf.offset + info.val < 4096) {1158assert(!mubuf.idxen);1159instr->operands[1] = Operand(v1);1160mubuf.offset += info.val;1161mubuf.offen = false;1162continue;1163} else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {1164instr->operands[2] = Operand::c32(0);1165mubuf.offset += info.val;1166continue;1167} else if (mubuf.offen && i == 1 &&1168parse_base_offset(ctx, instr.get(), i, &base, &offset,1169vaddr_prevent_overflow) &&1170base.regClass() == v1 && mubuf.offset + offset < 4096) {1171assert(!mubuf.idxen);1172instr->operands[1].setTemp(base);1173mubuf.offset += offset;1174continue;1175} else if (i == 2 &&1176parse_base_offset(ctx, instr.get(), i, &base, &offset,1177saddr_prevent_overflow) &&1178base.regClass() == s1 && mubuf.offset + offset < 4096) {1179instr->operands[i].setTemp(base);1180mubuf.offset += offset;1181continue;1182}1183}11841185/* DS: combine additions */1186else if (instr->isDS()) {11871188DS_instruction& ds = instr->ds();1189Temp base;1190uint32_t offset;1191bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;1192if 
(has_usable_ds_offset && i == 0 &&1193parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&1194base.regClass() == instr->operands[i].regClass() &&1195instr->opcode != aco_opcode::ds_swizzle_b32) {1196if (instr->opcode == aco_opcode::ds_write2_b32 ||1197instr->opcode == aco_opcode::ds_read2_b32 ||1198instr->opcode == aco_opcode::ds_write2_b64 ||1199instr->opcode == aco_opcode::ds_read2_b64) {1200unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 ||1201instr->opcode == aco_opcode::ds_read2_b64)1202? 0x71203: 0x3;1204unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 ||1205instr->opcode == aco_opcode::ds_read2_b64)1206? 31207: 2;12081209if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&1210ds.offset1 + (offset >> shifts) <= 255) {1211instr->operands[i].setTemp(base);1212ds.offset0 += offset >> shifts;1213ds.offset1 += offset >> shifts;1214}1215} else {1216if (ds.offset0 + offset <= 65535) {1217instr->operands[i].setTemp(base);1218ds.offset0 += offset;1219}1220}1221}1222}12231224/* SMEM: propagate constants and combine additions */1225else if (instr->isSMEM()) {12261227SMEM_instruction& smem = instr->smem();1228Temp base;1229uint32_t offset;1230bool prevent_overflow = smem.operands[0].size() > 2 || smem.prevent_overflow;1231if (i == 1 && info.is_constant_or_literal(32) &&1232((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||1233(ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||1234(ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {1235instr->operands[i] = Operand::c32(info.val);1236continue;1237} else if (i == 1 &&1238parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) &&1239base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {1240bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 
3 : 4);1241if (soe && (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) ||1242ctx.info[smem.operands.back().tempId()].val != 0)) {1243continue;1244}1245if (soe) {1246smem.operands[1] = Operand::c32(offset);1247smem.operands.back() = Operand(base);1248} else {1249SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(1250smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());1251new_instr->operands[0] = smem.operands[0];1252new_instr->operands[1] = Operand::c32(offset);1253if (smem.definitions.empty())1254new_instr->operands[2] = smem.operands[2];1255new_instr->operands.back() = Operand(base);1256if (!smem.definitions.empty())1257new_instr->definitions[0] = smem.definitions[0];1258new_instr->sync = smem.sync;1259new_instr->glc = smem.glc;1260new_instr->dlc = smem.dlc;1261new_instr->nv = smem.nv;1262new_instr->disable_wqm = smem.disable_wqm;1263instr.reset(new_instr);1264}1265continue;1266}1267}12681269else if (instr->isBranch()) {1270if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {1271/* Flip the branch instruction to get rid of the scc_invert instruction */1272instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz1273: aco_opcode::p_cbranch_z;1274instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);1275}1276}1277}12781279/* if this instruction doesn't define anything, return */1280if (instr->definitions.empty()) {1281check_sdwa_extract(ctx, instr);1282return;1283}12841285if (instr->isVALU() || instr->isVINTRP()) {1286if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||1287instr->opcode == aco_opcode::v_cndmask_b32) {1288bool canonicalized = true;1289if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {1290unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 
2 : instr->operands.size();1291for (unsigned i = 0; canonicalized && (i < ops); i++)1292canonicalized = is_op_canonicalized(ctx, instr->operands[i]);1293}1294if (canonicalized)1295ctx.info[instr->definitions[0].tempId()].set_canonicalized();1296}12971298if (instr->isVOPC()) {1299ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());1300check_sdwa_extract(ctx, instr);1301return;1302}1303if (instr->isVOP3P()) {1304ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());1305return;1306}1307}13081309switch (instr->opcode) {1310case aco_opcode::p_create_vector: {1311bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&1312instr->operands[0].regClass() == instr->definitions[0].regClass();1313if (copy_prop) {1314ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());1315break;1316}13171318/* expand vector operands */1319std::vector<Operand> ops;1320unsigned offset = 0;1321for (const Operand& op : instr->operands) {1322/* ensure that any expanded operands are properly aligned */1323bool aligned = offset % 4 == 0 || op.bytes() < 4;1324offset += op.bytes();1325if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {1326Instruction* vec = ctx.info[op.tempId()].instr;1327for (const Operand& vec_op : vec->operands)1328ops.emplace_back(vec_op);1329} else {1330ops.emplace_back(op);1331}1332}13331334/* combine expanded operands to new vector */1335if (ops.size() != instr->operands.size()) {1336assert(ops.size() > instr->operands.size());1337Definition def = instr->definitions[0];1338instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,1339Format::PSEUDO, ops.size(), 1));1340for (unsigned i = 0; i < ops.size(); i++) {1341if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&1342ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())1343ops[i].setTemp(ctx.info[ops[i].tempId()].temp);1344instr->operands[i] = ops[i];1345}1346instr->definitions[0] = def;1347} else {1348for 
(unsigned i = 0; i < ops.size(); i++) {1349assert(instr->operands[i] == ops[i]);1350}1351}1352ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());1353break;1354}1355case aco_opcode::p_split_vector: {1356ssa_info& info = ctx.info[instr->operands[0].tempId()];13571358if (info.is_constant_or_literal(32)) {1359uint32_t val = info.val;1360for (Definition def : instr->definitions) {1361uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);1362ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask);1363val >>= def.bytes() * 8u;1364}1365break;1366} else if (!info.is_vec()) {1367break;1368}13691370Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;1371unsigned split_offset = 0;1372unsigned vec_offset = 0;1373unsigned vec_index = 0;1374for (unsigned i = 0; i < instr->definitions.size();1375split_offset += instr->definitions[i++].bytes()) {1376while (vec_offset < split_offset && vec_index < vec->operands.size())1377vec_offset += vec->operands[vec_index++].bytes();13781379if (vec_offset != split_offset ||1380vec->operands[vec_index].bytes() != instr->definitions[i].bytes())1381continue;13821383Operand vec_op = vec->operands[vec_index];1384if (vec_op.isConstant()) {1385ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class,1386vec_op.constantValue64());1387} else if (vec_op.isUndefined()) {1388ctx.info[instr->definitions[i].tempId()].set_undefined();1389} else {1390assert(vec_op.isTemp());1391ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());1392}1393}1394break;1395}1396case aco_opcode::p_extract_vector: { /* mov */1397ssa_info& info = ctx.info[instr->operands[0].tempId()];1398const unsigned index = instr->operands[1].constantValue();1399const unsigned dst_offset = index * instr->definitions[0].bytes();14001401if (info.is_vec()) {1402/* check if we index directly into a vector element */1403Instruction* vec = info.instr;1404unsigned offset = 0;14051406for (const Operand& op : vec->operands) {1407if 
(offset < dst_offset) {1408offset += op.bytes();1409continue;1410} else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {1411break;1412}1413instr->operands[0] = op;1414break;1415}1416} else if (info.is_constant_or_literal(32)) {1417/* propagate constants */1418uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);1419uint32_t val = (info.val >> (dst_offset * 8u)) & mask;1420instr->operands[0] =1421Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());1422;1423} else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) {1424ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());1425}14261427if (instr->operands[0].bytes() != instr->definitions[0].bytes())1428break;14291430/* convert this extract into a copy instruction */1431instr->opcode = aco_opcode::p_parallelcopy;1432instr->operands.pop_back();1433FALLTHROUGH;1434}1435case aco_opcode::p_parallelcopy: /* propagate */1436if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&1437instr->operands[0].regClass() != instr->definitions[0].regClass()) {1438/* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so1439* duplicate the vector instead.1440*/1441Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;1442aco_ptr<Instruction> old_copy = std::move(instr);14431444instr.reset(create_instruction<Pseudo_instruction>(1445aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));1446instr->definitions[0] = old_copy->definitions[0];1447std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());1448for (unsigned i = 0; i < vec->operands.size(); i++) {1449Operand& op = instr->operands[i];1450if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&1451ctx.info[op.tempId()].temp.type() == 
instr->definitions[0].regClass().type())1452op.setTemp(ctx.info[op.tempId()].temp);1453}1454ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());1455break;1456}1457FALLTHROUGH;1458case aco_opcode::p_as_uniform:1459if (instr->definitions[0].isFixed()) {1460/* don't copy-propagate copies into fixed registers */1461} else if (instr->usesModifiers()) {1462// TODO1463} else if (instr->operands[0].isConstant()) {1464ctx.info[instr->definitions[0].tempId()].set_constant(1465ctx.program->chip_class, instr->operands[0].constantValue64());1466} else if (instr->operands[0].isTemp()) {1467ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());1468if (ctx.info[instr->operands[0].tempId()].is_canonicalized())1469ctx.info[instr->definitions[0].tempId()].set_canonicalized();1470} else {1471assert(instr->operands[0].isFixed());1472}1473break;1474case aco_opcode::p_is_helper:1475if (!ctx.program->needs_wqm)1476ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);1477break;1478case aco_opcode::v_mul_f16:1479case aco_opcode::v_mul_f32: { /* omod */1480ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());14811482/* TODO: try to move the negate/abs modifier to the consumer instead */1483bool uses_mods = instr->usesModifiers();1484bool fp16 = instr->opcode == aco_opcode::v_mul_f16;14851486for (unsigned i = 0; i < 2; i++) {1487if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {1488if (!instr->isDPP() && !instr->isSDWA() &&1489(instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */1490instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */1491bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);14921493VOP3_instruction* vop3 = instr->isVOP3() ? 
&instr->vop3() : NULL;1494if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))1495continue;14961497bool abs = vop3 && vop3->abs[i];1498bool neg = neg1 ^ (vop3 && vop3->neg[i]);14991500Temp other = instr->operands[i].getTemp();1501if (abs && neg && other.type() == RegType::vgpr)1502ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);1503else if (abs && !neg && other.type() == RegType::vgpr)1504ctx.info[instr->definitions[0].tempId()].set_abs(other);1505else if (!abs && neg && other.type() == RegType::vgpr)1506ctx.info[instr->definitions[0].tempId()].set_neg(other);1507else if (!abs && !neg)1508ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);1509} else if (uses_mods) {1510continue;1511} else if (instr->operands[!i].constantValue() ==1512(fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */1513ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());1514} else if (instr->operands[!i].constantValue() ==1515(fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */1516ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());1517} else if (instr->operands[!i].constantValue() ==1518(fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */1519ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());1520} else if (instr->operands[!i].constantValue() == 0u &&1521!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_641522: ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */1523ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);1524} else {1525continue;1526}1527break;1528}1529}1530break;1531}1532case aco_opcode::v_mul_lo_u16:1533if (instr->definitions[0].isNUW()) {1534/* Most of 16-bit mul optimizations are only valid if no overflow. 
*/1535ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());1536}1537break;1538case aco_opcode::v_mul_u32_u24:1539ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());1540break;1541case aco_opcode::v_med3_f16:1542case aco_opcode::v_med3_f32: { /* clamp */1543VOP3_instruction& vop3 = instr->vop3();1544if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] ||1545vop3.omod != 0 || vop3.opsel != 0)1546break;15471548unsigned idx = 0;1549bool found_zero = false, found_one = false;1550bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;1551for (unsigned i = 0; i < 3; i++) {1552if (instr->operands[i].constantEquals(0))1553found_zero = true;1554else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */1555found_one = true;1556else1557idx = i;1558}1559if (found_zero && found_one && instr->operands[idx].isTemp())1560ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());1561break;1562}1563case aco_opcode::v_cndmask_b32:1564if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))1565ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());1566else if (instr->operands[0].constantEquals(0) &&1567instr->operands[1].constantEquals(0x3f800000u))1568ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());1569else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))1570ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());15711572ctx.info[instr->operands[2].tempId()].set_vcc_hint();1573break;1574case aco_opcode::v_cmp_lg_u32:1575if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */1576instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&1577ctx.info[instr->operands[1].tempId()].is_vcc())1578ctx.info[instr->definitions[0].tempId()].set_temp(1579ctx.info[instr->operands[1].tempId()].temp);1580break;1581case 
aco_opcode::p_linear_phi: {1582/* lower_bool_phis() can create phis like this */1583bool all_same_temp = instr->operands[0].isTemp();1584/* this check is needed when moving uniform loop counters out of a divergent loop */1585if (all_same_temp)1586all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();1587for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {1588if (!instr->operands[i].isTemp() ||1589instr->operands[i].tempId() != instr->operands[0].tempId())1590all_same_temp = false;1591}1592if (all_same_temp) {1593ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());1594} else {1595bool all_undef = instr->operands[0].isUndefined();1596for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {1597if (!instr->operands[i].isUndefined())1598all_undef = false;1599}1600if (all_undef)1601ctx.info[instr->definitions[0].tempId()].set_undefined();1602}1603break;1604}1605case aco_opcode::v_add_u32:1606case aco_opcode::v_add_co_u32:1607case aco_opcode::v_add_co_u32_e64:1608case aco_opcode::s_add_i32:1609case aco_opcode::s_add_u32:1610case aco_opcode::v_subbrev_co_u32:1611ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());1612break;1613case aco_opcode::s_not_b32:1614case aco_opcode::s_not_b64:1615if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {1616ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();1617ctx.info[instr->definitions[1].tempId()].set_scc_invert(1618ctx.info[instr->operands[0].tempId()].temp);1619} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {1620ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();1621ctx.info[instr->definitions[1].tempId()].set_scc_invert(1622ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());1623}1624ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());1625break;1626case aco_opcode::s_and_b32:1627case aco_opcode::s_and_b64:1628if 
(fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {1629if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {1630/* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a1631* uniform bool into divergent */1632ctx.info[instr->definitions[1].tempId()].set_temp(1633ctx.info[instr->operands[0].tempId()].temp);1634ctx.info[instr->definitions[0].tempId()].set_uniform_bool(1635ctx.info[instr->operands[0].tempId()].temp);1636break;1637} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {1638/* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction1639* already produces the same SCC */1640ctx.info[instr->definitions[1].tempId()].set_temp(1641ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());1642ctx.info[instr->definitions[0].tempId()].set_uniform_bool(1643ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());1644break;1645} else if (ctx.info[instr->operands[0].tempId()].is_vopc()) {1646Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr;1647/* Remove superfluous s_and when the VOPC instruction uses the same exec and thus1648* already produces the same result */1649if (vopc_instr->pass_flags == instr->pass_flags) {1650assert(instr->pass_flags > 0);1651ctx.info[instr->definitions[0].tempId()].set_temp(1652vopc_instr->definitions[0].getTemp());1653break;1654}1655}1656}1657FALLTHROUGH;1658case aco_opcode::s_or_b32:1659case aco_opcode::s_or_b64:1660case aco_opcode::s_xor_b32:1661case aco_opcode::s_xor_b64:1662if (std::all_of(instr->operands.begin(), instr->operands.end(),1663[&ctx](const Operand& op)1664{1665return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||1666ctx.info[op.tempId()].is_uniform_bitwise());1667})) {1668ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();1669}1670FALLTHROUGH;1671case aco_opcode::s_lshl_b32:1672case aco_opcode::v_or_b32:1673case aco_opcode::v_lshlrev_b32:1674case 
aco_opcode::v_bcnt_u32_b32:1675case aco_opcode::v_and_b32:1676case aco_opcode::v_xor_b32:1677ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());1678break;1679case aco_opcode::v_min_f32:1680case aco_opcode::v_min_f16:1681case aco_opcode::v_min_u32:1682case aco_opcode::v_min_i32:1683case aco_opcode::v_min_u16:1684case aco_opcode::v_min_i16:1685case aco_opcode::v_max_f32:1686case aco_opcode::v_max_f16:1687case aco_opcode::v_max_u32:1688case aco_opcode::v_max_i32:1689case aco_opcode::v_max_u16:1690case aco_opcode::v_max_i16:1691ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());1692break;1693case aco_opcode::s_cselect_b64:1694case aco_opcode::s_cselect_b32:1695if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {1696/* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */1697ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());1698}1699if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {1700/* Flip the operands to get rid of the scc_invert instruction */1701std::swap(instr->operands[0], instr->operands[1]);1702instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);1703}1704break;1705case aco_opcode::p_wqm:1706if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {1707ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());1708}1709break;1710case aco_opcode::s_mul_i32:1711/* Testing every uint32_t shows that 0x3f800000*n is never a denormal.1712* This pattern is created from a uniform nir_op_b2f. 
*/1713if (instr->operands[0].constantEquals(0x3f800000u))1714ctx.info[instr->definitions[0].tempId()].set_canonicalized();1715break;1716case aco_opcode::p_extract: {1717if (instr->definitions[0].bytes() == 4) {1718ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());1719if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()) >= 0)1720ctx.info[instr->operands[0].tempId()].set_insert(instr.get());1721}1722break;1723}1724case aco_opcode::p_insert: {1725if (instr->operands[0].bytes() == 4) {1726if (instr->operands[0].regClass() == v1)1727ctx.info[instr->operands[0].tempId()].set_insert(instr.get());1728if (parse_extract(instr.get()) >= 0)1729ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());1730ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());1731}1732break;1733}1734default: break;1735}17361737/* Don't remove label_extract if we can't apply the extract to1738* neg/abs instructions because we'll likely combine it into another valu. */1739if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))1740check_sdwa_extract(ctx, instr);1741}17421743ALWAYS_INLINE bool1744get_cmp_info(aco_opcode op, CmpInfo* info)1745{1746info->ordered = aco_opcode::num_opcodes;1747info->unordered = aco_opcode::num_opcodes;1748info->ordered_swapped = aco_opcode::num_opcodes;1749info->unordered_swapped = aco_opcode::num_opcodes;1750switch (op) {1751// clang-format off1752#define CMP2(ord, unord, ord_swap, unord_swap, sz) \1753case aco_opcode::v_cmp_##ord##_f##sz: \1754case aco_opcode::v_cmp_n##unord##_f##sz: \1755info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \1756info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \1757info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \1758info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \1759info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? 
aco_opcode::v_cmp_##unord##_f##sz \1760: aco_opcode::v_cmp_n##ord##_f##sz; \1761info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \1762: aco_opcode::v_cmp_n##unord##_f32; \1763info->size = sz; \1764return true;1765#define CMP(ord, unord, ord_swap, unord_swap) \1766CMP2(ord, unord, ord_swap, unord_swap, 16) \1767CMP2(ord, unord, ord_swap, unord_swap, 32) \1768CMP2(ord, unord, ord_swap, unord_swap, 64)1769CMP(lt, /*n*/ge, gt, /*n*/le)1770CMP(eq, /*n*/lg, eq, /*n*/lg)1771CMP(le, /*n*/gt, ge, /*n*/lt)1772CMP(gt, /*n*/le, lt, /*n*/le)1773CMP(lg, /*n*/eq, lg, /*n*/eq)1774CMP(ge, /*n*/lt, le, /*n*/gt)1775#undef CMP1776#undef CMP21777#define ORD_TEST(sz) \1778case aco_opcode::v_cmp_u_f##sz: \1779info->f32 = aco_opcode::v_cmp_u_f32; \1780info->inverse = aco_opcode::v_cmp_o_f##sz; \1781info->size = sz; \1782return true; \1783case aco_opcode::v_cmp_o_f##sz: \1784info->f32 = aco_opcode::v_cmp_o_f32; \1785info->inverse = aco_opcode::v_cmp_u_f##sz; \1786info->size = sz; \1787return true;1788ORD_TEST(16)1789ORD_TEST(32)1790ORD_TEST(64)1791#undef ORD_TEST1792// clang-format on1793default: return false;1794}1795}17961797aco_opcode1798get_ordered(aco_opcode op)1799{1800CmpInfo info;1801return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes;1802}18031804aco_opcode1805get_unordered(aco_opcode op)1806{1807CmpInfo info;1808return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes;1809}18101811aco_opcode1812get_inverse(aco_opcode op)1813{1814CmpInfo info;1815return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;1816}18171818aco_opcode1819get_f32_cmp(aco_opcode op)1820{1821CmpInfo info;1822return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes;1823}18241825unsigned1826get_cmp_bitsize(aco_opcode op)1827{1828CmpInfo info;1829return get_cmp_info(op, &info) ? 
info.size : 0;1830}18311832bool1833is_cmp(aco_opcode op)1834{1835CmpInfo info;1836return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;1837}18381839unsigned1840original_temp_id(opt_ctx& ctx, Temp tmp)1841{1842if (ctx.info[tmp.id()].is_temp())1843return ctx.info[tmp.id()].temp.id();1844else1845return tmp.id();1846}18471848void1849decrease_uses(opt_ctx& ctx, Instruction* instr)1850{1851if (!--ctx.uses[instr->definitions[0].tempId()]) {1852for (const Operand& op : instr->operands) {1853if (op.isTemp())1854ctx.uses[op.tempId()]--;1855}1856}1857}18581859Instruction*1860follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)1861{1862if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))1863return nullptr;1864if (!ignore_uses && ctx.uses[op.tempId()] > 1)1865return nullptr;18661867Instruction* instr = ctx.info[op.tempId()].instr;18681869if (instr->definitions.size() == 2) {1870assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());1871if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])1872return nullptr;1873}18741875return instr;1876}18771878/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)1879* s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */1880bool1881combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)1882{1883if (instr->definitions[0].regClass() != ctx.program->lane_mask)1884return false;1885if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])1886return false;18871888bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;18891890bool neg[2] = {false, false};1891bool abs[2] = {false, false};1892uint8_t opsel = 0;1893Instruction* op_instr[2];1894Temp op[2];18951896unsigned bitsize = 0;1897for (unsigned i = 0; i < 2; i++) {1898op_instr[i] = follow_operand(ctx, instr->operands[i], true);1899if (!op_instr[i])1900return false;19011902aco_opcode expected_cmp = is_or ? 
aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;1903unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);19041905if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)1906return false;1907if (bitsize && op_bitsize != bitsize)1908return false;1909if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())1910return false;19111912if (op_instr[i]->isVOP3()) {1913VOP3_instruction& vop3 = op_instr[i]->vop3();1914if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||1915vop3.opsel == 2)1916return false;1917neg[i] = vop3.neg[0];1918abs[i] = vop3.abs[0];1919opsel |= (vop3.opsel & 1) << i;1920} else if (op_instr[i]->isSDWA()) {1921return false;1922}19231924Temp op0 = op_instr[i]->operands[0].getTemp();1925Temp op1 = op_instr[i]->operands[1].getTemp();1926if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))1927return false;19281929op[i] = op1;1930bitsize = op_bitsize;1931}19321933if (op[1].type() == RegType::sgpr)1934std::swap(op[0], op[1]);1935unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);1936if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))1937return false;19381939ctx.uses[op[0].id()]++;1940ctx.uses[op[1].id()]++;1941decrease_uses(ctx, op_instr[0]);1942decrease_uses(ctx, op_instr[1]);19431944aco_opcode new_op = aco_opcode::num_opcodes;1945switch (bitsize) {1946case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;1947case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;1948case 64: new_op = is_or ? 
aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;1949}1950Instruction* new_instr;1951if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {1952VOP3_instruction* vop3 =1953create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);1954for (unsigned i = 0; i < 2; i++) {1955vop3->neg[i] = neg[i];1956vop3->abs[i] = abs[i];1957}1958vop3->opsel = opsel;1959new_instr = static_cast<Instruction*>(vop3);1960} else {1961new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);1962instr->definitions[0].setHint(vcc);1963}1964new_instr->operands[0] = Operand(op[0]);1965new_instr->operands[1] = Operand(op[1]);1966new_instr->definitions[0] = instr->definitions[0];19671968ctx.info[instr->definitions[0].tempId()].label = 0;1969ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);19701971instr.reset(new_instr);19721973return true;1974}19751976/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)1977* s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */1978bool1979combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)1980{1981if (instr->definitions[0].regClass() != ctx.program->lane_mask)1982return false;1983if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])1984return false;19851986bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;1987aco_opcode expected_nan_test = is_or ? 
aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;19881989Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);1990Instruction* cmp = follow_operand(ctx, instr->operands[1], true);1991if (!nan_test || !cmp)1992return false;1993if (nan_test->isSDWA() || cmp->isSDWA())1994return false;19951996if (get_f32_cmp(cmp->opcode) == expected_nan_test)1997std::swap(nan_test, cmp);1998else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)1999return false;20002001if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))2002return false;20032004if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())2005return false;2006if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())2007return false;20082009unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());2010unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());2011unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());2012unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());2013if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)2014return false;2015if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)2016return false;20172018ctx.uses[cmp->operands[0].tempId()]++;2019ctx.uses[cmp->operands[1].tempId()]++;2020decrease_uses(ctx, nan_test);2021decrease_uses(ctx, cmp);20222023aco_opcode new_op = is_or ? 
/* Reports whether "value", read as a floating-point bit pattern that is
 * "bit_size" bits wide, encodes a NaN: every exponent bit is set and the
 * mantissa is non-zero.
 *
 * Field widths used: 16-bit -> 5 exponent / 10 mantissa bits,
 * 32-bit -> 8 / 23, anything else is handled as 64-bit -> 11 / 52.
 * Bits above the exponent field (sign and any upper garbage) are ignored. */
bool
is_constant_nan(uint64_t value, unsigned bit_size)
{
   unsigned exponent_bits;
   unsigned mantissa_bits;
   switch (bit_size) {
   case 16:
      exponent_bits = 5;
      mantissa_bits = 10;
      break;
   case 32:
      exponent_bits = 8;
      mantissa_bits = 23;
      break;
   default:
      exponent_bits = 11;
      mantissa_bits = 52;
      break;
   }

   const uint64_t exponent_mask = (uint64_t{1} << exponent_bits) - 1;
   const uint64_t mantissa_mask = (uint64_t{1} << mantissa_bits) - 1;

   const bool exponent_all_ones = ((value >> mantissa_bits) & exponent_mask) == exponent_mask;
   const bool mantissa_non_zero = (value & mantissa_mask) != 0;
   return exponent_all_ones && mantissa_non_zero;
}
*/2080bool2081combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)2082{2083if (instr->definitions[0].regClass() != ctx.program->lane_mask)2084return false;2085if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])2086return false;20872088bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;20892090Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);2091Instruction* cmp = follow_operand(ctx, instr->operands[1], true);20922093if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA())2094return false;2095if (nan_test->isSDWA() || cmp->isSDWA())2096return false;20972098aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;2099if (get_f32_cmp(cmp->opcode) == expected_nan_test)2100std::swap(nan_test, cmp);2101else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)2102return false;21032104unsigned bit_size = get_cmp_bitsize(cmp->opcode);2105if (!is_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)2106return false;21072108if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())2109return false;2110if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())2111return false;21122113unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());2114unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());2115if (prop_nan0 != prop_nan1)2116return false;21172118if (nan_test->isVOP3()) {2119VOP3_instruction& vop3 = nan_test->vop3();2120if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||2121vop3.opsel == 2)2122return false;2123}21242125int constant_operand = -1;2126for (unsigned i = 0; i < 2; i++) {2127if (cmp->operands[i].isTemp() &&2128original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {2129constant_operand = !i;2130break;2131}2132}2133if (constant_operand == -1)2134return false;21352136uint64_t constant_value;2137if 
(!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))2138return false;2139if (is_constant_nan(constant_value, bit_size))2140return false;21412142if (cmp->operands[0].isTemp())2143ctx.uses[cmp->operands[0].tempId()]++;2144if (cmp->operands[1].isTemp())2145ctx.uses[cmp->operands[1].tempId()]++;2146decrease_uses(ctx, nan_test);2147decrease_uses(ctx, cmp);21482149aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);2150Instruction* new_instr;2151if (cmp->isVOP3()) {2152VOP3_instruction* new_vop3 =2153create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);2154VOP3_instruction& cmp_vop3 = cmp->vop3();2155memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));2156memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));2157new_vop3->clamp = cmp_vop3.clamp;2158new_vop3->omod = cmp_vop3.omod;2159new_vop3->opsel = cmp_vop3.opsel;2160new_instr = new_vop3;2161} else {2162new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);2163instr->definitions[0].setHint(vcc);2164}2165new_instr->operands[0] = cmp->operands[0];2166new_instr->operands[1] = cmp->operands[1];2167new_instr->definitions[0] = instr->definitions[0];21682169ctx.info[instr->definitions[0].tempId()].label = 0;2170ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);21712172instr.reset(new_instr);21732174return true;2175}21762177/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */2178bool2179combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)2180{2181if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)2182return false;2183if (ctx.uses[instr->definitions[1].tempId()])2184return false;21852186Instruction* cmp = follow_operand(ctx, instr->operands[1]);2187if (!cmp)2188return false;21892190aco_opcode new_opcode = get_inverse(cmp->opcode);2191if (new_opcode == aco_opcode::num_opcodes)2192return false;21932194if 
(cmp->operands[0].isTemp())2195ctx.uses[cmp->operands[0].tempId()]++;2196if (cmp->operands[1].isTemp())2197ctx.uses[cmp->operands[1].tempId()]++;2198decrease_uses(ctx, cmp);21992200/* This creates a new instruction instead of modifying the existing2201* comparison so that the comparison is done with the correct exec mask. */2202Instruction* new_instr;2203if (cmp->isVOP3()) {2204VOP3_instruction* new_vop3 =2205create_instruction<VOP3_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);2206VOP3_instruction& cmp_vop3 = cmp->vop3();2207memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));2208memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));2209new_vop3->clamp = cmp_vop3.clamp;2210new_vop3->omod = cmp_vop3.omod;2211new_vop3->opsel = cmp_vop3.opsel;2212new_instr = new_vop3;2213} else if (cmp->isSDWA()) {2214SDWA_instruction* new_sdwa = create_instruction<SDWA_instruction>(2215new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1);2216SDWA_instruction& cmp_sdwa = cmp->sdwa();2217memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs));2218memcpy(new_sdwa->sel, cmp_sdwa.sel, sizeof(new_sdwa->sel));2219memcpy(new_sdwa->neg, cmp_sdwa.neg, sizeof(new_sdwa->neg));2220new_sdwa->dst_sel = cmp_sdwa.dst_sel;2221new_sdwa->dst_preserve = cmp_sdwa.dst_preserve;2222new_sdwa->clamp = cmp_sdwa.clamp;2223new_sdwa->omod = cmp_sdwa.omod;2224new_instr = new_sdwa;2225} else {2226new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);2227instr->definitions[0].setHint(vcc);2228}2229new_instr->operands[0] = cmp->operands[0];2230new_instr->operands[1] = cmp->operands[1];2231new_instr->definitions[0] = instr->definitions[0];22322233ctx.info[instr->definitions[0].tempId()].label = 0;2234ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);22352236instr.reset(new_instr);22372238return true;2239}22402241/* op1(op2(1, 2), 0) if swap = false2242* op1(0, op2(1, 2)) if swap = true */2243bool2244match_op3_for_vop3(opt_ctx& ctx, 
aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,2245const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3],2246uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,2247bool* inbetween_abs, bool* inbetween_opsel, bool* precise)2248{2249/* checks */2250if (op1_instr->opcode != op1)2251return false;22522253Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);2254if (!op2_instr || op2_instr->opcode != op2)2255return false;2256if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))2257return false;22582259VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL;2260VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL;22612262if (op1_instr->isSDWA() || op2_instr->isSDWA())2263return false;22642265/* don't support inbetween clamp/omod */2266if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))2267return false;22682269/* get operands and modifiers and check inbetween modifiers */2270*op1_clamp = op1_vop3 ? op1_vop3->clamp : false;2271*op1_omod = op1_vop3 ? op1_vop3->omod : 0u;22722273if (inbetween_neg)2274*inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;2275else if (op1_vop3 && op1_vop3->neg[swap])2276return false;22772278if (inbetween_abs)2279*inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;2280else if (op1_vop3 && op1_vop3->abs[swap])2281return false;22822283if (inbetween_opsel)2284*inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false;2285else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap))2286return false;22872288*precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();22892290int shuffle[3];2291shuffle[shuffle_str[0] - '0'] = 0;2292shuffle[shuffle_str[1] - '0'] = 1;2293shuffle[shuffle_str[2] - '0'] = 2;22942295operands[shuffle[0]] = op1_instr->operands[!swap];2296neg[shuffle[0]] = op1_vop3 ? 
op1_vop3->neg[!swap] : false;2297abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;2298if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap)))2299*opsel |= 1 << shuffle[0];23002301for (unsigned i = 0; i < 2; i++) {2302operands[shuffle[i + 1]] = op2_instr->operands[i];2303neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;2304abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;2305if (op2_vop3 && op2_vop3->opsel & (1 << i))2306*opsel |= 1 << shuffle[i + 1];2307}23082309/* check operands */2310if (!check_vop3_operands(ctx, 3, operands))2311return false;23122313return true;2314}23152316void2317create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,2318Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp,2319unsigned omod)2320{2321VOP3_instruction* new_instr = create_instruction<VOP3_instruction>(opcode, Format::VOP3, 3, 1);2322memcpy(new_instr->abs, abs, sizeof(bool[3]));2323memcpy(new_instr->neg, neg, sizeof(bool[3]));2324new_instr->clamp = clamp;2325new_instr->omod = omod;2326new_instr->opsel = opsel;2327new_instr->operands[0] = operands[0];2328new_instr->operands[1] = operands[1];2329new_instr->operands[2] = operands[2];2330new_instr->definitions[0] = instr->definitions[0];2331ctx.info[instr->definitions[0].tempId()].label = 0;23322333instr.reset(new_instr);2334}23352336bool2337combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,2338const char* shuffle, uint8_t ops)2339{2340for (unsigned swap = 0; swap < 2; swap++) {2341if (!((1 << swap) & ops))2342continue;23432344Operand operands[3];2345bool neg[3], abs[3], clamp, precise;2346uint8_t opsel = 0, omod = 0;2347if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,2348abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {2349ctx.uses[instr->operands[swap].tempId()]--;2350create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);2351return 
true;2352}2353}2354return false;2355}23562357/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */2358bool2359combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)2360{2361bool is_or = instr->opcode == aco_opcode::v_or_b32;2362aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;23632364if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,2365"120", 1 | 2))2366return true;2367if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,2368"120", 1 | 2))2369return true;2370if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))2371return true;2372if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))2373return true;23742375if (instr->isSDWA())2376return false;23772378/* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)2379* v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)2380* v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)2381* v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)2382*/2383for (unsigned i = 0; i < 2; i++) {2384Instruction* extins = follow_operand(ctx, instr->operands[i]);2385if (!extins)2386continue;23872388aco_opcode op;2389Operand operands[3];23902391if (extins->opcode == aco_opcode::p_insert &&2392(extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {2393op = new_op_lshl;2394operands[1] =2395Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());2396} else if (is_or &&2397(extins->opcode == aco_opcode::p_insert ||2398(extins->opcode == aco_opcode::p_extract &&2399extins->operands[3].constantEquals(0))) &&2400extins->operands[1].constantEquals(0)) {2401op = aco_opcode::v_and_or_b32;2402operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 
0xffu : 0xffffu);2403} else {2404continue;2405}24062407operands[0] = extins->operands[0];2408operands[2] = instr->operands[!i];24092410if (!check_vop3_operands(ctx, 3, operands))2411continue;24122413bool neg[3] = {}, abs[3] = {};2414uint8_t opsel = 0, omod = 0;2415bool clamp = false;2416if (instr->isVOP3())2417clamp = instr->vop3().clamp;24182419ctx.uses[instr->operands[i].tempId()]--;2420create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);2421return true;2422}24232424return false;2425}24262427bool2428combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)2429{2430/* TODO: this can handle SDWA min/max instructions by using opsel */2431if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))2432return true;24332434/* min(-max(a, b), c) -> min3(c, -a, -b) *2435* max(-min(a, b), c) -> max3(c, -a, -b) */2436for (unsigned swap = 0; swap < 2; swap++) {2437Operand operands[3];2438bool neg[3], abs[3], clamp, precise;2439uint8_t opsel = 0, omod = 0;2440bool inbetween_neg;2441if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg,2442abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&2443inbetween_neg) {2444ctx.uses[instr->operands[swap].tempId()]--;2445neg[1] = !neg[1];2446neg[2] = !neg[2];2447create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);2448return true;2449}2450}2451return false;2452}24532454/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)2455* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)2456* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)2457* s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)2458* s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)2459* s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */2460bool2461combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)2462{2463/* checks */2464if (!instr->operands[0].isTemp())2465return false;2466if (instr->definitions[1].isTemp() && 
ctx.uses[instr->definitions[1].tempId()])2467return false;24682469Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);2470if (!op2_instr)2471return false;2472switch (op2_instr->opcode) {2473case aco_opcode::s_and_b32:2474case aco_opcode::s_or_b32:2475case aco_opcode::s_xor_b32:2476case aco_opcode::s_and_b64:2477case aco_opcode::s_or_b64:2478case aco_opcode::s_xor_b64: break;2479default: return false;2480}24812482/* create instruction */2483std::swap(instr->definitions[0], op2_instr->definitions[0]);2484std::swap(instr->definitions[1], op2_instr->definitions[1]);2485ctx.uses[instr->operands[0].tempId()]--;2486ctx.info[op2_instr->definitions[0].tempId()].label = 0;24872488switch (op2_instr->opcode) {2489case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;2490case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;2491case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;2492case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;2493case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;2494case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;2495default: break;2496}24972498return true;2499}25002501/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)2502* s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)2503* s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)2504* s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */2505bool2506combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)2507{2508if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())2509return false;25102511for (unsigned i = 0; i < 2; i++) {2512Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);2513if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&2514op2_instr->opcode != aco_opcode::s_not_b64))2515continue;2516if (ctx.uses[op2_instr->definitions[1].tempId()] || 
fixed_to_exec(op2_instr->operands[0]))2517continue;25182519if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&2520instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())2521continue;25222523ctx.uses[instr->operands[i].tempId()]--;2524instr->operands[0] = instr->operands[!i];2525instr->operands[1] = op2_instr->operands[0];2526ctx.info[instr->definitions[0].tempId()].label = 0;25272528switch (instr->opcode) {2529case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;2530case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;2531case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;2532case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;2533default: break;2534}25352536return true;2537}2538return false;2539}25402541/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */2542bool2543combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)2544{2545if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])2546return false;25472548for (unsigned i = 0; i < 2; i++) {2549Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);2550if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||2551ctx.uses[op2_instr->definitions[1].tempId()])2552continue;2553if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))2554continue;25552556uint32_t shift = op2_instr->operands[1].constantValue();2557if (shift < 1 || shift > 4)2558continue;25592560if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&2561instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())2562continue;25632564ctx.uses[instr->operands[i].tempId()]--;2565instr->operands[1] = instr->operands[!i];2566instr->operands[0] = op2_instr->operands[0];2567ctx.info[instr->definitions[0].tempId()].label = 0;25682569instr->opcode = std::array<aco_opcode, 
4>{2570aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,2571aco_opcode::s_lshl4_add_u32}[shift - 1];25722573return true;2574}2575return false;2576}25772578bool2579combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)2580{2581if (instr->usesModifiers())2582return false;25832584for (unsigned i = 0; i < 2; i++) {2585if (!((1 << i) & ops))2586continue;2587if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&2588ctx.uses[instr->operands[i].tempId()] == 1) {25892590aco_ptr<Instruction> new_instr;2591if (instr->operands[!i].isTemp() &&2592instr->operands[!i].getTemp().type() == RegType::vgpr) {2593new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));2594} else if (ctx.program->chip_class >= GFX10 ||2595(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {2596new_instr.reset(2597create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));2598} else {2599return false;2600}2601ctx.uses[instr->operands[i].tempId()]--;2602new_instr->definitions[0] = instr->definitions[0];2603if (instr->definitions.size() == 2) {2604new_instr->definitions[1] = instr->definitions[1];2605} else {2606new_instr->definitions[1] =2607Definition(ctx.program->allocateTmp(ctx.program->lane_mask));2608/* Make sure the uses vector is large enough and the number of2609* uses properly initialized to 0.2610*/2611ctx.uses.push_back(0);2612}2613new_instr->definitions[1].setHint(vcc);2614new_instr->operands[0] = Operand::zero();2615new_instr->operands[1] = instr->operands[!i];2616new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);2617instr = std::move(new_instr);2618ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());2619return true;2620}2621}26222623return false;2624}26252626bool2627combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)2628{2629if (instr->usesModifiers())2630return 
false;26312632for (unsigned i = 0; i < 2; i++) {2633Instruction* op_instr = follow_operand(ctx, instr->operands[i]);2634if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&2635op_instr->operands[0].isTemp() &&2636op_instr->operands[0].getTemp().type() == RegType::vgpr &&2637op_instr->operands[1].constantEquals(0)) {2638aco_ptr<Instruction> new_instr{2639create_instruction<VOP3_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};2640ctx.uses[instr->operands[i].tempId()]--;2641new_instr->operands[0] = op_instr->operands[0];2642new_instr->operands[1] = instr->operands[!i];2643new_instr->definitions[0] = instr->definitions[0];2644instr = std::move(new_instr);2645ctx.info[instr->definitions[0].tempId()].label = 0;26462647return true;2648}2649}26502651return false;2652}26532654bool2655get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,2656aco_opcode* med3, bool* some_gfx9_only)2657{2658switch (op) {2659#define MINMAX(type, gfx9) \2660case aco_opcode::v_min_##type: \2661case aco_opcode::v_max_##type: \2662case aco_opcode::v_med3_##type: \2663*min = aco_opcode::v_min_##type; \2664*max = aco_opcode::v_max_##type; \2665*med3 = aco_opcode::v_med3_##type; \2666*min3 = aco_opcode::v_min3_##type; \2667*max3 = aco_opcode::v_max3_##type; \2668*some_gfx9_only = gfx9; \2669return true;2670MINMAX(f32, false)2671MINMAX(u32, false)2672MINMAX(i32, false)2673MINMAX(f16, true)2674MINMAX(u16, true)2675MINMAX(i16, true)2676#undef MINMAX2677default: return false;2678}2679}26802681/* when ub > lb:2682* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)2683* v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)2684*/2685bool2686combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,2687aco_opcode med)2688{2689/* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's2690* FClamp(x, minVal, maxVal)/NClamp(x, minVal, 
maxVal) are undefined if2691* minVal > maxVal, which means we can always select it to a v_med3_f32 */2692aco_opcode other_op;2693if (instr->opcode == min)2694other_op = max;2695else if (instr->opcode == max)2696other_op = min;2697else2698return false;26992700for (unsigned swap = 0; swap < 2; swap++) {2701Operand operands[3];2702bool neg[3], abs[3], clamp, precise;2703uint8_t opsel = 0, omod = 0;2704if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,2705abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {2706/* max(min(src, upper), lower) returns upper if src is NaN, but2707* med3(src, lower, upper) returns lower.2708*/2709if (precise && instr->opcode != min)2710continue;27112712int const0_idx = -1, const1_idx = -1;2713uint32_t const0 = 0, const1 = 0;2714for (int i = 0; i < 3; i++) {2715uint32_t val;2716if (operands[i].isConstant()) {2717val = operands[i].constantValue();2718} else if (operands[i].isTemp() &&2719ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {2720val = ctx.info[operands[i].tempId()].val;2721} else {2722continue;2723}2724if (const0_idx >= 0) {2725const1_idx = i;2726const1 = val;2727} else {2728const0_idx = i;2729const0 = val;2730}2731}2732if (const0_idx < 0 || const1_idx < 0)2733continue;27342735if (opsel & (1 << const0_idx))2736const0 >>= 16;2737if (opsel & (1 << const1_idx))2738const1 >>= 16;27392740int lower_idx = const0_idx;2741switch (min) {2742case aco_opcode::v_min_f32:2743case aco_opcode::v_min_f16: {2744float const0_f, const1_f;2745if (min == aco_opcode::v_min_f32) {2746memcpy(&const0_f, &const0, 4);2747memcpy(&const1_f, &const1, 4);2748} else {2749const0_f = _mesa_half_to_float(const0);2750const1_f = _mesa_half_to_float(const1);2751}2752if (abs[const0_idx])2753const0_f = fabsf(const0_f);2754if (abs[const1_idx])2755const1_f = fabsf(const1_f);2756if (neg[const0_idx])2757const0_f = -const0_f;2758if (neg[const1_idx])2759const1_f = -const1_f;2760lower_idx = const0_f < const1_f ? 
const0_idx : const1_idx;2761break;2762}2763case aco_opcode::v_min_u32: {2764lower_idx = const0 < const1 ? const0_idx : const1_idx;2765break;2766}2767case aco_opcode::v_min_u16: {2768lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;2769break;2770}2771case aco_opcode::v_min_i32: {2772int32_t const0_i =2773const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;2774int32_t const1_i =2775const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;2776lower_idx = const0_i < const1_i ? const0_idx : const1_idx;2777break;2778}2779case aco_opcode::v_min_i16: {2780int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;2781int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;2782lower_idx = const0_i < const1_i ? const0_idx : const1_idx;2783break;2784}2785default: break;2786}2787int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;27882789if (instr->opcode == min) {2790if (upper_idx != 0 || lower_idx == 0)2791return false;2792} else {2793if (upper_idx == 0 || lower_idx != 0)2794return false;2795}27962797ctx.uses[instr->operands[swap].tempId()]--;2798create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);27992800return true;2801}2802}28032804return false;2805}28062807void2808apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)2809{2810bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||2811instr->opcode == aco_opcode::v_lshrrev_b64 ||2812instr->opcode == aco_opcode::v_ashrrev_i64;28132814/* find candidates and create the set of sgprs already read */2815unsigned sgpr_ids[2] = {0, 0};2816uint32_t operand_mask = 0;2817bool has_literal = false;2818for (unsigned i = 0; i < instr->operands.size(); i++) {2819if (instr->operands[i].isLiteral())2820has_literal = true;2821if (!instr->operands[i].isTemp())2822continue;2823if (instr->operands[i].getTemp().type() == RegType::sgpr) {2824if (instr->operands[i].tempId() != 
sgpr_ids[0])2825sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();2826}2827ssa_info& info = ctx.info[instr->operands[i].tempId()];2828if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)2829operand_mask |= 1u << i;2830if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)2831operand_mask |= 1u << i;2832}2833unsigned max_sgprs = 1;2834if (ctx.program->chip_class >= GFX10 && !is_shift64)2835max_sgprs = 2;2836if (has_literal)2837max_sgprs--;28382839unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];28402841/* keep on applying sgprs until there is nothing left to be done */2842while (operand_mask) {2843uint32_t sgpr_idx = 0;2844uint32_t sgpr_info_id = 0;2845uint32_t mask = operand_mask;2846/* choose a sgpr */2847while (mask) {2848unsigned i = u_bit_scan(&mask);2849uint16_t uses = ctx.uses[instr->operands[i].tempId()];2850if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {2851sgpr_idx = i;2852sgpr_info_id = instr->operands[i].tempId();2853}2854}2855operand_mask &= ~(1u << sgpr_idx);28562857ssa_info& info = ctx.info[sgpr_info_id];28582859/* Applying two sgprs require making it VOP3, so don't do it unless it's2860* definitively beneficial.2861* TODO: this is too conservative because later the use count could be reduced to 1 */2862if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&2863!instr->isSDWA() && instr->format != Format::VOP3P)2864break;28652866Temp sgpr = info.is_extract() ? 
info.instr->operands[0].getTemp() : info.temp;2867bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];2868if (new_sgpr && num_sgprs >= max_sgprs)2869continue;28702871if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||2872info.is_extract()) {2873/* can_apply_extract() checks SGPR encoding restrictions */2874if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))2875apply_extract(ctx, instr, sgpr_idx, info);2876else if (info.is_extract())2877continue;2878instr->operands[sgpr_idx] = Operand(sgpr);2879} else if (can_swap_operands(instr)) {2880instr->operands[sgpr_idx] = instr->operands[0];2881instr->operands[0] = Operand(sgpr);2882/* swap bits using a 4-entry LUT */2883uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;2884operand_mask = (operand_mask & ~0x3) | swapped;2885} else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {2886to_VOP3(ctx, instr);2887instr->operands[sgpr_idx] = Operand(sgpr);2888} else {2889continue;2890}28912892if (new_sgpr)2893sgpr_ids[num_sgprs++] = sgpr.id();2894ctx.uses[sgpr_info_id]--;2895ctx.uses[sgpr.id()]++;28962897/* TODO: handle when it's a VGPR */2898if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&2899ctx.info[sgpr.id()].temp.type() == RegType::sgpr)2900operand_mask |= 1u << sgpr_idx;2901}2902}29032904template <typename T>2905bool2906apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info)2907{2908if (!def_info.is_clamp() && (instr->clamp || instr->omod))2909return false;29102911if (def_info.is_omod2())2912instr->omod = 1;2913else if (def_info.is_omod4())2914instr->omod = 2;2915else if (def_info.is_omod5())2916instr->omod = 3;2917else if (def_info.is_clamp())2918instr->clamp = true;29192920return true;2921}29222923/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */2924bool2925apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)2926{2927if (instr->definitions.empty() || 
ctx.uses[instr->definitions[0].tempId()] != 1 ||2928!instr_info.can_use_output_modifiers[(int)instr->opcode])2929return false;29302931bool can_vop3 = can_use_VOP3(ctx, instr);2932if (!instr->isSDWA() && !can_vop3)2933return false;29342935/* omod flushes -0 to +0 and has no effect if denormals are enabled */2936bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */2937if (instr->definitions[0].bytes() == 4)2938can_use_omod =2939can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;2940else2941can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&2942!ctx.fp_mode.preserve_signed_zero_inf_nan16_64;29432944ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];29452946uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;2947if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))2948return false;2949/* if the omod/clamp instruction is dead, then the single user of this2950* instruction is a different instruction */2951if (!ctx.uses[def_info.instr->definitions[0].tempId()])2952return false;29532954/* MADs/FMAs are created later, so we don't have to update the original add */2955assert(!ctx.info[instr->definitions[0].tempId()].is_mad());29562957if (instr->isSDWA()) {2958if (!apply_omod_clamp_helper(ctx, &instr->sdwa(), def_info))2959return false;2960} else {2961to_VOP3(ctx, instr);2962if (!apply_omod_clamp_helper(ctx, &instr->vop3(), def_info))2963return false;2964}29652966instr->definitions[0].swapTemp(def_info.instr->definitions[0]);2967ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert;2968ctx.uses[def_info.instr->definitions[0].tempId()]--;29692970return true;2971}29722973/* Combine an p_insert (or p_extract, in some cases) instruction with instr.2974* p_insert(instr(...)) -> instr_insert().2975*/2976bool2977apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)2978{2979if (instr->definitions.empty() || 
ctx.uses[instr->definitions[0].tempId()] != 1)2980return false;29812982ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];2983if (!def_info.is_insert())2984return false;2985/* if the insert instruction is dead, then the single user of this2986* instruction is a different instruction */2987if (!ctx.uses[def_info.instr->definitions[0].tempId()])2988return false;29892990/* MADs/FMAs are created later, so we don't have to update the original add */2991assert(!ctx.info[instr->definitions[0].tempId()].is_mad());29922993unsigned sel = parse_insert(def_info.instr);29942995if (instr->isVOP3() && (sel & sdwa_isword) && !(sel & sdwa_sext) &&2996can_use_opsel(ctx.program->chip_class, instr->opcode, 3, (sel & sdwa_wordnum))) {2997if (instr->vop3().opsel & (1 << 3))2998return false;2999if (sel & sdwa_wordnum)3000instr->vop3().opsel |= 1 << 3;3001} else {3002if (!can_use_SDWA(ctx.program->chip_class, instr, true))3003return false;30043005to_SDWA(ctx, instr);3006if ((static_cast<SDWA_instruction*>(instr.get())->dst_sel & sdwa_asuint) != sdwa_udword)3007return false;3008static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;3009}30103011instr->definitions[0].swapTemp(def_info.instr->definitions[0]);3012ctx.info[instr->definitions[0].tempId()].label = 0;3013ctx.uses[def_info.instr->definitions[0].tempId()]--;30143015return true;3016}30173018/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */3019bool3020combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)3021{3022if (instr->usesModifiers())3023return false;30243025for (unsigned i = 0; i < 2; i++) {3026Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);3027if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&3028op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&3029!op_instr->usesModifiers()) {30303031aco_ptr<Instruction> new_instr;3032if (instr->operands[!i].isTemp() &&3033instr->operands[!i].getTemp().type() == RegType::vgpr) 
{3034new_instr.reset(3035create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));3036} else if (ctx.program->chip_class >= GFX10 ||3037(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {3038new_instr.reset(create_instruction<VOP3_instruction>(aco_opcode::v_cndmask_b32,3039asVOP3(Format::VOP2), 3, 1));3040} else {3041return false;3042}30433044ctx.uses[instr->operands[i].tempId()]--;3045if (ctx.uses[instr->operands[i].tempId()])3046ctx.uses[op_instr->operands[2].tempId()]++;30473048new_instr->operands[0] = Operand::zero();3049new_instr->operands[1] = instr->operands[!i];3050new_instr->operands[2] = Operand(op_instr->operands[2]);3051new_instr->definitions[0] = instr->definitions[0];3052instr = std::move(new_instr);3053ctx.info[instr->definitions[0].tempId()].label = 0;3054return true;3055}3056}30573058return false;3059}30603061/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)3062* v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c) */3063bool3064combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)3065{3066if (instr->usesModifiers())3067return false;30683069for (unsigned i = 0; i < 2; i++) {3070Instruction* op_instr = follow_operand(ctx, instr->operands[i]);3071if (!op_instr)3072continue;30733074if (op_instr->opcode != aco_opcode::s_lshl_b32 &&3075op_instr->opcode != aco_opcode::v_lshlrev_b32)3076continue;30773078if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && op_instr->operands[1].isTemp() &&3079op_instr->operands[1].getTemp().type() == RegType::sgpr && instr->operands[!i].isTemp() &&3080instr->operands[!i].getTemp().type() == RegType::sgpr)3081return false;30823083int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 
1 : 0;3084if (op_instr->operands[shift_op_idx].isConstant() &&3085op_instr->operands[shift_op_idx].constantValue() <= 6 && /* no literals */3086(op_instr->operands[!shift_op_idx].is24bit() ||3087op_instr->operands[!shift_op_idx].is16bit())) {3088uint32_t multiplier = 1 << op_instr->operands[shift_op_idx].constantValue();30893090ctx.uses[instr->operands[i].tempId()]--;30913092aco_ptr<VOP3_instruction> new_instr{3093create_instruction<VOP3_instruction>(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)};3094new_instr->operands[0] = op_instr->operands[!shift_op_idx];3095new_instr->operands[1] = Operand::c32(multiplier);3096new_instr->operands[2] = instr->operands[!i];3097new_instr->definitions[0] = instr->definitions[0];3098instr = std::move(new_instr);3099ctx.info[instr->definitions[0].tempId()].label = 0;3100return true;3101}3102}31033104return false;3105}31063107void3108propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi)3109{3110/* propagate swizzles which apply to a result down to the instruction's operands:3111* result = a.xy + b.xx -> result.yx = a.yx + b.xx */3112assert((opsel_lo & 1) == opsel_lo);3113assert((opsel_hi & 1) == opsel_hi);3114uint8_t tmp_lo = instr->opsel_lo;3115uint8_t tmp_hi = instr->opsel_hi;3116bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]};3117bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]};3118if (opsel_lo == 1) {3119instr->opsel_lo = tmp_hi;3120for (unsigned i = 0; i < 3; i++)3121instr->neg_lo[i] = neg_hi[i];3122}3123if (opsel_hi == 0) {3124instr->opsel_hi = tmp_lo;3125for (unsigned i = 0; i < 3; i++)3126instr->neg_hi[i] = neg_lo[i];3127}3128}31293130void3131combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)3132{3133VOP3P_instruction* vop3p = &instr->vop3p();31343135/* apply clamp */3136if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&3137vop3p->clamp && instr->operands[0].isTemp() && 
ctx.uses[instr->operands[0].tempId()] == 1) {31383139ssa_info& info = ctx.info[instr->operands[0].tempId()];3140if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {3141VOP3P_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->vop3p();3142candidate->clamp = true;3143propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi);3144instr->definitions[0].swapTemp(candidate->definitions[0]);3145ctx.info[candidate->definitions[0].tempId()].instr = candidate;3146ctx.uses[instr->definitions[0].tempId()]--;3147return;3148}3149}31503151/* check for fneg modifiers */3152if (instr_info.can_use_input_modifiers[(int)instr->opcode]) {3153/* at this point, we only have 2-operand instructions */3154assert(instr->operands.size() == 2);3155for (unsigned i = 0; i < 2; i++) {3156Operand& op = instr->operands[i];3157if (!op.isTemp())3158continue;31593160ssa_info& info = ctx.info[op.tempId()];3161if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&3162info.instr->operands[1].constantEquals(0xBC00)) {3163Operand ops[2] = {instr->operands[!i], info.instr->operands[0]};3164if (!check_vop3_operands(ctx, 2, ops))3165continue;31663167VOP3P_instruction* fneg = &info.instr->vop3p();3168if (fneg->clamp)3169continue;3170instr->operands[i] = fneg->operands[0];31713172/* opsel_lo/hi is either 0 or 1:3173* if 0 - pick selection from fneg->lo3174* if 1 - pick selection from fneg->hi3175*/3176bool opsel_lo = vop3p->opsel_lo & (1 << i);3177bool opsel_hi = vop3p->opsel_hi & (1 << i);3178vop3p->neg_lo[i] ^= true ^ (opsel_lo ? fneg->neg_hi[0] : fneg->neg_lo[0]);3179vop3p->neg_hi[i] ^= true ^ (opsel_hi ? fneg->neg_hi[0] : fneg->neg_lo[0]);3180vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;3181vop3p->opsel_hi ^= ((opsel_hi ? 
~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;31823183if (--ctx.uses[fneg->definitions[0].tempId()])3184ctx.uses[fneg->operands[0].tempId()]++;3185}3186}3187}31883189if (instr->opcode == aco_opcode::v_pk_add_f16) {3190if (instr->definitions[0].isPrecise())3191return;31923193Instruction* mul_instr = nullptr;3194unsigned add_op_idx = 0;3195uint8_t opsel_lo = 0, opsel_hi = 0;3196uint32_t uses = UINT32_MAX;31973198/* find the 'best' mul instruction to combine with the add */3199for (unsigned i = 0; i < 2; i++) {3200if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p())3201continue;3202ssa_info& info = ctx.info[instr->operands[i].tempId()];3203if (info.instr->opcode != aco_opcode::v_pk_mul_f16 ||3204info.instr->definitions[0].isPrecise())3205continue;32063207Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};3208if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))3209continue;32103211/* no clamp allowed between mul and add */3212if (info.instr->vop3p().clamp)3213continue;32143215mul_instr = info.instr;3216add_op_idx = 1 - i;3217opsel_lo = (vop3p->opsel_lo >> i) & 1;3218opsel_hi = (vop3p->opsel_hi >> i) & 1;3219uses = ctx.uses[instr->operands[i].tempId()];3220}32213222if (!mul_instr)3223return;32243225/* convert to mad */3226Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};3227ctx.uses[mul_instr->definitions[0].tempId()]--;3228if (ctx.uses[mul_instr->definitions[0].tempId()]) {3229if (op[0].isTemp())3230ctx.uses[op[0].tempId()]++;3231if (op[1].isTemp())3232ctx.uses[op[1].tempId()]++;3233}32343235/* turn packed mul+add into v_pk_fma_f16 */3236assert(mul_instr->isVOP3P());3237aco_ptr<VOP3P_instruction> fma{3238create_instruction<VOP3P_instruction>(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)};3239VOP3P_instruction* mul = &mul_instr->vop3p();3240for (unsigned i = 0; i < 2; i++) {3241fma->operands[i] = op[i];3242fma->neg_lo[i] 
= mul->neg_lo[i];3243fma->neg_hi[i] = mul->neg_hi[i];3244}3245fma->operands[2] = op[2];3246fma->clamp = vop3p->clamp;3247fma->opsel_lo = mul->opsel_lo;3248fma->opsel_hi = mul->opsel_hi;3249propagate_swizzles(fma.get(), opsel_lo, opsel_hi);3250fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4;3251fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4;3252fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];3253fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];3254fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];3255fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];3256fma->definitions[0] = instr->definitions[0];3257instr.reset(fma.release());3258ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());3259return;3260}3261}32623263// TODO: we could possibly move the whole label_instruction pass to combine_instruction:3264// this would mean that we'd have to fix the instruction uses while value propagation32653266void3267combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)3268{3269if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))3270return;32713272if (instr->isVALU()) {3273/* Apply SDWA. Do this after label_instruction() so it can remove3274* label_extract if not all instructions can take SDWA. 
*/3275for (unsigned i = 0; i < instr->operands.size(); i++) {3276Operand& op = instr->operands[i];3277if (!op.isTemp())3278continue;3279ssa_info& info = ctx.info[op.tempId()];3280if (info.is_extract() &&3281(info.instr->operands[0].getTemp().type() == RegType::vgpr ||3282instr->operands[i].getTemp().type() == RegType::sgpr) &&3283can_apply_extract(ctx, instr, i, info)) {3284apply_extract(ctx, instr, i, info);3285ctx.uses[instr->operands[i].tempId()]--;3286instr->operands[i].setTemp(info.instr->operands[0].getTemp());3287}3288}32893290if (can_apply_sgprs(ctx, instr))3291apply_sgprs(ctx, instr);3292while (apply_omod_clamp(ctx, instr))3293;3294apply_insert(ctx, instr);3295}32963297if (instr->isVOP3P())3298return combine_vop3p(ctx, instr);32993300if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {3301instr->definitions[0].setHint(vcc);3302}33033304if (instr->isSDWA())3305return;33063307/* TODO: There are still some peephole optimizations that could be done:3308* - abs(a - b) -> s_absdiff_i323309* - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b323310* - patterns for v_alignbit_b32 and v_alignbyte_b323311* These aren't probably too interesting though.3312* There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but3313* probably more useful than the previously mentioned optimizations.3314* The various comparison optimizations also currently only work with 32-bit3315* floats. 
*/33163317/* neg(mul(a, b)) -> mul(neg(a), b) */3318if (ctx.info[instr->definitions[0].tempId()].is_neg() &&3319ctx.uses[instr->operands[1].tempId()] == 1) {3320Temp val = ctx.info[instr->definitions[0].tempId()].temp;33213322if (!ctx.info[val.id()].is_mul())3323return;33243325Instruction* mul_instr = ctx.info[val.id()].instr;33263327if (mul_instr->operands[0].isLiteral())3328return;3329if (mul_instr->isVOP3() && mul_instr->vop3().clamp)3330return;3331if (mul_instr->isSDWA())3332return;33333334/* convert to mul(neg(a), b) */3335ctx.uses[mul_instr->definitions[0].tempId()]--;3336Definition def = instr->definitions[0];3337/* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */3338bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();3339instr.reset(3340create_instruction<VOP3_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));3341instr->operands[0] = mul_instr->operands[0];3342instr->operands[1] = mul_instr->operands[1];3343instr->definitions[0] = def;3344VOP3_instruction& new_mul = instr->vop3();3345if (mul_instr->isVOP3()) {3346VOP3_instruction& mul = mul_instr->vop3();3347new_mul.neg[0] = mul.neg[0] && !is_abs;3348new_mul.neg[1] = mul.neg[1] && !is_abs;3349new_mul.abs[0] = mul.abs[0] || is_abs;3350new_mul.abs[1] = mul.abs[1] || is_abs;3351new_mul.omod = mul.omod;3352}3353new_mul.neg[0] ^= true;3354new_mul.clamp = false;33553356ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());3357return;3358}33593360/* combine mul+add -> mad */3361bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||3362instr->opcode == aco_opcode::v_subrev_f32;3363bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||3364instr->opcode == aco_opcode::v_subrev_f16;3365if (mad16 || mad32) {3366bool need_fma = mad32 ? 
(ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3)3367: (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);3368if (need_fma && instr->definitions[0].isPrecise())3369return;3370if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32)3371return;33723373Instruction* mul_instr = nullptr;3374unsigned add_op_idx = 0;3375uint32_t uses = UINT32_MAX;3376/* find the 'best' mul instruction to combine with the add */3377for (unsigned i = 0; i < 2; i++) {3378if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())3379continue;3380/* check precision requirements */3381ssa_info& info = ctx.info[instr->operands[i].tempId()];3382if (need_fma && info.instr->definitions[0].isPrecise())3383continue;33843385/* no clamp/omod allowed between mul and add */3386if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod))3387continue;33883389Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};3390if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) ||3391ctx.uses[instr->operands[i].tempId()] >= uses)3392continue;33933394mul_instr = info.instr;3395add_op_idx = 1 - i;3396uses = ctx.uses[instr->operands[i].tempId()];3397}33983399if (mul_instr) {3400/* turn mul+add into v_mad/v_fma */3401Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],3402instr->operands[add_op_idx]};3403ctx.uses[mul_instr->definitions[0].tempId()]--;3404if (ctx.uses[mul_instr->definitions[0].tempId()]) {3405if (op[0].isTemp())3406ctx.uses[op[0].tempId()]++;3407if (op[1].isTemp())3408ctx.uses[op[1].tempId()]++;3409}34103411bool neg[3] = {false, false, false};3412bool abs[3] = {false, false, false};3413unsigned omod = 0;3414bool clamp = false;34153416if (mul_instr->isVOP3()) {3417VOP3_instruction& vop3 = mul_instr->vop3();3418neg[0] = vop3.neg[0];3419neg[1] = vop3.neg[1];3420abs[0] = vop3.abs[0];3421abs[1] = vop3.abs[1];3422}34233424if (instr->isVOP3()) {3425VOP3_instruction& vop3 = 
instr->vop3();3426neg[2] = vop3.neg[add_op_idx];3427abs[2] = vop3.abs[add_op_idx];3428omod = vop3.omod;3429clamp = vop3.clamp;3430/* abs of the multiplication result */3431if (vop3.abs[1 - add_op_idx]) {3432neg[0] = false;3433neg[1] = false;3434abs[0] = true;3435abs[1] = true;3436}3437/* neg of the multiplication result */3438neg[1] = neg[1] ^ vop3.neg[1 - add_op_idx];3439}3440if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)3441neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;3442else if (instr->opcode == aco_opcode::v_subrev_f32 ||3443instr->opcode == aco_opcode::v_subrev_f16)3444neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;34453446aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;3447if (mad16)3448mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f163449: aco_opcode::v_fma_f16)3450: (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f163451: aco_opcode::v_mad_f16);34523453aco_ptr<VOP3_instruction> mad{3454create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};3455for (unsigned i = 0; i < 3; i++) {3456mad->operands[i] = op[i];3457mad->neg[i] = neg[i];3458mad->abs[i] = abs[i];3459}3460mad->omod = omod;3461mad->clamp = clamp;3462mad->definitions[0] = instr->definitions[0];34633464/* mark this ssa_def to be re-checked for profitability and literals */3465ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());3466ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);3467instr.reset(mad.release());3468return;3469}3470}3471/* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */3472else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {3473for (unsigned i = 0; i < 2; i++) {3474if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&3475ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() 
&&3476instr->operands[!i].getTemp().type() == RegType::vgpr) {3477ctx.uses[instr->operands[i].tempId()]--;3478ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;34793480aco_ptr<VOP2_instruction> new_instr{3481create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};3482new_instr->operands[0] = Operand::zero();3483new_instr->operands[1] = instr->operands[!i];3484new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);3485new_instr->definitions[0] = instr->definitions[0];3486instr.reset(new_instr.release());3487ctx.info[instr->definitions[0].tempId()].label = 0;3488return;3489}3490}3491} else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {3492if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",34931 | 2)) {3494} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,3495"012", 1 | 2)) {3496} else if (combine_add_or_then_and_lshl(ctx, instr)) {3497}3498} else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {3499if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",35001 | 2)) {3501} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,3502"012", 1 | 2)) {3503}3504} else if (instr->opcode == aco_opcode::v_add_u32) {3505if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {3506} else if (combine_add_bcnt(ctx, instr)) {3507} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,3508aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {3509} else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) {3510if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",35111 | 2)) {3512} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,3513"120", 1 | 2)) {3514} else if (combine_three_valu_op(ctx, instr, 
aco_opcode::s_add_i32, aco_opcode::v_add3_u32,3515"012", 1 | 2)) {3516} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,3517"012", 1 | 2)) {3518} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,3519"012", 1 | 2)) {3520} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16,3521aco_opcode::v_mad_u32_u16, "120", 1 | 2)) {3522} else if (combine_add_or_then_and_lshl(ctx, instr)) {3523}3524}3525} else if (instr->opcode == aco_opcode::v_add_co_u32 ||3526instr->opcode == aco_opcode::v_add_co_u32_e64) {3527bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;3528if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {3529} else if (!carry_out && combine_add_bcnt(ctx, instr)) {3530} else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,3531aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {3532} else if (!carry_out && combine_add_lshl(ctx, instr)) {3533}3534} else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||3535instr->opcode == aco_opcode::v_sub_co_u32_e64) {3536combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);3537} else if (instr->opcode == aco_opcode::v_subrev_u32 ||3538instr->opcode == aco_opcode::v_subrev_co_u32 ||3539instr->opcode == aco_opcode::v_subrev_co_u32_e64) {3540combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);3541} else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {3542combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",35432);3544} else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&3545ctx.program->chip_class >= GFX9) {3546combine_salu_lshl_add(ctx, instr);3547} else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {3548combine_salu_not_bitwise(ctx, instr);3549} else if (instr->opcode == 
aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||3550instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {3551if (combine_ordering_test(ctx, instr)) {3552} else if (combine_comparison_ordering(ctx, instr)) {3553} else if (combine_constant_comparison_ordering(ctx, instr)) {3554} else if (combine_salu_n2(ctx, instr)) {3555}3556} else if (instr->opcode == aco_opcode::v_and_b32) {3557combine_and_subbrev(ctx, instr);3558} else {3559aco_opcode min, max, min3, max3, med3;3560bool some_gfx9_only;3561if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&3562(!some_gfx9_only || ctx.program->chip_class >= GFX9)) {3563if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,3564instr->opcode == min ? min3 : max3)) {3565} else {3566combine_clamp(ctx, instr, min, max, med3);3567}3568}3569}35703571/* do this after combine_salu_n2() */3572if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)3573combine_inverse_comparison(ctx, instr);3574}35753576bool3577to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)3578{3579switch (instr->opcode) {3580case aco_opcode::s_and_b32:3581case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;3582case aco_opcode::s_or_b32:3583case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;3584case aco_opcode::s_xor_b32:3585case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;3586default:3587/* Don't transform other instructions. They are very unlikely to appear here. */3588return false;3589}35903591for (Operand& op : instr->operands) {3592ctx.uses[op.tempId()]--;35933594if (ctx.info[op.tempId()].is_uniform_bool()) {3595/* Just use the uniform boolean temp. 
*/3596op.setTemp(ctx.info[op.tempId()].temp);3597} else if (ctx.info[op.tempId()].is_uniform_bitwise()) {3598/* Use the SCC definition of the predecessor instruction.3599* This allows the predecessor to get picked up by the same optimization (if it has no3600* divergent users), and it also makes sure that the current instruction will keep working3601* even if the predecessor won't be transformed.3602*/3603Instruction* pred_instr = ctx.info[op.tempId()].instr;3604assert(pred_instr->definitions.size() >= 2);3605assert(pred_instr->definitions[1].isFixed() &&3606pred_instr->definitions[1].physReg() == scc);3607op.setTemp(pred_instr->definitions[1].getTemp());3608} else {3609unreachable("Invalid operand on uniform bitwise instruction.");3610}36113612ctx.uses[op.tempId()]++;3613}36143615instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));3616assert(instr->operands[0].regClass() == s1);3617assert(instr->operands[1].regClass() == s1);3618return true;3619}36203621void3622select_mul_u32_u24(opt_ctx& ctx, aco_ptr<Instruction>& instr)3623{3624if (instr->usesModifiers())3625return;36263627/* Only valid if the accumulator is zero (this is selected by isel to3628* combine more v_add_u32+v_mad_u32_u16 together), but the optimizer3629* fallbacks here when not possible.3630*/3631if (!instr->operands[2].constantEquals(0))3632return;36333634/* Only valid if the upper 16-bits of both operands are zero (because3635* v_mul_u32_u24 doesn't mask them).3636*/3637for (unsigned i = 0; i < 2; i++) {3638if (instr->operands[i].isTemp() && !instr->operands[i].is16bit())3639return;3640}36413642bool swap = false;36433644/* VOP2 instructions can only take constants/sgprs in operand 0. 
*/3645if ((instr->operands[1].isConstant() ||3646(instr->operands[1].hasRegClass() &&3647instr->operands[1].regClass().type() == RegType::sgpr))) {3648swap = true;3649if ((instr->operands[0].isConstant() ||3650(instr->operands[0].hasRegClass() &&3651instr->operands[0].regClass().type() == RegType::sgpr))) {3652/* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because3653* v_mul_u32_u24 has no advantages.3654*/3655return;3656}3657}36583659VOP2_instruction* new_instr =3660create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);3661new_instr->operands[0] = instr->operands[swap];3662new_instr->operands[1] = instr->operands[!swap];3663new_instr->definitions[0] = instr->definitions[0];3664instr.reset(new_instr);3665}36663667void3668select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)3669{3670const uint32_t threshold = 4;36713672if (is_dead(ctx.uses, instr.get())) {3673instr.reset();3674return;3675}36763677/* convert split_vector into a copy or extract_vector if only one definition is ever used */3678if (instr->opcode == aco_opcode::p_split_vector) {3679unsigned num_used = 0;3680unsigned idx = 0;3681unsigned split_offset = 0;3682for (unsigned i = 0, offset = 0; i < instr->definitions.size();3683offset += instr->definitions[i++].bytes()) {3684if (ctx.uses[instr->definitions[i].tempId()]) {3685num_used++;3686idx = i;3687split_offset = offset;3688}3689}3690bool done = false;3691if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&3692ctx.uses[instr->operands[0].tempId()] == 1) {3693Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;36943695unsigned off = 0;3696Operand op;3697for (Operand& vec_op : vec->operands) {3698if (off == split_offset) {3699op = vec_op;3700break;3701}3702off += vec_op.bytes();3703}3704if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {3705ctx.uses[instr->operands[0].tempId()]--;3706for (Operand& vec_op : vec->operands) {3707if 
(vec_op.isTemp())3708ctx.uses[vec_op.tempId()]--;3709}3710if (op.isTemp())3711ctx.uses[op.tempId()]++;37123713aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(3714aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};3715extract->operands[0] = op;3716extract->definitions[0] = instr->definitions[idx];3717instr.reset(extract.release());37183719done = true;3720}3721}37223723if (!done && num_used == 1 &&3724instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&3725split_offset % instr->definitions[idx].bytes() == 0) {3726aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(3727aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};3728extract->operands[0] = instr->operands[0];3729extract->operands[1] =3730Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());3731extract->definitions[0] = instr->definitions[idx];3732instr.reset(extract.release());3733}3734}37353736mad_info* mad_info = NULL;3737if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {3738mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];3739/* re-check mad instructions */3740if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {3741ctx.uses[mad_info->mul_temp_id]++;3742if (instr->operands[0].isTemp())3743ctx.uses[instr->operands[0].tempId()]--;3744if (instr->operands[1].isTemp())3745ctx.uses[instr->operands[1].tempId()]--;3746instr.swap(mad_info->add_instr);3747mad_info = NULL;3748}3749/* check literals */3750else if (!instr->usesModifiers()) {3751/* FMA can only take literals on GFX10+ */3752if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&3753ctx.program->chip_class < GFX10)3754return;3755/* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take3756* literals (GFX10+), these instructions don't exist.3757*/3758if (instr->opcode == aco_opcode::v_fma_legacy_f16)3759return;37603761bool 
sgpr_used = false;3762uint32_t literal_idx = 0;3763uint32_t literal_uses = UINT32_MAX;3764for (unsigned i = 0; i < instr->operands.size(); i++) {3765if (instr->operands[i].isConstant() && i > 0) {3766literal_uses = UINT32_MAX;3767break;3768}3769if (!instr->operands[i].isTemp())3770continue;3771unsigned bits = get_operand_size(instr, i);3772/* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX103773* or operands other than the 1st */3774if (instr->operands[i].getTemp().type() == RegType::sgpr &&3775(i > 0 || ctx.program->chip_class < GFX10)) {3776if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) {3777literal_uses = ctx.uses[instr->operands[i].tempId()];3778literal_idx = i;3779} else {3780literal_uses = UINT32_MAX;3781}3782sgpr_used = true;3783/* don't break because we still need to check constants */3784} else if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits) &&3785ctx.uses[instr->operands[i].tempId()] < literal_uses) {3786literal_uses = ctx.uses[instr->operands[i].tempId()];3787literal_idx = i;3788}3789}37903791/* Limit the number of literals to apply to not increase the code3792* size too much, but always apply literals for v_mad->v_madak3793* because both instructions are 64-bit and this doesn't increase3794* code size.3795* TODO: try to apply the literals earlier to lower the number of3796* uses below threshold3797*/3798if (literal_uses < threshold || literal_idx == 2) {3799ctx.uses[instr->operands[literal_idx].tempId()]--;3800mad_info->check_literal = true;3801mad_info->literal_idx = literal_idx;3802return;3803}3804}3805}38063807/* Mark SCC needed, so the uniform boolean transformation won't swap the definitions3808* when it isn't beneficial */3809if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&3810instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {3811ctx.info[instr->operands[0].tempId()].set_scc_needed();3812return;3813} else if 
((instr->opcode == aco_opcode::s_cselect_b64 ||3814instr->opcode == aco_opcode::s_cselect_b32) &&3815instr->operands[2].isTemp()) {3816ctx.info[instr->operands[2].tempId()].set_scc_needed();3817} else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&3818ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {3819/* Propagate label so it is correctly detected by the uniform bool transform */3820ctx.info[instr->operands[0].tempId()].set_scc_needed();38213822/* Fix definition to SCC, this will prevent RA from adding superfluous moves */3823instr->definitions[0].setFixed(scc);3824}38253826/* check for literals */3827if (!instr->isSALU() && !instr->isVALU())3828return;38293830/* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */3831if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&3832ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {3833bool transform_done = to_uniform_bool_instr(ctx, instr);38343835if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {3836/* Swap the two definition IDs in order to avoid overusing the SCC.3837* This reduces extra moves generated by RA. 
*/3838uint32_t def0_id = instr->definitions[0].getTemp().id();3839uint32_t def1_id = instr->definitions[1].getTemp().id();3840instr->definitions[0].setTemp(Temp(def1_id, s1));3841instr->definitions[1].setTemp(Temp(def0_id, s1));3842}38433844return;3845}38463847if (instr->opcode == aco_opcode::v_mad_u32_u16)3848select_mul_u32_u24(ctx, instr);38493850if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||3851(instr->isVOP3P() && ctx.program->chip_class < GFX10))3852return; /* some encodings can't ever take literals */38533854/* we do not apply the literals yet as we don't know if it is profitable */3855Operand current_literal(s1);38563857unsigned literal_id = 0;3858unsigned literal_uses = UINT32_MAX;3859Operand literal(s1);3860unsigned num_operands = 1;3861if (instr->isSALU() ||3862(ctx.program->chip_class >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P())))3863num_operands = instr->operands.size();3864/* catch VOP2 with a 3rd SGPR operand (e.g. 
v_cndmask_b32, v_addc_co_u32) */3865else if (instr->isVALU() && instr->operands.size() >= 3)3866return;38673868unsigned sgpr_ids[2] = {0, 0};3869bool is_literal_sgpr = false;3870uint32_t mask = 0;38713872/* choose a literal to apply */3873for (unsigned i = 0; i < num_operands; i++) {3874Operand op = instr->operands[i];3875unsigned bits = get_operand_size(instr, i);38763877if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&3878op.tempId() != sgpr_ids[0])3879sgpr_ids[!!sgpr_ids[0]] = op.tempId();38803881if (op.isLiteral()) {3882current_literal = op;3883continue;3884} else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {3885continue;3886}38873888if (!alu_can_accept_constant(instr->opcode, i))3889continue;38903891if (ctx.uses[op.tempId()] < literal_uses) {3892is_literal_sgpr = op.getTemp().type() == RegType::sgpr;3893mask = 0;3894literal = Operand::c32(ctx.info[op.tempId()].val);3895literal_uses = ctx.uses[op.tempId()];3896literal_id = op.tempId();3897}38983899mask |= (op.tempId() == literal_id) << i;3900}39013902/* don't go over the constant bus limit */3903bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||3904instr->opcode == aco_opcode::v_lshrrev_b64 ||3905instr->opcode == aco_opcode::v_ashrrev_i64;3906unsigned const_bus_limit = instr->isVALU() ? 
1 : UINT32_MAX;3907if (ctx.program->chip_class >= GFX10 && !is_shift64)3908const_bus_limit = 2;39093910unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];3911if (num_sgprs == const_bus_limit && !is_literal_sgpr)3912return;39133914if (literal_id && literal_uses < threshold &&3915(current_literal.isUndefined() ||3916(current_literal.size() == literal.size() &&3917current_literal.constantValue() == literal.constantValue()))) {3918/* mark the literal to be applied */3919while (mask) {3920unsigned i = u_bit_scan(&mask);3921if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)3922ctx.uses[instr->operands[i].tempId()]--;3923}3924}3925}39263927void3928apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)3929{3930/* Cleanup Dead Instructions */3931if (!instr)3932return;39333934/* apply literals on MAD */3935if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {3936mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];3937if (info->check_literal &&3938(ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {3939aco_ptr<Instruction> new_mad;39403941aco_opcode new_op =3942info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;3943if (instr->opcode == aco_opcode::v_fma_f32)3944new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;3945else if (instr->opcode == aco_opcode::v_mad_f16 ||3946instr->opcode == aco_opcode::v_mad_legacy_f16)3947new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;3948else if (instr->opcode == aco_opcode::v_fma_f16)3949new_op = info->literal_idx == 2 ? 
aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;39503951new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));3952if (info->literal_idx == 2) { /* add literal -> madak */3953new_mad->operands[0] = instr->operands[0];3954new_mad->operands[1] = instr->operands[1];3955} else { /* mul literal -> madmk */3956new_mad->operands[0] = instr->operands[1 - info->literal_idx];3957new_mad->operands[1] = instr->operands[2];3958}3959new_mad->operands[2] =3960Operand::c32(ctx.info[instr->operands[info->literal_idx].tempId()].val);3961new_mad->definitions[0] = instr->definitions[0];3962ctx.instructions.emplace_back(std::move(new_mad));3963return;3964}3965}39663967/* apply literals on other SALU/VALU */3968if (instr->isSALU() || instr->isVALU()) {3969for (unsigned i = 0; i < instr->operands.size(); i++) {3970Operand op = instr->operands[i];3971unsigned bits = get_operand_size(instr, i);3972if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {3973Operand literal = Operand::c32(ctx.info[op.tempId()].val);3974if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)3975to_VOP3(ctx, instr);3976instr->operands[i] = literal;3977}3978}3979}39803981ctx.instructions.emplace_back(std::move(instr));3982}39833984void3985optimize(Program* program)3986{3987opt_ctx ctx;3988ctx.program = program;3989std::vector<ssa_info> info(program->peekAllocationId());3990ctx.info = info.data();39913992/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */3993for (Block& block : program->blocks) {3994ctx.fp_mode = block.fp_mode;3995for (aco_ptr<Instruction>& instr : block.instructions)3996label_instruction(ctx, instr);3997}39983999ctx.uses = dead_code_analysis(program);40004001/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */4002for (Block& block : program->blocks) {4003ctx.fp_mode = block.fp_mode;4004for (aco_ptr<Instruction>& instr : block.instructions)4005combine_instruction(ctx, instr);4006}40074008/* 3. 
Top-Down DAG pass (backward) to select instructions (includes DCE) */4009for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();4010++block_rit) {4011Block* block = &(*block_rit);4012ctx.fp_mode = block->fp_mode;4013for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();4014++instr_rit)4015select_instruction(ctx, *instr_rit);4016}40174018/* 4. Add literals to instructions */4019for (Block& block : program->blocks) {4020ctx.instructions.clear();4021ctx.fp_mode = block.fp_mode;4022for (aco_ptr<Instruction>& instr : block.instructions)4023apply_literals(ctx, instr);4024block.instructions.swap(ctx.instructions);4025}4026}40274028} // namespace aco402940304031