Path: blob/21.2-virgl/src/amd/compiler/aco_lower_to_hw_instr.cpp
4550 views
/*1* Copyright © 2018 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/2324#include "aco_builder.h"25#include "aco_ir.h"2627#include "common/sid.h"2829#include <map>30#include <vector>3132namespace aco {3334struct lower_context {35Program* program;36Block* block;37std::vector<aco_ptr<Instruction>> instructions;38};3940/* used by handle_operands() indirectly through Builder::copy */41uint8_t int8_mul_table[512] = {420, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,431, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,441, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,451, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,461, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,471, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,481, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,492, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,502, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,512, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,522, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,532, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,542, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,555, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,564, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,573, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,584, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,595, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,603, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,615, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,624, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,635, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,643, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,652, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,661, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,671, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};6869aco_opcode70get_reduce_opcode(chip_class chip, ReduceOp op)71{72/* Because some 16-bit instructions are already VOP3 on GFX10, we use the73* 32-bit opcodes (VOP2) which allows to remove the tempory VGPR and to use74* DPP with the arithmetic instructions. This requires to sign-extend.75*/76switch (op) {77case iadd8:78case iadd16:79if (chip >= GFX10) {80return aco_opcode::v_add_u32;81} else if (chip >= GFX8) {82return aco_opcode::v_add_u16;83} else {84return aco_opcode::v_add_co_u32;85}86break;87case imul8:88case imul16:89if (chip >= GFX10) {90return aco_opcode::v_mul_lo_u16_e64;91} else if (chip >= GFX8) {92return aco_opcode::v_mul_lo_u16;93} else {94return aco_opcode::v_mul_u32_u24;95}96break;97case fadd16: return aco_opcode::v_add_f16;98case fmul16: return aco_opcode::v_mul_f16;99case imax8:100case imax16:101if (chip >= GFX10) {102return aco_opcode::v_max_i32;103} else if (chip >= GFX8) {104return aco_opcode::v_max_i16;105} else {106return aco_opcode::v_max_i32;107}108break;109case imin8:110case imin16:111if (chip >= GFX10) {112return aco_opcode::v_min_i32;113} else if (chip >= GFX8) {114return aco_opcode::v_min_i16;115} else {116return aco_opcode::v_min_i32;117}118break;119case umin8:120case umin16:121if (chip >= GFX10) {122return aco_opcode::v_min_u32;123} else if (chip >= GFX8) {124return aco_opcode::v_min_u16;125} else {126return aco_opcode::v_min_u32;127}128break;129case umax8:130case umax16:131if (chip >= GFX10) {132return aco_opcode::v_max_u32;133} else if (chip >= GFX8) {134return aco_opcode::v_max_u16;135} else {136return aco_opcode::v_max_u32;137}138break;139case fmin16: return aco_opcode::v_min_f16;140case fmax16: return aco_opcode::v_max_f16;141case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;142case imul32: return aco_opcode::v_mul_lo_u32;143case fadd32: return aco_opcode::v_add_f32;144case fmul32: return aco_opcode::v_mul_f32;145case imax32: return aco_opcode::v_max_i32;146case imin32: return aco_opcode::v_min_i32;147case umin32: return aco_opcode::v_min_u32;148case umax32: return aco_opcode::v_max_u32;149case fmin32: return aco_opcode::v_min_f32;150case fmax32: return aco_opcode::v_max_f32;151case iand8:152case iand16:153case iand32: return aco_opcode::v_and_b32;154case ixor8:155case ixor16:156case ixor32: return aco_opcode::v_xor_b32;157case ior8:158case ior16:159case ior32: return aco_opcode::v_or_b32;160case iadd64: return aco_opcode::num_opcodes;161case imul64: return aco_opcode::num_opcodes;162case fadd64: return aco_opcode::v_add_f64;163case fmul64: return aco_opcode::v_mul_f64;164case imin64: return aco_opcode::num_opcodes;165case imax64: return aco_opcode::num_opcodes;166case umin64: return aco_opcode::num_opcodes;167case umax64: return aco_opcode::num_opcodes;168case fmin64: return aco_opcode::v_min_f64;169case fmax64: return aco_opcode::v_max_f64;170case iand64: return aco_opcode::num_opcodes;171case ior64: return aco_opcode::num_opcodes;172case ixor64: return aco_opcode::num_opcodes;173default: return aco_opcode::num_opcodes;174}175}176177bool178is_vop3_reduce_opcode(aco_opcode opcode)179{180/* 64-bit reductions are VOP3. */181if (opcode == aco_opcode::num_opcodes)182return true;183184return instr_info.format[(int)opcode] == Format::VOP3;185}186187void188emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)189{190Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);191if (instr->definitions.size() >= 2) {192assert(instr->definitions[1].regClass() == bld.lm);193instr->definitions[1].setFixed(vcc);194}195}196197void198emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,199PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,200unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)201{202Builder bld(ctx->program, &ctx->instructions);203Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};204Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};205Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};206Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};207Operand src1_64 = Operand(src1_reg, v2);208Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};209Operand vtmp_op64 = Operand(vtmp_reg, v2);210if (op == iadd64) {211if (ctx->program->chip_class >= GFX10) {212if (identity)213bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);214bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,215bound_ctrl);216bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);217} else {218bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],219dpp_ctrl, row_mask, bank_mask, bound_ctrl);220}221bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],222Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);223} else if (op == iand64) {224bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,225bound_ctrl);226bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,227bound_ctrl);228} else if (op == ior64) {229bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,230bound_ctrl);231bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,232bound_ctrl);233} else if (op == ixor64) {234bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,235bound_ctrl);236bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,237bound_ctrl);238} else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {239aco_opcode cmp = aco_opcode::num_opcodes;240switch (op) {241case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;242case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;243case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;244case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;245default: break;246}247248if (identity) {249bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);250bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);251}252bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,253bound_ctrl);254bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,255bound_ctrl);256257bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);258bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));259bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));260} else if (op == imul64) {261/* t4 = dpp(x_hi)262* t1 = umul_lo(t4, y_lo)263* t3 = dpp(x_lo)264* t0 = umul_lo(t3, y_hi)265* t2 = iadd(t0, t1)266* t5 = umul_hi(t3, y_lo)267* res_hi = iadd(t2, t5)268* res_lo = umul_lo(t3, y_lo)269* Requires that res_hi != src0[0] and res_hi != src1[0]270* and that vtmp[0] != res_hi.271*/272if (identity)273bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);274bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,275bound_ctrl);276bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);277if (identity)278bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);279bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,280bound_ctrl);281bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);282emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);283if (identity)284bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);285bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,286bound_ctrl);287bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);288emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);289if (identity)290bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);291bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,292bound_ctrl);293bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);294}295}296297void298emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,299ReduceOp op)300{301Builder bld(ctx->program, &ctx->instructions);302Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};303RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;304Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};305Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};306Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);307Operand src1_64 = Operand(src1_reg, v2);308309if (src0_rc == s1 &&310(op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {311assert(vtmp.reg() != 0);312bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);313bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);314src0_reg = vtmp;315src0[0] = Operand(vtmp, v1);316src0[1] = Operand(PhysReg{vtmp + 1}, v1);317src0_64 = Operand(vtmp, v2);318} else if (src0_rc == s1 && op == iadd64) {319assert(vtmp.reg() != 0);320bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);321src0[1] = Operand(PhysReg{vtmp + 1}, v1);322}323324if (op == iadd64) {325if (ctx->program->chip_class >= GFX10) {326bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);327} else {328bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);329}330bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],331Operand(vcc, bld.lm));332} else if (op == iand64) {333bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);334bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);335} else if (op == ior64) {336bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);337bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);338} else if (op == ixor64) {339bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);340bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);341} else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {342aco_opcode cmp = aco_opcode::num_opcodes;343switch (op) {344case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;345case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;346case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;347case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;348default: break;349}350351bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);352bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));353bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));354} else if (op == imul64) {355if (src1_reg == dst_reg) {356/* it's fine if src0==dst but not if src1==dst */357std::swap(src0_reg, src1_reg);358std::swap(src0[0], src1[0]);359std::swap(src0[1], src1[1]);360std::swap(src0_64, src1_64);361}362assert(!(src0_reg == src1_reg));363/* t1 = umul_lo(x_hi, y_lo)364* t0 = umul_lo(x_lo, y_hi)365* t2 = iadd(t0, t1)366* t5 = umul_hi(x_lo, y_lo)367* res_hi = iadd(t2, t5)368* res_lo = umul_lo(x_lo, y_lo)369* assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp370*/371Definition tmp0_def(PhysReg{src0_reg + 1}, v1);372Definition tmp1_def(PhysReg{src1_reg + 1}, v1);373Operand tmp0_op = src0[1];374Operand tmp1_op = src1[1];375bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);376bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);377emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);378bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);379emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);380bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);381}382}383384void385emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,386ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,387bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */388{389Builder bld(ctx->program, &ctx->instructions);390RegClass rc = RegClass(RegType::vgpr, size);391Definition dst(dst_reg, rc);392Operand src0(src0_reg, rc);393Operand src1(src1_reg, rc);394395aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);396bool vop3 = is_vop3_reduce_opcode(opcode);397398if (!vop3) {399if (opcode == aco_opcode::v_add_co_u32)400bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,401bound_ctrl);402else403bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);404return;405}406407if (opcode == aco_opcode::num_opcodes) {408emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,409bound_ctrl, identity);410return;411}412413if (identity)414bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);415if (identity && size >= 2)416bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);417418for (unsigned i = 0; i < size; i++)419bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),420Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);421422bld.vop3(opcode, dst, Operand(vtmp, rc), src1);423}424425void426emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,427ReduceOp op, unsigned size)428{429Builder bld(ctx->program, &ctx->instructions);430RegClass rc = RegClass(RegType::vgpr, size);431Definition dst(dst_reg, rc);432Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));433Operand src1(src1_reg, rc);434435aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);436bool vop3 = is_vop3_reduce_opcode(opcode);437438if (opcode == aco_opcode::num_opcodes) {439emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);440return;441}442443if (vop3) {444bld.vop3(opcode, dst, src0, src1);445} else if (opcode == aco_opcode::v_add_co_u32) {446bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);447} else {448bld.vop2(opcode, dst, src0, src1);449}450}451452void453emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,454unsigned row_mask, unsigned bank_mask, bool bound_ctrl)455{456Builder bld(ctx->program, &ctx->instructions);457for (unsigned i = 0; i < size; i++) {458bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),459Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);460}461}462463void464emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)465{466for (unsigned i = 0; i < size; i++) {467bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),468Operand(PhysReg{src + i}, v1), ds_pattern);469}470}471472void473emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,474PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)475{476assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);477assert(cluster_size <= ctx->program->wave_size);478479Builder bld(ctx->program, &ctx->instructions);480481Operand identity[2];482identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));483identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));484Operand vcndmask_identity[2] = {identity[0], identity[1]};485486/* First, copy the source to tmp and set inactive lanes to the identity */487bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),488Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));489490for (unsigned i = 0; i < src.size(); i++) {491/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32492* except on GFX10, where v_writelane_b32 can take a literal. */493if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan &&494ctx->program->chip_class < GFX10) {495bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);496identity[i] = Operand(PhysReg{sitmp + i}, s1);497498bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);499vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);500} else if (identity[i].isLiteral()) {501bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);502vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);503}504}505506for (unsigned i = 0; i < src.size(); i++) {507bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),508vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),509Operand(stmp, bld.lm));510}511512if (src.regClass() == v1b) {513if (ctx->program->chip_class >= GFX8) {514aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(515aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};516sdwa->operands[0] = Operand(PhysReg{tmp}, v1);517sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);518if (reduce_op == imin8 || reduce_op == imax8)519sdwa->sel[0] = sdwa_sbyte;520else521sdwa->sel[0] = sdwa_ubyte;522sdwa->dst_sel = sdwa_udword;523bld.insert(std::move(sdwa));524} else {525aco_opcode opcode;526527if (reduce_op == imin8 || reduce_op == imax8)528opcode = aco_opcode::v_bfe_i32;529else530opcode = aco_opcode::v_bfe_u32;531532bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),533Operand::c32(8u));534}535} else if (src.regClass() == v2b) {536if (ctx->program->chip_class >= GFX10 &&537(reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||538reduce_op == umin16 || reduce_op == umax16)) {539aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(540aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};541sdwa->operands[0] = Operand(PhysReg{tmp}, v1);542sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);543if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)544sdwa->sel[0] = sdwa_sword;545else546sdwa->sel[0] = sdwa_uword;547sdwa->dst_sel = sdwa_udword;548bld.insert(std::move(sdwa));549} else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) {550aco_opcode opcode;551552if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)553opcode = aco_opcode::v_bfe_i32;554else555opcode = aco_opcode::v_bfe_u32;556557bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),558Operand::c32(16u));559}560}561562bool reduction_needs_last_op = false;563switch (op) {564case aco_opcode::p_reduce:565if (cluster_size == 1)566break;567568if (ctx->program->chip_class <= GFX7) {569reduction_needs_last_op = true;570emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));571if (cluster_size == 2)572break;573emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());574emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));575if (cluster_size == 4)576break;577emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());578emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));579if (cluster_size == 8)580break;581emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());582emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));583if (cluster_size == 16)584break;585emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());586emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));587if (cluster_size == 32)588break;589emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());590for (unsigned i = 0; i < src.size(); i++)591bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),592Operand::zero());593// TODO: it would be more effective to do the last reduction step on SALU594emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());595reduction_needs_last_op = false;596break;597}598599emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,6000xf, false);601if (cluster_size == 2)602break;603emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,6040xf, false);605if (cluster_size == 4)606break;607emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,608false);609if (cluster_size == 8)610break;611emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);612if (cluster_size == 16)613break;614615if (ctx->program->chip_class >= GFX10) {616/* GFX10+ doesn't support row_bcast15 and row_bcast31 */617for (unsigned i = 0; i < src.size(); i++)618bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),619Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());620621if (cluster_size == 32) {622reduction_needs_last_op = true;623break;624}625626emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());627for (unsigned i = 0; i < src.size(); i++)628bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),629Operand::zero());630// TODO: it would be more effective to do the last reduction step on SALU631emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());632break;633}634635if (cluster_size == 32) {636emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));637reduction_needs_last_op = true;638break;639}640assert(cluster_size == 64);641emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,642false);643emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,644false);645break;646case aco_opcode::p_exclusive_scan:647if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */648/* shift rows right */649emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);650651/* fill in the gaps in rows 1 and 3 */652bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));653bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));654for (unsigned i = 0; i < src.size(); i++) {655Instruction* perm =656bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),657Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),658Operand::c32(0xffffffffu))659.instr;660perm->vop3().opsel = 1; /* FI (Fetch Inactive) */661}662bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));663664if (ctx->program->wave_size == 64) {665/* fill in the gap in row 2 */666for (unsigned i = 0; i < src.size(); i++) {667bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),668Operand::c32(31u));669bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),670Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));671}672}673std::swap(tmp, vtmp);674} else if (ctx->program->chip_class >= GFX8) {675emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);676} else {677// TODO: use LDS on CS with a single write and shifted read678/* wavefront shift_right by 1 on SI/CI */679emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));680emit_ds_swizzle(bld, tmp, tmp, src.size(),681ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */682bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));683bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));684for (unsigned i = 0; i < src.size(); i++)685bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),686Operand(PhysReg{tmp + i}, v1));687688bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));689emit_ds_swizzle(bld, tmp, tmp, src.size(),690ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */691bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));692bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));693for (unsigned i = 0; i < src.size(); i++)694bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),695Operand(PhysReg{tmp + i}, v1));696697bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));698emit_ds_swizzle(bld, tmp, tmp, src.size(),699ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */700bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),701Operand::c32(16u));702bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),703Operand::c32(16u));704for (unsigned i = 0; i < src.size(); i++)705bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),706Operand(PhysReg{tmp + i}, v1));707708bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));709for (unsigned i = 0; i < src.size(); i++) {710bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),711Operand(PhysReg{vtmp + i}, v1));712bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),713Operand::zero());714bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),715Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));716identity[i] = Operand::zero(); /* prevent further uses of identity */717}718std::swap(tmp, vtmp);719}720721for (unsigned i = 0; i < src.size(); i++) {722if (!identity[i].isConstant() ||723identity[i].constantValue()) { /* bound_ctrl should take care of this overwise */724if (ctx->program->chip_class < GFX10)725assert((identity[i].isConstant() && !identity[i].isLiteral()) ||726identity[i].physReg() == PhysReg{sitmp + i});727bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),728Operand(PhysReg{tmp + i}, v1));729}730}731FALLTHROUGH;732case aco_opcode::p_inclusive_scan:733assert(cluster_size == ctx->program->wave_size);734if (ctx->program->chip_class <= GFX7) {735emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));736bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));737bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));738emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());739740bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));741emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));742bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));743bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));744emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());745746bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));747emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));748bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));749bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));750emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());751752bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));753emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));754bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));755bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));756emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());757758bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));759emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));760bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),761Operand::c32(16u));762bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),763Operand::c32(16u));764emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());765766for (unsigned i = 0; i < src.size(); i++)767bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),768Operand::c32(31u));769bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),770Operand::c32(32u));771emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());772break;773}774775emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,776identity);777emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,778identity);779emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,780identity);781emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,782identity);783if (ctx->program->chip_class >= GFX10) {784bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),785Operand::c32(16u));786bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),787Operand::c32(16u));788for (unsigned i = 0; i < src.size(); i++) {789Instruction* perm =790bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),791Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),792Operand::c32(0xffffffffu))793.instr;794perm->vop3().opsel = 1; /* FI (Fetch Inactive) */795}796emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());797798if (ctx->program->wave_size == 64) {799bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),800Operand::c32(32u));801for (unsigned i = 0; i < src.size(); i++)802bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),803Operand::c32(31u));804emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());805}806} else {807emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,808false, identity);809emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,810false, identity);811}812break;813default: unreachable("Invalid reduction mode");814}815816if (op == aco_opcode::p_reduce) {817if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {818bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));819emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());820return;821}822823if (reduction_needs_last_op)824emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());825}826827/* restore exec */828bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));829830if (dst.regClass().type() == RegType::sgpr) {831for (unsigned k = 0; k < src.size(); k++) {832bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),833Operand::c32(ctx->program->wave_size - 1));834}835} else if (dst.physReg() != tmp) {836for (unsigned k = 0; k < src.size(); k++) {837bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),838Operand(PhysReg{tmp + k}, v1));839}840}841}842843void844emit_gfx10_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)845{846/* Emulates proper bpermute on GFX10 in wave64 mode.847*848* This is necessary because on GFX10 the bpermute instruction only works849* on half waves (you can think of it as having a cluster size of 32), so we850* manually swap the data between the two halves using two shared VGPRs.851*/852853assert(program->chip_class >= GFX10);854assert(program->wave_size == 64);855856unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;857Definition dst = instr->definitions[0];858Definition tmp_exec = instr->definitions[1];859Definition clobber_scc = instr->definitions[2];860Operand index_x4 = instr->operands[0];861Operand input_data = instr->operands[1];862Operand same_half = instr->operands[2];863864assert(dst.regClass() == v1);865assert(tmp_exec.regClass() == bld.lm);866assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);867assert(same_half.regClass() == bld.lm);868assert(index_x4.regClass() == v1);869assert(input_data.regClass().type() == RegType::vgpr);870assert(input_data.bytes() <= 4);871assert(dst.physReg() != index_x4.physReg());872assert(dst.physReg() != input_data.physReg());873assert(tmp_exec.physReg() != same_half.physReg());874875PhysReg shared_vgpr_lo(shared_vgpr_reg_0);876PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);877878/* Permute the input within the same half-wave */879bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);880881/* HI: Copy data from high lanes 32-63 to shared vgpr */882bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,883dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);884/* Save EXEC */885bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));886/* Set EXEC to enable LO lanes only */887bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());888/* LO: Copy data from low lanes 0-31 to shared vgpr */889bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);890/* LO: bpermute shared vgpr (high lanes' data) */891bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,892Operand(shared_vgpr_hi, v1));893/* Set EXEC to enable HI lanes only */894bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));895/* HI: bpermute shared vgpr (low lanes' data) */896bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,897Operand(shared_vgpr_lo, v1));898899/* Only enable lanes which use the other half's data */900bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,901Operand(tmp_exec.physReg(), s2), same_half);902/* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */903bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),9040x3, 0xf, false);905/* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */906bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),9070xc, 0xf, false);908909/* Restore saved EXEC */910bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));911912/* RA assumes that the result is always in the low part of the register, so we have to shift, if913* it's not there already */914if (input_data.physReg().byte()) {915unsigned right_shift = input_data.physReg().byte() * 8;916bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),917Operand(dst.physReg(), v1));918}919}920921void922emit_gfx6_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)923{924/* Emulates bpermute using readlane instructions */925926Operand index = instr->operands[0];927Operand input = instr->operands[1];928Definition dst = instr->definitions[0];929Definition temp_exec = instr->definitions[1];930Definition clobber_vcc = instr->definitions[2];931932assert(dst.regClass() == v1);933assert(temp_exec.regClass() == bld.lm);934assert(clobber_vcc.regClass() == bld.lm);935assert(clobber_vcc.physReg() == vcc);936assert(index.regClass() == v1);937assert(index.physReg() != dst.physReg());938assert(input.regClass().type() == RegType::vgpr);939assert(input.bytes() <= 4);940assert(input.physReg() != dst.physReg());941942/* Save original EXEC */943bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2));944945/* An "unrolled loop" that is executed per each lane.946* This takes only a few instructions per lane, as opposed to a "real" loop947* with branching, where the branch instruction alone would take 16+ cycles.948*/949for (unsigned n = 0; n < program->wave_size; ++n) {950/* Activate the lane which has N for its source index */951bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand::c32(n),952index);953/* Read the data from lane N */954bld.readlane(Definition(vcc, s1), input, Operand::c32(n));955/* On the active lane, move the data we read from lane N to the destination VGPR */956bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));957/* Restore original EXEC */958bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2));959}960}961962struct copy_operation {963Operand op;964Definition def;965unsigned bytes;966union {967uint8_t uses[8];968uint64_t is_used = 0;969};970};971972void973split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,974const copy_operation& src, bool ignore_uses, unsigned max_size)975{976PhysReg def_reg = src.def.physReg();977PhysReg op_reg = src.op.physReg();978def_reg.reg_b += offset;979op_reg.reg_b += offset;980981/* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */982if (ctx->program->chip_class < GFX10 && src.def.regClass().type() == RegType::vgpr)983max_size = MIN2(max_size, 4);984unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;985986/* make sure the size is a power of two and reg % bytes == 0 */987unsigned bytes = 1;988for (; bytes <= max_size; bytes *= 2) {989unsigned next = bytes * 2u;990bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&991offset + next <= src.bytes && next <= max_size;992if (!src.op.isConstant() && can_increase)993can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;994for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)995can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);996if (!can_increase)997break;998}9991000RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u)1001: RegClass(src.def.regClass().type(), bytes).as_subdword();1002*def = Definition(src.def.tempId(), def_reg, def_cls);1003if (src.op.isConstant()) {1004assert(bytes >= 1 && bytes <= 8);1005uint64_t val = src.op.constantValue64() >> (offset * 8u);1006*op = Operand::get_const(ctx->program->chip_class, val, bytes);1007} else {1008RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u)1009: RegClass(src.op.regClass().type(), bytes).as_subdword();1010*op = Operand(op_reg, op_cls);1011op->setTemp(Temp(src.op.tempId(), op_cls));1012}1013}10141015uint32_t1016get_intersection_mask(int a_start, int a_size, int b_start, int b_size)1017{1018int intersection_start = MAX2(b_start - a_start, 0);1019int intersection_end = MAX2(b_start + b_size - a_start, 0);1020if (intersection_start >= a_size || intersection_end == 0)1021return 0;10221023uint32_t mask = u_bit_consecutive(0, a_size);1024return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;1025}10261027void1028copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)1029{1030assert(op.bytes() == dst.bytes());10311032if (dst.bytes() == 4 && op.isLiteral()) {1033uint32_t imm = op.constantValue();1034if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {1035bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);1036return;1037} else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {1038uint32_t rev = util_bitreverse(imm);1039if (dst.regClass() == s1)1040bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));1041else1042bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));1043return;1044} else if (dst.regClass() == s1 && imm != 0) {1045unsigned start = (ffs(imm) - 1) & 0x1f;1046unsigned size = util_bitcount(imm) & 0x1f;1047if ((((1u << size) - 1u) << start) == imm) {1048bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));1049return;1050}1051}1052}10531054if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)1055op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */10561057if (dst.regClass() == s1) {1058bld.sop1(aco_opcode::s_mov_b32, dst, op);1059} else if (dst.regClass() == s2) {1060/* s_ashr_i64 writes SCC, so we can't use it */1061assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));1062bld.sop1(aco_opcode::s_mov_b64, dst, op);1063} else if (dst.regClass() == v2) {1064if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {1065bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);1066} else {1067assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));1068bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);1069}1070} else if (dst.regClass() == v1) {1071bld.vop1(aco_opcode::v_mov_b32, dst, op);1072} else {1073assert(dst.regClass() == v1b || dst.regClass() == v2b);10741075if (dst.regClass() == v1b && ctx->program->chip_class >= GFX9) {1076uint8_t val = op.constantValue();1077Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));1078if (op32.isLiteral()) {1079uint32_t a = (uint32_t)int8_mul_table[val * 2];1080uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];1081bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,1082Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),1083Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));1084} else {1085bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);1086}1087} else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX9 && !op.isLiteral()) {1088if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {1089/* use v_mov_b32 to avoid possible issues with denormal flushing or1090* NaN. v_add_f16 is still needed for float constants. */1091uint32_t val32 = (int32_t)(int16_t)op.constantValue();1092bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));1093} else {1094bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());1095}1096} else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10 &&1097(ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {1098if (dst.physReg().byte() == 2) {1099Operand def_lo(dst.physReg().advance(-2), v2b);1100Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);1101instr->vop3().opsel = 0;1102} else {1103assert(dst.physReg().byte() == 0);1104Operand def_hi(dst.physReg().advance(2), v2b);1105Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);1106instr->vop3().opsel = 2;1107}1108} else {1109uint32_t offset = dst.physReg().byte() * 8u;1110uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;1111uint32_t val = (op.constantValue() << offset) & mask;1112dst = Definition(PhysReg(dst.physReg().reg()), v1);1113Operand def_op(dst.physReg(), v1);1114if (val != mask)1115bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);1116if (val != 0)1117bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);1118}1119}1120}11211122bool1123do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,1124PhysReg scratch_sgpr)1125{1126bool did_copy = false;1127for (unsigned offset = 0; offset < copy.bytes;) {1128if (copy.uses[offset]) {1129offset++;1130continue;1131}11321133Definition def;1134Operand op;1135split_copy(ctx, offset, &def, &op, copy, false, 8);11361137if (def.physReg() == scc) {1138bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());1139*preserve_scc = true;1140} else if (op.isConstant()) {1141copy_constant(ctx, bld, def, op);1142} else if (def.regClass() == v1) {1143bld.vop1(aco_opcode::v_mov_b32, def, op);1144} else if (def.regClass() == v2) {1145bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);1146} else if (def.regClass() == s1) {1147bld.sop1(aco_opcode::s_mov_b32, def, op);1148} else if (def.regClass() == s2) {1149bld.sop1(aco_opcode::s_mov_b64, def, op);1150} else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {1151if (op.physReg().byte()) {1152assert(def.physReg().byte() == 0);1153bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);1154} else if (def.physReg().byte()) {1155assert(op.physReg().byte() == 0);1156/* preserve the target's lower half */1157uint32_t bits = def.physReg().byte() * 8;1158PhysReg lo_reg = PhysReg(def.physReg().reg());1159Definition lo_half =1160Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));1161Definition dst =1162Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));11631164if (def.physReg().reg() == op.physReg().reg()) {1165bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),1166Operand(lo_reg, lo_half.regClass()));1167if (def.physReg().byte() == 1) {1168bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);1169} else if (def.physReg().byte() == 2) {1170bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);1171} else if (def.physReg().byte() == 3) {1172bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),1173Operand::c32((1 << bits) + 1u));1174bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);1175}1176} else {1177lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));1178bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),1179Operand(lo_reg, lo_half.regClass()));1180bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,1181Operand(lo_half.physReg(), lo_half.regClass()),1182Operand::c32(4 - def.physReg().byte()));1183}1184} else {1185bld.vop1(aco_opcode::v_mov_b32, def, op);1186}1187} else if (def.regClass().is_subdword()) {1188bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);1189} else {1190unreachable("unsupported copy");1191}11921193did_copy = true;1194offset += def.bytes();1195}1196return did_copy;1197}11981199void1200do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,1201Pseudo_instruction* pi)1202{1203unsigned offset = 0;12041205if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&1206(copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {1207/* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte1208* swap */1209PhysReg op = copy.op.physReg();1210PhysReg def = copy.def.physReg();1211op.reg_b &= ~0x3;1212def.reg_b &= ~0x3;12131214copy_operation tmp;1215tmp.op = Operand(op, v1);1216tmp.def = Definition(def, v1);1217tmp.bytes = 4;1218memset(tmp.uses, 1, 4);1219do_swap(ctx, bld, tmp, preserve_scc, pi);12201221op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;1222def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;1223tmp.op = Operand(op, v1b);1224tmp.def = Definition(def, v1b);1225tmp.bytes = 1;1226tmp.uses[0] = 1;1227do_swap(ctx, bld, tmp, preserve_scc, pi);12281229offset = copy.bytes;1230}12311232for (; offset < copy.bytes;) {1233Definition def;1234Operand op;1235unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;1236split_copy(ctx, offset, &def, &op, copy, true, max_size);12371238assert(op.regClass() == def.regClass());1239Operand def_as_op = Operand(def.physReg(), def.regClass());1240Definition op_as_def = Definition(op.physReg(), op.regClass());1241if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {1242bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);1243} else if (def.regClass() == v1) {1244assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);1245bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);1246bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);1247bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);1248} else if (op.physReg() == scc || def.physReg() == scc) {1249/* we need to swap scc and another sgpr */1250assert(!preserve_scc);12511252PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();12531254bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));1255bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),1256Operand::zero());1257bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));1258} else if (def.regClass() == s1) {1259if (preserve_scc) {1260bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);1261bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);1262bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));1263} else {1264bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);1265bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);1266bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);1267}1268} else if (def.regClass() == s2) {1269if (preserve_scc)1270bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));1271bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);1272bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);1273bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);1274if (preserve_scc)1275bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),1276Operand::zero());1277} else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {1278bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,1279Operand::c32(2u));1280} else {1281assert(def.regClass().is_subdword());1282bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);1283bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);1284bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);1285}12861287offset += def.bytes();1288}12891290if (ctx->program->chip_class <= GFX7)1291return;12921293/* fixup in case we swapped bytes we shouldn't have */1294copy_operation tmp_copy = copy;1295tmp_copy.op.setFixed(copy.def.physReg());1296tmp_copy.def.setFixed(copy.op.physReg());1297do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);1298}12991300void1301do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)1302{1303if (lo.isConstant() && hi.isConstant()) {1304copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));1305return;1306}13071308bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&1309(ctx->program->chip_class >= GFX10 ||1310(ctx->program->chip_class >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));13111312if (can_use_pack) {1313Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);1314/* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */1315instr->vop3().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);1316return;1317}13181319/* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */1320if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&1321(!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||1322ctx->program->chip_class >= GFX10)) {1323bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));1324return;1325}13261327Definition def_lo = Definition(def.physReg(), v2b);1328Definition def_hi = Definition(def.physReg().advance(2), v2b);13291330if (lo.isConstant()) {1331/* move hi and zero low bits */1332if (hi.physReg().byte() == 0)1333bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);1334else1335bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);1336bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),1337Operand(def.physReg(), v1));1338return;1339}1340if (hi.isConstant()) {1341/* move lo and zero high bits */1342if (lo.physReg().byte() == 2)1343bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);1344else1345bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);1346bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),1347Operand(def.physReg(), v1));1348return;1349}13501351if (lo.physReg().reg() == def.physReg().reg()) {1352/* lo is in the high bits of def */1353assert(lo.physReg().byte() == 2);1354bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);1355lo.setFixed(def.physReg());1356} else if (hi.physReg() == def.physReg()) {1357/* hi is in the low bits of def */1358assert(hi.physReg().byte() == 0);1359bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);1360hi.setFixed(def.physReg().advance(2));1361} else if (ctx->program->chip_class >= GFX8) {1362/* either lo or hi can be placed with just a v_mov */1363assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);1364Operand& op = lo.physReg().byte() == 0 ? lo : hi;1365PhysReg reg = def.physReg().advance(op.physReg().byte());1366bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);1367op.setFixed(reg);1368}13691370if (ctx->program->chip_class >= GFX8) {1371/* either hi or lo are already placed correctly */1372if (lo.physReg().reg() == def.physReg().reg())1373bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);1374else1375bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);1376return;1377}13781379/* alignbyte needs the operands in the following way:1380* | xx hi | lo xx | >> 2 byte */1381if (lo.physReg().byte() != hi.physReg().byte()) {1382/* | xx lo | hi xx | => | lo hi | lo hi | */1383assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);1384bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));1385lo = Operand(def_hi.physReg(), v2b);1386hi = Operand(def_lo.physReg(), v2b);1387} else if (lo.physReg().byte() == 0) {1388/* | xx hi | xx lo | => | xx hi | lo 00 | */1389bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);1390lo = Operand(def_hi.physReg(), v2b);1391} else {1392/* | hi xx | lo xx | => | 00 hi | lo xx | */1393assert(hi.physReg().byte() == 2);1394bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);1395hi = Operand(def_lo.physReg(), v2b);1396}1397/* perform the alignbyte */1398bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));1399}14001401void1402try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,1403copy_operation& copy)1404{1405// TODO try more relaxed alignment for subdword copies1406unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);1407unsigned next_op_align = next_def_align;1408if (copy.def.regClass().type() == RegType::vgpr)1409next_def_align = MIN2(next_def_align, 4);1410if (copy.op.regClass().type() == RegType::vgpr)1411next_op_align = MIN2(next_op_align, 4);14121413if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||1414(!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))1415return;14161417auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));1418if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||1419copy.op.isConstant() != other->second.op.isConstant())1420return;14211422/* don't create 64-bit copies before GFX10 */1423if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&1424ctx->program->chip_class < GFX10)1425return;14261427unsigned new_size = copy.bytes + other->second.bytes;1428if (copy.op.isConstant()) {1429uint64_t val =1430copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));1431if (!Operand::is_constant_representable(val, copy.bytes + other->second.bytes, true,1432copy.def.regClass().type() == RegType::vgpr))1433return;1434copy.op = Operand::get_const(ctx->program->chip_class, val, new_size);1435} else {1436if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))1437return;1438copy.op = Operand(copy.op.physReg(), RegClass::get(copy.op.regClass().type(), new_size));1439}14401441copy.bytes = new_size;1442copy.def = Definition(copy.def.physReg(), RegClass::get(copy.def.regClass().type(), copy.bytes));1443copy_map.erase(other);1444}14451446void1447handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,1448chip_class chip_class, Pseudo_instruction* pi)1449{1450Builder bld(ctx->program, &ctx->instructions);1451unsigned num_instructions_before = ctx->instructions.size();1452aco_ptr<Instruction> mov;1453bool writes_scc = false;14541455/* count the number of uses for each dst reg */1456for (auto it = copy_map.begin(); it != copy_map.end();) {14571458if (it->second.def.physReg() == scc)1459writes_scc = true;14601461assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));14621463/* if src and dst reg are the same, remove operation */1464if (it->first == it->second.op.physReg()) {1465it = copy_map.erase(it);1466continue;1467}14681469/* split large copies */1470if (it->second.bytes > 8) {1471assert(!it->second.op.isConstant());1472assert(!it->second.def.regClass().is_subdword());1473RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);1474Definition hi_def = Definition(PhysReg{it->first + 2}, rc);1475rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);1476Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);1477copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};1478copy_map[hi_def.physReg()] = copy;1479assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);1480it->second.op = Operand(it->second.op.physReg(),1481it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);1482it->second.def = Definition(it->second.def.physReg(),1483it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);1484it->second.bytes = 8;1485}14861487try_coalesce_copies(ctx, copy_map, it->second);14881489/* check if the definition reg is used by another copy operation */1490for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {1491if (copy.second.op.isConstant())1492continue;1493for (uint16_t i = 0; i < it->second.bytes; i++) {1494/* distance might underflow */1495unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;1496if (distance < copy.second.bytes)1497it->second.uses[i] += 1;1498}1499}15001501++it;1502}15031504/* first, handle paths in the location transfer graph */1505bool preserve_scc = pi->tmp_in_scc && !writes_scc;1506bool skip_partial_copies = true;1507for (auto it = copy_map.begin();;) {1508if (copy_map.empty()) {1509ctx->program->statistics[statistic_copies] +=1510ctx->instructions.size() - num_instructions_before;1511return;1512}1513if (it == copy_map.end()) {1514if (!skip_partial_copies)1515break;1516skip_partial_copies = false;1517it = copy_map.begin();1518}15191520/* check if we can pack one register at once */1521if (it->first.byte() == 0 && it->second.bytes == 2) {1522PhysReg reg_hi = it->first.advance(2);1523std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);1524if (other != copy_map.end() && other->second.bytes == 2) {1525/* check if the target register is otherwise unused */1526bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&1527other->second.op.physReg() == it->first);1528bool unused_hi = !other->second.is_used ||1529(other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);1530if (unused_lo && unused_hi) {1531Operand lo = it->second.op;1532Operand hi = other->second.op;1533do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);1534copy_map.erase(it);1535copy_map.erase(other);15361537for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {1538for (uint16_t i = 0; i < other2.second.bytes; i++) {1539/* distance might underflow */1540unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;1541unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;1542if (distance_lo < 2 || distance_hi < 2)1543other2.second.uses[i] -= 1;1544}1545}1546it = copy_map.begin();1547continue;1548}1549}1550}15511552/* on GFX6/7, we need some small workarounds as there is no1553* SDWA instruction to do partial register writes */1554if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) {1555if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&1556pi->opcode == aco_opcode::p_split_vector) {1557/* Other operations might overwrite the high bits, so change all users1558* of the high bits to the new target where they are still available.1559* This mechanism depends on also emitting dead definitions. */1560PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);1561while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {1562std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();1563for (other = copy_map.begin(); other != copy_map.end(); other++) {1564/* on GFX6/7, if the high bits are used as operand, they cannot be a target */1565if (other->second.op.physReg() == reg_hi) {1566other->second.op.setFixed(it->first.advance(reg_hi.byte()));1567break; /* break because an operand can only be used once */1568}1569}1570reg_hi = reg_hi.advance(it->second.bytes);1571}1572} else if (it->first.byte()) {1573assert(pi->opcode == aco_opcode::p_create_vector);1574/* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,1575* move to the target operand's high bits. This is save to do as it cannot be an operand1576*/1577PhysReg lo = PhysReg(it->first.reg());1578std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);1579if (other != copy_map.end()) {1580assert(other->second.bytes == it->first.byte());1581PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());1582it->second.def = Definition(new_reg_hi, it->second.def.regClass());1583it->second.is_used = 0;1584other->second.bytes += it->second.bytes;1585other->second.def.setTemp(Temp(other->second.def.tempId(),1586RegClass::get(RegType::vgpr, other->second.bytes)));1587other->second.op.setTemp(Temp(other->second.op.tempId(),1588RegClass::get(RegType::vgpr, other->second.bytes)));1589/* if the new target's high bits are also a target, change uses */1590std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);1591if (target != copy_map.end()) {1592for (unsigned i = 0; i < it->second.bytes; i++)1593target->second.uses[i]++;1594}1595}1596}1597}15981599/* find portions where the target reg is not used as operand for any other copy */1600if (it->second.is_used) {1601if (it->second.op.isConstant() || skip_partial_copies) {1602/* we have to skip constants until is_used=0.1603* we also skip partial copies at the beginning to help coalescing */1604++it;1605continue;1606}16071608unsigned has_zero_use_bytes = 0;1609for (unsigned i = 0; i < it->second.bytes; i++)1610has_zero_use_bytes |= (it->second.uses[i] == 0) << i;16111612if (has_zero_use_bytes) {1613/* Skipping partial copying and doing a v_swap_b32 and then fixup1614* copies is usually beneficial for sub-dword copies, but if doing1615* a partial copy allows further copies, it should be done instead. */1616bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);1617for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {1618/* on GFX6/7, we can only do copies with full registers */1619if (partial_copy || ctx->program->chip_class <= GFX7)1620break;1621for (uint16_t i = 0; i < copy.second.bytes; i++) {1622/* distance might underflow */1623unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;1624if (distance < it->second.bytes && copy.second.uses[i] == 1 &&1625!it->second.uses[distance])1626partial_copy = true;1627}1628}16291630if (!partial_copy) {1631++it;1632continue;1633}1634} else {1635/* full target reg is used: register swapping needed */1636++it;1637continue;1638}1639}16401641bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);1642skip_partial_copies = did_copy;1643std::pair<PhysReg, copy_operation> copy = *it;16441645if (it->second.is_used == 0) {1646/* the target reg is not used as operand for any other copy, so we1647* copied to all of it */1648copy_map.erase(it);1649it = copy_map.begin();1650} else {1651/* we only performed some portions of this copy, so split it to only1652* leave the portions that still need to be done */1653copy_operation original = it->second; /* the map insertion below can overwrite this */1654copy_map.erase(it);1655for (unsigned offset = 0; offset < original.bytes;) {1656if (original.uses[offset] == 0) {1657offset++;1658continue;1659}1660Definition def;1661Operand op;1662split_copy(ctx, offset, &def, &op, original, false, 8);16631664copy_operation new_copy = {op, def, def.bytes()};1665for (unsigned i = 0; i < new_copy.bytes; i++)1666new_copy.uses[i] = original.uses[i + offset];1667copy_map[def.physReg()] = new_copy;16681669offset += def.bytes();1670}16711672it = copy_map.begin();1673}16741675/* Reduce the number of uses of the operand reg by one. Do this after1676* splitting the copy or removing it in case the copy writes to it's own1677* operand (for example, v[7:8] = v[8:9]) */1678if (did_copy && !copy.second.op.isConstant()) {1679for (std::pair<const PhysReg, copy_operation>& other : copy_map) {1680for (uint16_t i = 0; i < other.second.bytes; i++) {1681/* distance might underflow */1682unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;1683if (distance < copy.second.bytes && !copy.second.uses[distance])1684other.second.uses[i] -= 1;1685}1686}1687}1688}16891690/* all target regs are needed as operand somewhere which means, all entries are part of a cycle */1691unsigned largest = 0;1692for (const std::pair<const PhysReg, copy_operation>& op : copy_map)1693largest = MAX2(largest, op.second.bytes);16941695while (!copy_map.empty()) {16961697/* Perform larger swaps first, because larger swaps swaps can make other1698* swaps unnecessary. */1699auto it = copy_map.begin();1700for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {1701if (it2->second.bytes > it->second.bytes) {1702it = it2;1703if (it->second.bytes == largest)1704break;1705}1706}17071708/* should already be done */1709assert(!it->second.op.isConstant());17101711assert(it->second.op.isFixed());1712assert(it->second.def.regClass() == it->second.op.regClass());17131714if (it->first == it->second.op.physReg()) {1715copy_map.erase(it);1716continue;1717}17181719if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)1720assert(!(it->second.def.physReg() == pi->scratch_sgpr));17211722/* to resolve the cycle, we have to swap the src reg with the dst reg */1723copy_operation swap = it->second;17241725/* if this is self-intersecting, we have to split it because1726* self-intersecting swaps don't make sense */1727PhysReg src = swap.op.physReg(), dst = swap.def.physReg();1728if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {1729unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);1730RegType type = swap.def.regClass().type();17311732copy_operation remaining;1733src.reg_b += offset;1734dst.reg_b += offset;1735remaining.bytes = swap.bytes - offset;1736memcpy(remaining.uses, swap.uses + offset, remaining.bytes);1737remaining.op = Operand(src, RegClass::get(type, remaining.bytes));1738remaining.def = Definition(dst, RegClass::get(type, remaining.bytes));1739copy_map[dst] = remaining;17401741memset(swap.uses + offset, 0, swap.bytes - offset);1742swap.bytes = offset;1743}17441745/* GFX6-7 can only swap full registers */1746if (ctx->program->chip_class <= GFX7)1747swap.bytes = align(swap.bytes, 4);17481749do_swap(ctx, bld, swap, preserve_scc, pi);17501751/* remove from map */1752copy_map.erase(it);17531754/* change the operand reg of the target's uses and split uses if needed */1755uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);1756for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {1757if (target->second.op.physReg() == swap.def.physReg() &&1758swap.bytes == target->second.bytes) {1759target->second.op.setFixed(swap.op.physReg());1760break;1761}17621763uint32_t imask =1764get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,1765target->second.op.physReg().reg_b, target->second.bytes);17661767if (!imask)1768continue;17691770int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;17711772/* split and update the middle (the portion that reads the swap's1773* definition) to read the swap's operand instead */1774int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;1775int swap_def_end = swap.def.physReg().reg_b + swap.bytes;1776int before_bytes = MAX2(-offset, 0);1777int after_bytes = MAX2(target_op_end - swap_def_end, 0);1778int middle_bytes = target->second.bytes - before_bytes - after_bytes;17791780if (after_bytes) {1781unsigned after_offset = before_bytes + middle_bytes;1782assert(after_offset > 0);1783copy_operation copy;1784copy.bytes = after_bytes;1785memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);1786RegClass rc = RegClass::get(target->second.op.regClass().type(), after_bytes);1787copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);1788copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);1789copy_map[copy.def.physReg()] = copy;1790}17911792if (middle_bytes) {1793copy_operation copy;1794copy.bytes = middle_bytes;1795memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);1796RegClass rc = RegClass::get(target->second.op.regClass().type(), middle_bytes);1797copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);1798copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);1799copy_map[copy.def.physReg()] = copy;1800}18011802if (before_bytes) {1803copy_operation copy;1804target->second.bytes = before_bytes;1805RegClass rc = RegClass::get(target->second.op.regClass().type(), before_bytes);1806target->second.op = Operand(target->second.op.physReg(), rc);1807target->second.def = Definition(target->second.def.physReg(), rc);1808memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);1809}18101811/* break early since we know each byte of the swap's definition is used1812* at most once */1813bytes_left &= ~imask;1814if (!bytes_left)1815break;1816}1817}1818ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;1819}18201821void1822emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)1823{1824if (bld.program->chip_class >= GFX10) {1825if (set_round)1826bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);1827if (set_denorm)1828bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);1829} else if (set_round || set_denorm) {1830/* "((size - 1) << 11) | register" (MODE is encoded as register 1) */1831Instruction* instr =1832bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::c8(new_mode.val), (7 << 11) | 1).instr;1833/* has to be a literal */1834instr->operands[0].setFixed(PhysReg{255});1835}1836}18371838void1839emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)1840{1841float_mode config_mode;1842config_mode.val = program.config->float_mode;18431844bool set_round = always_set && block->fp_mode.round != config_mode.round;1845bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;1846if (block->kind & block_kind_top_level) {1847for (unsigned pred : block->linear_preds) {1848if (program.blocks[pred].fp_mode.round != block->fp_mode.round)1849set_round = true;1850if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)1851set_denorm = true;1852}1853}1854/* only allow changing modes at top-level blocks so this doesn't break1855* the "jump over empty blocks" optimization */1856assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));1857emit_set_mode(bld, block->fp_mode, set_round, set_denorm);1858}18591860void1861lower_to_hw_instr(Program* program)1862{1863Block* discard_block = NULL;18641865for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {1866Block* block = &program->blocks[block_idx];1867lower_context ctx;1868ctx.program = program;1869ctx.block = block;1870Builder bld(program, &ctx.instructions);18711872emit_set_mode_from_block(bld, *program, block, (block_idx == 0));18731874for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {1875aco_ptr<Instruction>& instr = block->instructions[instr_idx];1876aco_ptr<Instruction> mov;1877if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {1878Pseudo_instruction* pi = &instr->pseudo();18791880switch (instr->opcode) {1881case aco_opcode::p_extract_vector: {1882PhysReg reg = instr->operands[0].physReg();1883Definition& def = instr->definitions[0];1884reg.reg_b += instr->operands[1].constantValue() * def.bytes();18851886if (reg == def.physReg())1887break;18881889RegClass op_rc = def.regClass().is_subdword()1890? def.regClass()1891: RegClass(instr->operands[0].getTemp().type(), def.size());1892std::map<PhysReg, copy_operation> copy_operations;1893copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};1894handle_operands(copy_operations, &ctx, program->chip_class, pi);1895break;1896}1897case aco_opcode::p_create_vector: {1898std::map<PhysReg, copy_operation> copy_operations;1899PhysReg reg = instr->definitions[0].physReg();19001901for (const Operand& op : instr->operands) {1902if (op.isConstant()) {1903const Definition def = Definition(1904reg, RegClass(instr->definitions[0].getTemp().type(), op.size()));1905copy_operations[reg] = {op, def, op.bytes()};1906reg.reg_b += op.bytes();1907continue;1908}1909if (op.isUndefined()) {1910// TODO: coalesce subdword copies if dst byte is 01911reg.reg_b += op.bytes();1912continue;1913}19141915RegClass rc_def =1916op.regClass().is_subdword()1917? op.regClass()1918: RegClass(instr->definitions[0].getTemp().type(), op.size());1919const Definition def = Definition(reg, rc_def);1920copy_operations[def.physReg()] = {op, def, op.bytes()};1921reg.reg_b += op.bytes();1922}1923handle_operands(copy_operations, &ctx, program->chip_class, pi);1924break;1925}1926case aco_opcode::p_split_vector: {1927std::map<PhysReg, copy_operation> copy_operations;1928PhysReg reg = instr->operands[0].physReg();19291930for (const Definition& def : instr->definitions) {1931RegClass rc_op = def.regClass().is_subdword()1932? def.regClass()1933: RegClass(instr->operands[0].getTemp().type(), def.size());1934const Operand op = Operand(reg, rc_op);1935copy_operations[def.physReg()] = {op, def, def.bytes()};1936reg.reg_b += def.bytes();1937}1938handle_operands(copy_operations, &ctx, program->chip_class, pi);1939break;1940}1941case aco_opcode::p_parallelcopy:1942case aco_opcode::p_wqm: {1943std::map<PhysReg, copy_operation> copy_operations;1944for (unsigned j = 0; j < instr->operands.size(); j++) {1945assert(instr->definitions[j].bytes() == instr->operands[j].bytes());1946copy_operations[instr->definitions[j].physReg()] = {1947instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};1948}1949handle_operands(copy_operations, &ctx, program->chip_class, pi);1950break;1951}1952case aco_opcode::p_exit_early_if: {1953/* don't bother with an early exit near the end of the program */1954if ((block->instructions.size() - 1 - instr_idx) <= 4 &&1955block->instructions.back()->opcode == aco_opcode::s_endpgm) {1956unsigned null_exp_dest =1957(ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;1958bool ignore_early_exit = true;19591960for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {1961const aco_ptr<Instruction>& instr2 = block->instructions[k];1962if (instr2->opcode == aco_opcode::s_endpgm ||1963instr2->opcode == aco_opcode::p_logical_end)1964continue;1965else if (instr2->opcode == aco_opcode::exp &&1966instr2->exp().dest == null_exp_dest)1967continue;1968else if (instr2->opcode == aco_opcode::p_parallelcopy &&1969instr2->definitions[0].isFixed() &&1970instr2->definitions[0].physReg() == exec)1971continue;19721973ignore_early_exit = false;1974}19751976if (ignore_early_exit)1977break;1978}19791980if (!discard_block) {1981discard_block = program->create_and_insert_block();1982block = &program->blocks[block_idx];19831984bld.reset(discard_block);1985bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,1986V_008DFC_SQ_EXP_NULL, false, true, true);1987bld.sopp(aco_opcode::s_endpgm);19881989bld.reset(&ctx.instructions);1990}19911992// TODO: exec can be zero here with block_kind_discard19931994assert(instr->operands[0].physReg() == scc);1995bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0],1996discard_block->index);19971998discard_block->linear_preds.push_back(block->index);1999block->linear_succs.push_back(discard_block->index);2000break;2001}2002case aco_opcode::p_spill: {2003assert(instr->operands[0].regClass() == v1.as_linear());2004for (unsigned i = 0; i < instr->operands[2].size(); i++) {2005Operand src =2006instr->operands[2].isConstant()2007? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))2008: Operand(PhysReg{instr->operands[2].physReg() + i}, s1);2009bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,2010Operand::c32(instr->operands[1].constantValue() + i),2011instr->operands[0]);2012}2013break;2014}2015case aco_opcode::p_reload: {2016assert(instr->operands[0].regClass() == v1.as_linear());2017for (unsigned i = 0; i < instr->definitions[0].size(); i++)2018bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),2019instr->operands[0],2020Operand::c32(instr->operands[1].constantValue() + i));2021break;2022}2023case aco_opcode::p_as_uniform: {2024if (instr->operands[0].isConstant() ||2025instr->operands[0].regClass().type() == RegType::sgpr) {2026std::map<PhysReg, copy_operation> copy_operations;2027copy_operations[instr->definitions[0].physReg()] = {2028instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};2029handle_operands(copy_operations, &ctx, program->chip_class, pi);2030} else {2031assert(instr->operands[0].regClass().type() == RegType::vgpr);2032assert(instr->definitions[0].regClass().type() == RegType::sgpr);2033assert(instr->operands[0].size() == instr->definitions[0].size());2034for (unsigned i = 0; i < instr->definitions[0].size(); i++) {2035bld.vop1(aco_opcode::v_readfirstlane_b32,2036bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),2037Operand(PhysReg{instr->operands[0].physReg() + i}, v1));2038}2039}2040break;2041}2042case aco_opcode::p_bpermute: {2043if (ctx.program->chip_class <= GFX7)2044emit_gfx6_bpermute(program, instr, bld);2045else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64)2046emit_gfx10_wave64_bpermute(program, instr, bld);2047else2048unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");2049break;2050}2051case aco_opcode::p_constaddr: {2052unsigned id = instr->definitions[0].tempId();2053PhysReg reg = instr->definitions[0].physReg();2054bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));2055bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),2056Operand(reg, s1), Operand::c32(id));2057bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),2058Operand(reg.advance(4), s1), Operand::zero(), Operand(scc, s1));2059break;2060}2061case aco_opcode::p_extract: {2062assert(instr->operands[1].isConstant());2063assert(instr->operands[2].isConstant());2064assert(instr->operands[3].isConstant());2065if (instr->definitions[0].regClass() == s1)2066assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);2067Definition dst = instr->definitions[0];2068Operand op = instr->operands[0];2069unsigned bits = instr->operands[2].constantValue();2070unsigned index = instr->operands[1].constantValue();2071unsigned offset = index * bits;2072bool signext = !instr->operands[3].constantEquals(0);20732074if (dst.regClass() == s1) {2075if (offset == (32 - bits)) {2076bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,2077bld.def(s1, scc), op, Operand::c32(offset));2078} else if (offset == 0 && signext && (bits == 8 || bits == 16)) {2079bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,2080dst, op);2081} else {2082bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,2083bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));2084}2085} else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) {2086assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);2087if (offset == (32 - bits) && op.regClass() != s1) {2088bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,2089Operand::c32(offset), op);2090} else {2091bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,2092Operand::c32(offset), Operand::c32(bits));2093}2094} else if (dst.regClass() == v2b) {2095aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(2096aco_opcode::v_mov_b32,2097(Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)};2098sdwa->operands[0] = Operand(op.physReg().advance(-op.physReg().byte()),2099RegClass::get(op.regClass().type(), 4));2100sdwa->definitions[0] = dst;2101sdwa->sel[0] = sdwa_ubyte0 + op.physReg().byte() + index;2102if (signext)2103sdwa->sel[0] |= sdwa_sext;2104sdwa->dst_sel = sdwa_uword;2105bld.insert(std::move(sdwa));2106}2107break;2108}2109case aco_opcode::p_insert: {2110assert(instr->operands[1].isConstant());2111assert(instr->operands[2].isConstant());2112if (instr->definitions[0].regClass() == s1)2113assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);2114Definition dst = instr->definitions[0];2115Operand op = instr->operands[0];2116unsigned bits = instr->operands[2].constantValue();2117unsigned index = instr->operands[1].constantValue();2118unsigned offset = index * bits;21192120if (dst.regClass() == s1) {2121if (offset == (32 - bits)) {2122bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,2123Operand::c32(offset));2124} else if (offset == 0) {2125bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,2126Operand::c32(bits << 16));2127} else {2128bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,2129Operand::c32(bits << 16));2130bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),2131Operand(dst.physReg(), s1), Operand::c32(offset));2132}2133} else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) {2134if (offset == (dst.bytes() * 8u - bits)) {2135bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);2136} else if (offset == 0) {2137bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));2138} else if (program->chip_class >= GFX9 ||2139(op.regClass() != s1 && program->chip_class >= GFX8)) {2140aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(2141aco_opcode::v_mov_b32,2142(Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)};2143sdwa->operands[0] = op;2144sdwa->definitions[0] = dst;2145sdwa->sel[0] = sdwa_udword;2146sdwa->dst_sel = (bits == 8 ? sdwa_ubyte0 : sdwa_uword0) + (offset / bits);2147bld.insert(std::move(sdwa));2148} else {2149bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));2150bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),2151Operand(dst.physReg(), v1));2152}2153} else {2154assert(dst.regClass() == v2b);2155aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(2156aco_opcode::v_mov_b32,2157(Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)};2158sdwa->operands[0] = op;2159sdwa->definitions[0] =2160Definition(dst.physReg().advance(-dst.physReg().byte()), v1);2161sdwa->sel[0] = sdwa_uword;2162sdwa->dst_sel = sdwa_ubyte0 + dst.physReg().byte() + index;2163sdwa->dst_preserve = 1;2164bld.insert(std::move(sdwa));2165}2166break;2167}2168default: break;2169}2170} else if (instr->isBranch()) {2171Pseudo_branch_instruction* branch = &instr->branch();2172uint32_t target = branch->target[0];21732174/* check if all blocks from current to target are empty */2175/* In case there are <= 4 SALU or <= 2 VALU instructions, remove the branch */2176bool can_remove = block->index < target;2177unsigned num_scalar = 0;2178unsigned num_vector = 0;2179for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {2180/* uniform branches must not be ignored if they2181* are about to jump over actual instructions */2182if (!program->blocks[i].instructions.empty() &&2183(branch->opcode != aco_opcode::p_cbranch_z ||2184branch->operands[0].physReg() != exec)) {2185can_remove = false;2186break;2187}21882189for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {2190if (inst->isSOPP()) {2191can_remove = false;2192} else if (inst->isSALU()) {2193num_scalar++;2194} else if (inst->isVALU()) {2195num_vector++;2196} else {2197can_remove = false;2198}21992200if (num_scalar + num_vector * 2 > 4)2201can_remove = false;22022203if (!can_remove)2204break;2205}2206}22072208if (can_remove)2209continue;22102211switch (instr->opcode) {2212case aco_opcode::p_branch:2213assert(block->linear_succs[0] == target);2214bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);2215break;2216case aco_opcode::p_cbranch_nz:2217assert(block->linear_succs[1] == target);2218if (branch->operands[0].physReg() == exec)2219bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);2220else if (branch->operands[0].physReg() == vcc)2221bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);2222else {2223assert(branch->operands[0].physReg() == scc);2224bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);2225}2226break;2227case aco_opcode::p_cbranch_z:2228assert(block->linear_succs[1] == target);2229if (branch->operands[0].physReg() == exec)2230bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);2231else if (branch->operands[0].physReg() == vcc)2232bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);2233else {2234assert(branch->operands[0].physReg() == scc);2235bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);2236}2237break;2238default: unreachable("Unknown Pseudo branch instruction!");2239}22402241} else if (instr->isReduction()) {2242Pseudo_reduction_instruction& reduce = instr->reduction();2243emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,2244reduce.operands[1].physReg(), // tmp2245reduce.definitions[1].physReg(), // stmp2246reduce.operands[2].physReg(), // vtmp2247reduce.definitions[2].physReg(), // sitmp2248reduce.operands[0], reduce.definitions[0]);2249} else if (instr->isBarrier()) {2250Pseudo_barrier_instruction& barrier = instr->barrier();22512252/* Anything larger than a workgroup isn't possible. Anything2253* smaller requires no instructions and this pseudo instruction2254* would only be included to control optimizations. */2255bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&2256program->workgroup_size > program->wave_size;22572258bld.insert(std::move(instr));2259if (emit_s_barrier)2260bld.sopp(aco_opcode::s_barrier);2261} else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {2262float_mode new_mode = block->fp_mode;2263new_mode.round16_64 = fp_round_ne;2264bool set_round = new_mode.round != block->fp_mode.round;22652266emit_set_mode(bld, new_mode, set_round, false);22672268instr->opcode = aco_opcode::v_cvt_f16_f32;2269ctx.instructions.emplace_back(std::move(instr));22702271emit_set_mode(bld, block->fp_mode, set_round, false);2272} else {2273ctx.instructions.emplace_back(std::move(instr));2274}2275}2276block->instructions.swap(ctx.instructions);2277}2278}22792280} // namespace aco228122822283