/* Path: blob/21.2-virgl/src/amd/compiler/tests/test_sdwa.cpp (web listing: 7097 views) */
/*1* Copyright © 2020 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/23#include "helpers.h"24#include <stdarg.h>2526using namespace aco;2728BEGIN_TEST(validate.sdwa.allow)29for (unsigned i = GFX8; i <= GFX10; i++) {30//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm31if (!setup_cs("v1 v1 s1 s1", (chip_class)i))32continue;33//>> Validation results:34//! 
Validation passed3536SDWA_instruction *sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();37sdwa->neg[0] = sdwa->neg[1] = sdwa->abs[0] = sdwa->abs[1] = true;3839sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();40sdwa->dst_preserve = true;41sdwa->dst_sel = sdwa_ubyte0;4243sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();44sdwa->sel[0] = sdwa_sbyte2;45sdwa->sel[1] = sdwa_uword1;4647finish_validator_test();48}49END_TEST5051BEGIN_TEST(validate.sdwa.support)52for (unsigned i = GFX7; i <= GFX10; i++) {53//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm54if (!setup_cs("v1 v1 s1 s1", (chip_class)i))55continue;56//>> Validation results:5758//~gfx7! SDWA is GFX8+ only: v1: %t0 = v_mul_f32 %a, %b59//~gfx7! Validation failed60//~gfx([89]|10)! Validation passed61bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);6263finish_validator_test();64}65END_TEST6667BEGIN_TEST(validate.sdwa.operands)68for (unsigned i = GFX8; i <= GFX10; i++) {69//>> v1: %vgpr0, v1: %vgp1, s1: %sgpr0, s1: %sgpr1 = p_startpgm70if (!setup_cs("v1 v1 s1 s1", (chip_class)i))71continue;72//>> Validation results:7374//~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %sgpr0, %vgpr175//~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %vgpr0, %sgpr176bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[2], inputs[1]);77bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[3]);7879//~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 4, %vgpr180//~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 %vgpr0, 481bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(4u), inputs[1]);82bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], Operand::c32(4u));8384//! Literal applied on wrong instruction format: v1: %_ = v_mul_f32 0x1234, %vgpr185//! 
Literal applied on wrong instruction format: v1: %_ = v_mul_f32 %vgpr0, 0x123486//! Wrong source position for Literal argument: v1: %_ = v_mul_f32 %vgpr0, 0x123487bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x1234u), inputs[1]);88bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], Operand::c32(0x1234u));8990//! Validation failed9192finish_validator_test();93}94END_TEST9596BEGIN_TEST(validate.sdwa.vopc)97for (unsigned i = GFX8; i <= GFX10; i++) {98//>> v1: %vgpr0, v1: %vgp1, s1: %sgpr0, s1: %sgpr1 = p_startpgm99if (!setup_cs("v1 v1 s1 s1", (chip_class)i))100continue;101//>> Validation results:102103bld.vopc_sdwa(aco_opcode::v_cmp_gt_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1]);104105//~gfx8! SDWA+VOPC definition must be fixed to vcc on GFX8: s2: %_ = v_cmp_lt_f32 %vgpr0, %vgpr1106bld.vopc_sdwa(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), inputs[0], inputs[1]);107108//~gfx(9|10)! SDWA VOPC clamp only supported on GFX8: s2: %_:vcc = v_cmp_eq_f32 %vgpr0, %vgpr1 clamp109bld.vopc_sdwa(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1]).instr->sdwa().clamp = true;110111//! Validation failed112113finish_validator_test();114}115END_TEST116117BEGIN_TEST(validate.sdwa.omod)118for (unsigned i = GFX8; i <= GFX10; i++) {119//>> v1: %vgpr0, v1: %vgp1, s1: %sgpr0, s1: %sgpr1 = p_startpgm120if (!setup_cs("v1 v1 s1 s1", (chip_class)i))121continue;122//>> Validation results:123124//~gfx8! SDWA omod only supported on GFX9+: v1: %_ = v_mul_f32 %vgpr0, %vgpr1 *2125//~gfx8! Validation failed126//~gfx(9|10)! Validation passed127bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa().omod = 1;128129finish_validator_test();130}131END_TEST132133BEGIN_TEST(validate.sdwa.vcc)134for (unsigned i = GFX8; i <= GFX10; i++) {135//>> v1: %vgpr0, v1: %vgpr1, s2: %sgpr0 = p_startpgm136if (!setup_cs("v1 v1 s2", (chip_class)i))137continue;138//>> Validation results:139140//! 
3rd operand must be fixed to vcc with SDWA: v1: %_ = v_cndmask_b32 %vgpr0, %vgpr1, %_141bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], inputs[2]);142bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], bld.vcc(inputs[2]));143144//! 2nd definition must be fixed to vcc with SDWA: v1: %_, s2: %_ = v_add_co_u32 %vgpr0, %vgpr1145bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm), inputs[0], inputs[1]);146bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm, vcc), inputs[0], inputs[1]);147148//! Validation failed149150finish_validator_test();151}152END_TEST153154BEGIN_TEST(optimize.sdwa.extract)155for (unsigned i = GFX7; i <= GFX10; i++) {156for (unsigned is_signed = 0; is_signed <= 1; is_signed++) {157//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm158if (!setup_cs("v1 v1 s1 s1", (chip_class)i, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned"))159continue;160161//; funcs['b'] = lambda bits: ('sext(%%b)[%s]' if variant.endswith('_signed') else '%%b[%s]') % bits162163//; def standard_test(index, offset, size):164//; res = 'v1: %%res%d = v_mul_f32 %%a, @b(%d:%d)\n' % (index, offset % 32, offset % 32 + size % 32 - 1)165//; res += 'p_unit_test %d, %%res%d' % (index, index)166//; return res167//; funcs['standard_test'] = lambda a: standard_test(*(int(v) for v in a.split(',')))168169aco_opcode ext = aco_opcode::p_extract;170aco_opcode ins = aco_opcode::p_insert;171172{173//~gfx[^7].*! @standard_test(0, 0, 8)174Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),175Operand::c32(is_signed));176writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte0_b));177178//~gfx[^7].*! @standard_test(1, 8, 8)179Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u),180Operand::c32(is_signed));181writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte1_b));182183//~gfx[^7].*! 
@standard_test(2, 16, 8)184Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), Operand::c32(8u),185Operand::c32(is_signed));186writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte2_b));187188//~gfx[^7].*! @standard_test(3, 24, 8)189Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), Operand::c32(8u),190Operand::c32(is_signed));191writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte3_b));192193//~gfx[^7].*! @standard_test(4, 0, 16)194Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u),195Operand::c32(is_signed));196writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word0_b));197198//~gfx[^7].*! @standard_test(5, 16, 16)199Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u),200Operand::c32(16u), Operand::c32(is_signed));201writeout(5, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word1_b));202203//~gfx[^7]_unsigned! @standard_test(6, 0, 8)204Temp bfi_byte0_b = bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u));205writeout(6, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte0_b));206207//~gfx[^7]_unsigned! @standard_test(7, 0, 16)208Temp bfi_word0_b =209bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u));210writeout(7, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_word0_b));211}212213//>> p_unit_test 63214writeout(63);215216{217//! v1: %tmp8 = p_insert %b, 1, 8218//! v1: %res8 = v_mul_f32 %a, %tmp8219//! p_unit_test 8, %res8220Temp bfi_byte1_b =221bld.pseudo(ins, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u));222writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte1_b));223224/* v_cvt_f32_ubyte[0-3] can be used instead of v_cvt_f32_u32+sdwa */225//~gfx7_signed! v1: %bfe_byte0_b = p_extract %b, 0, 8, 1226//~gfx7_signed! 
v1: %res9 = v_cvt_f32_u32 %bfe_byte0_b227//~gfx[^7]+_signed! v1: %res9 = v_cvt_f32_u32 @b(0:7)228//~gfx\d+_unsigned! v1: %res9 = v_cvt_f32_ubyte0 %b229//! p_unit_test 9, %res9230Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),231Operand::c32(is_signed));232writeout(9, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte0_b));233234//~gfx7_signed! v1: %bfe_byte1_b = p_extract %b, 1, 8, 1235//~gfx7_signed! v1: %res10 = v_cvt_f32_u32 %bfe_byte1_b236//~gfx[^7]+_signed! v1: %res10 = v_cvt_f32_u32 @b(8:15)237//~gfx\d+_unsigned! v1: %res10 = v_cvt_f32_ubyte1 %b238//! p_unit_test 10, %res10239Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u),240Operand::c32(is_signed));241writeout(10, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte1_b));242243//~gfx7_signed! v1: %bfe_byte2_b = p_extract %b, 2, 8, 1244//~gfx7_signed! v1: %res11 = v_cvt_f32_u32 %bfe_byte2_b245//~gfx[^7]+_signed! v1: %res11 = v_cvt_f32_u32 @b(16:23)246//~gfx\d+_unsigned! v1: %res11 = v_cvt_f32_ubyte2 %b247//! p_unit_test 11, %res11248Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), Operand::c32(8u),249Operand::c32(is_signed));250writeout(11, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte2_b));251252//~gfx7_signed! v1: %bfe_byte3_b = p_extract %b, 3, 8, 1253//~gfx7_signed! v1: %res12 = v_cvt_f32_u32 %bfe_byte3_b254//~gfx[^7]+_signed! v1: %res12 = v_cvt_f32_u32 @b(24:31)255//~gfx\d+_unsigned! v1: %res12 = v_cvt_f32_ubyte3 %b256//! p_unit_test 12, %res12257Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), Operand::c32(8u),258Operand::c32(is_signed));259writeout(12, bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), bfe_byte3_b));260261//! v1: %res13 = v_add_i16 %a, %b262//! 
p_unit_test 13, %res13263Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u),264Operand::c32(is_signed));265writeout(13, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word0_b));266267/* VOP3-only instructions can't use SDWA but they can use opsel instead */268//~gfx(9|10).*! v1: %res14 = v_add_i16 %a, hi(%b)269//~gfx(9|10).*! p_unit_test 14, %res14270Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u),271Operand::c32(16u), Operand::c32(is_signed));272writeout(14, bld.vop3(aco_opcode::v_add_i16, bld.def(v1), inputs[0], bfe_word1_b));273}274275finish_opt_test();276}277}278END_TEST279280BEGIN_TEST(optimize.sdwa.extract_modifiers)281for (unsigned i = GFX8; i <= GFX10; i++) {282//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm283if (!setup_cs("v1 v1 s1 s1", (chip_class)i))284continue;285286aco_opcode ext = aco_opcode::p_extract;287288//! v1: %res0 = v_mul_f32 %a, -%b[0:7]289//! p_unit_test 0, %res0290Temp byte0 = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),291Operand::zero());292Temp neg_byte0 = fneg(byte0);293writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_byte0));294295//~gfx8! v1: %neg = v_mul_f32 -1.0, %b296//~gfx8! v1: %res1 = v_mul_f32 %a, %neg[0:7]297//~gfx(9|10)! v1: %neg_byte0 = v_mul_f32 -1.0, %b dst_sel:ubyte0298//~gfx(9|10)! v1: %res1 = v_mul_f32 %a, %neg_byte0299//! p_unit_test 1, %res1300Temp neg = fneg(inputs[1]);301Temp byte0_neg =302bld.pseudo(ext, bld.def(v1), neg, Operand::zero(), Operand::c32(8u), Operand::zero());303writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_neg));304305//! v1: %res2 = v_mul_f32 %a, |%b[0:7]|306//! p_unit_test 2, %res2307Temp abs_byte0 = fabs(byte0);308writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], abs_byte0));309310//! v1: %abs = v_mul_f32 1.0, |%b|311//! v1: %res3 = v_mul_f32 %a, %abs[0:7]312//! 
p_unit_test 3, %res3313Temp abs = fabs(inputs[1]);314Temp byte0_abs =315bld.pseudo(ext, bld.def(v1), abs, Operand::zero(), Operand::c32(8u), Operand::zero());316writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_abs));317318//! v1: %res4 = v_mul_f32 %1, -|%2[0:7]|319//! p_unit_test 4, %res4320Temp neg_abs_byte0 = fneg(abs_byte0);321writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_abs_byte0));322323//~gfx8! v1: %neg_abs = v_mul_f32 -1.0, %abs324//~gfx8! v1: %res5 = v_mul_f32 %a, %neg_abs[0:7]325//~gfx(9|10)! v1: %neg_abs_byte0 = v_mul_f32 -1.0, %abs dst_sel:ubyte0326//~gfx(9|10)! v1: %res5 = v_mul_f32 %a, %neg_abs_byte0327//! p_unit_test 5, %res5328Temp neg_abs = fneg(abs);329Temp byte0_neg_abs =330bld.pseudo(ext, bld.def(v1), neg_abs, Operand::zero(), Operand::c32(8u), Operand::zero());331writeout(5, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_neg_abs));332333finish_opt_test();334}335END_TEST336337BEGIN_TEST(optimize.sdwa.extract.sgpr)338for (unsigned i = GFX8; i <= GFX10; i++) {339//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm340if (!setup_cs("v1 v1 s1 s1", (chip_class)i))341continue;342343aco_opcode ext = aco_opcode::p_extract;344345//~gfx8! v1: %byte0_b = p_extract %b, 0, 8, 0346//~gfx8! v1: %res1 = v_mul_f32 %c, %byte0_b347//~gfx(9|10)! v1: %res1 = v_mul_f32 %c, %b[0:7]348//! p_unit_test 1, %res1349Temp byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),350Operand::zero());351writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[2], byte0_b));352353//~gfx8! v1: %byte0_c = p_extract %c, 0, 8, 0354//~gfx8! v1: %res2 = v_mul_f32 %a, %byte0_c355//~gfx(9|10)! v1: %res2 = v_mul_f32 %a, %c[0:7]356//! p_unit_test 2, %res2357Temp byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u),358Operand::zero());359writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_c));360361//~gfx8! 
v1: %byte0_c_2 = p_extract %c, 0, 8, 0362//~gfx8! v1: %res3 = v_mul_f32 %c, %byte0_c_2363//~gfx(9|10)! v1: %res3 = v_mul_f32 %c, %c[0:7]364//! p_unit_test 3, %res3365byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u),366Operand::zero());367writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[2], byte0_c));368369//~gfx(8|9)! v1: %byte0_c_3 = p_extract %c, 0, 8, 0370//~gfx(8|9)! v1: %res4 = v_mul_f32 %d, %byte0_c_3371//~gfx10! v1: %res4 = v_mul_f32 %d, %c[0:7]372//! p_unit_test 4, %res4373byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u),374Operand::zero());375writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[3], byte0_c));376377finish_opt_test();378}379END_TEST380381BEGIN_TEST(optimize.sdwa.from_vop3)382for (unsigned i = GFX8; i <= GFX10; i++) {383//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm384if (!setup_cs("v1 v1 s1 s1", (chip_class)i))385continue;386387//! v1: %res0 = v_mul_f32 -|%a|, %b[0:7]388//! p_unit_test 0, %res0389Temp byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(),390Operand::c32(8u), Operand::zero());391VOP3_instruction *mul = &bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_b).instr->vop3();392mul->neg[0] = true;393mul->abs[0] = true;394writeout(0, mul->definitions[0].getTemp());395396//~gfx8! v1: %byte0_b_0 = p_extract %b, 0, 8, 0397//~gfx8! v1: %res1 = v_mul_f32 %a, %byte0_b_0 *4398//~gfx(9|10)! v1: %res1 = v_mul_f32 %a, %b[0:7] *4399//! p_unit_test 1, %res1400byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(),401Operand::c32(8u), Operand::zero());402mul = &bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_b).instr->vop3();403mul->omod = 2;404writeout(1, mul->definitions[0].getTemp());405406//~gfx8! v1: %byte0_b_1 = p_extract %b, 0, 8, 0407//~gfx8! v1: %res2 = v_mul_f32 %byte0_b_1, %c408//~gfx(9|10)! v1: %res2 = v_mul_f32 %b[0:7], %c409//! 
p_unit_test 2, %res2410byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(),411Operand::c32(8u), Operand::zero());412writeout(2, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), byte0_b, inputs[2]));413414if (i >= GFX10) {415//~gfx10! v1: %byte0_b_2 = p_extract %b, 0, 8, 0416//~gfx10! v1: %res3 = v_mul_f32 %byte0_b_2, 0x1234417//~gfx10! p_unit_test 3, %res3418byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(),419Operand::c32(8u), Operand::zero());420writeout(3,421bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), byte0_b, Operand::c32(0x1234u)));422}423424finish_opt_test();425}426END_TEST427428BEGIN_TEST(optimize.sdwa.insert)429for (unsigned i = GFX7; i <= GFX10; i++) {430//>> v1: %a, v1: %b = p_startpgm431if (!setup_cs("v1 v1", (chip_class)i))432continue;433434aco_opcode ext = aco_opcode::p_extract;435aco_opcode ins = aco_opcode::p_insert;436437//~gfx[^7]! v1: %res0 = v_mul_f32 %a, %b dst_sel:ubyte0438//~gfx[^7]! p_unit_test 0, %res0439Temp val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);440writeout(0, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u)));441442//~gfx[^7]! v1: %res1 = v_mul_f32 %a, %b dst_sel:ubyte1443//~gfx[^7]! p_unit_test 1, %res1444val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);445writeout(1, bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(8u)));446447//~gfx[^7]! v1: %res2 = v_mul_f32 %a, %b dst_sel:ubyte2448//~gfx[^7]! p_unit_test 2, %res2449val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);450writeout(2, bld.pseudo(ins, bld.def(v1), val, Operand::c32(2u), Operand::c32(8u)));451452//~gfx[^7]! v1: %res3 = v_mul_f32 %a, %b dst_sel:ubyte3453//~gfx[^7]! p_unit_test 3, %res3454val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);455writeout(3, bld.pseudo(ins, bld.def(v1), val, Operand::c32(3u), Operand::c32(8u)));456457//~gfx[^7]! 
v1: %res4 = v_mul_f32 %a, %b dst_sel:uword0458//~gfx[^7]! p_unit_test 4, %res4459val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);460writeout(4, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(16u)));461462//~gfx[^7]! v1: %res5 = v_mul_f32 %a, %b dst_sel:uword1463//~gfx[^7]! p_unit_test 5, %res5464val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);465writeout(5, bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(16u)));466467//~gfx[^7]! v1: %res6 = v_mul_f32 %a, %b dst_sel:ubyte0468//~gfx[^7]! p_unit_test 6, %res6469val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);470writeout(4716, bld.pseudo(ext, bld.def(v1), val, Operand::zero(), Operand::c32(8u), Operand::zero()));472473//~gfx[^7]! v1: %res7 = v_mul_f32 %a, %b dst_sel:uword0474//~gfx[^7]! p_unit_test 7, %res7475val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);476writeout(4777, bld.pseudo(ext, bld.def(v1), val, Operand::zero(), Operand::c32(16u), Operand::zero()));478479//~gfx[^7]! v1: %tmp8 = v_mul_f32 %a, %b480//~gfx[^7]! v1: %res8 = p_extract %tmp8, 2, 8, 0481//~gfx[^7]! p_unit_test 8, %res8482val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);483writeout(4848, bld.pseudo(ext, bld.def(v1), val, Operand::c32(2u), Operand::c32(8u), Operand::zero()));485486//~gfx[^7]! v1: %tmp9 = v_mul_f32 %a, %b487//~gfx[^7]! v1: %res9 = p_extract %tmp9, 0, 8, 1488//~gfx[^7]! p_unit_test 9, %res9489val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);490writeout(4919, bld.pseudo(ext, bld.def(v1), val, Operand::zero(), Operand::c32(8u), Operand::c32(1u)));492493//>> p_unit_test 63494writeout(63);495496//! v1: %res10 = v_mul_f32 %a, %b497//! p_unit_test 10, %res10498val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);499bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(16u));500writeout(10, val);501502//! 
v1: %res11 = v_sub_i16 %a, %b503//! p_unit_test 11, %res11504val = bld.vop3(aco_opcode::v_sub_i16, bld.def(v1), inputs[0], inputs[1]);505writeout(11, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(16u)));506507//~gfx[78]! v1: %tmp12 = v_sub_i16 %a, %b508//~gfx[78]! v1: %res12 = p_insert %tmp11, 1, 16509//~gfx(9|10)! v1: %res12 = v_sub_i16 %a, %b opsel_hi510//! p_unit_test 12, %res12511val = bld.vop3(aco_opcode::v_sub_i16, bld.def(v1), inputs[0], inputs[1]);512writeout(12, bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(16u)));513514//! v1: %tmp13 = v_sub_i16 %a, %b515//! v1: %res13 = p_insert %tmp13, 0, 8516//! p_unit_test 13, %res13517val = bld.vop3(aco_opcode::v_sub_i16, bld.def(v1), inputs[0], inputs[1]);518writeout(13, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u)));519520finish_opt_test();521}522END_TEST523524BEGIN_TEST(optimize.sdwa.insert_modifiers)525for (unsigned i = GFX8; i <= GFX9; i++) {526//>> v1: %a = p_startpgm527if (!setup_cs("v1", (chip_class)i))528continue;529530aco_opcode ins = aco_opcode::p_insert;531532//~gfx8! v1: %tmp0 = v_rcp_f32 %a *2533//~gfx8! v1: %res0 = p_insert %tmp0, 0, 8534//~gfx9! v1: %res0 = v_rcp_f32 %a *2 dst_sel:ubyte0535//! p_unit_test 0, %res0536Temp val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);537val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u));538writeout(0, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u)));539540//! v1: %res1 = v_rcp_f32 %a clamp dst_sel:ubyte0541//! p_unit_test 1, %res1542val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);543val = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), val, Operand::zero(),544Operand::c32(0x3f800000u));545writeout(1, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u)));546547//! v1: %tmp2 = v_rcp_f32 %a dst_sel:ubyte0548//! v1: %res2 = v_mul_f32 %tmp2, 2.0549//! 
p_unit_test 2, %res2550val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);551val = bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u));552val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u));553writeout(2, val);554555//! v1: %tmp3 = v_rcp_f32 %a dst_sel:ubyte0556//! v1: %res3 = v_med3_f32 %tmp3, 0, 1.0557//! p_unit_test 3, %res3558val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);559val = bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u));560val = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), val, Operand::zero(),561Operand::c32(0x3f800000u));562writeout(3, val);563564//~gfx8! v1: %tmp4 = v_rcp_f32 %a *2 clamp565//~gfx8! v1: %res4 = p_insert %tmp4, 0, 8566//~gfx9! v1: %res4 = v_rcp_f32 %a *2 clamp dst_sel:ubyte0567//! p_unit_test 4, %res4568val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]);569val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u));570val = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), val, Operand::zero(),571Operand::c32(0x3f800000u));572writeout(4, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u)));573574finish_opt_test();575}576END_TEST577578579