Path: blob/21.2-virgl/src/amd/compiler/tests/test_optimizer.cpp
7097 views
/*1* Copyright © 2020 Valve Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*22*/23#include "helpers.h"2425using namespace aco;2627BEGIN_TEST(optimize.neg)28for (unsigned i = GFX9; i <= GFX10; i++) {29//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm30if (!setup_cs("v1 v1 s1 s1", (chip_class)i))31continue;3233//! v1: %res0 = v_mul_f32 %a, -%b34//! p_unit_test 0, %res035Temp neg_b = fneg(inputs[1]);36writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));3738//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a39//~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a40//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a41//! p_unit_test 1, %res142Temp neg_a = fneg(inputs[0]);43writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));4445//! v1: %res2 = v_mul_f32 %a, %b46//! p_unit_test 2, %res247Temp neg_neg_a = fneg(neg_a);48writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));4950//! v1: %res3 = v_mul_f32 |%a|, %b51//! p_unit_test 3, %res352Temp abs_neg_a = fabs(neg_a);53writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));5455//! v1: %res4 = v_mul_f32 -|%a|, %b56//! p_unit_test 4, %res457Temp abs_a = fabs(inputs[0]);58Temp neg_abs_a = fneg(abs_a);59writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));6061//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:162//! p_unit_test 5, %res563writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));6465//! v1: %res6 = v_subrev_f32 %a, %b66//! p_unit_test 6, %res667writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));6869//! v1: %res7 = v_sub_f32 %b, %a70//! p_unit_test 7, %res771writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));7273//! v1: %res8 = v_mul_f32 %a, -%c74//! p_unit_test 8, %res875Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));76writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));7778// //! v1: %res9 = v_mul_f32 |%neg_a|, %b79// //! p_unit_test 9, %res980Temp abs_neg_abs_a = fabs(neg_abs_a);81writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));8283finish_opt_test();84}85END_TEST8687BEGIN_TEST(optimize.output_modifiers)88//>> v1: %a, v1: %b = p_startpgm89if (!setup_cs("v1 v1", GFX9))90return;9192program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;9394/* 32-bit modifiers */9596//! v1: %res0 = v_add_f32 %a, %b *0.597//! p_unit_test 0, %res098Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);99writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));100101//! v1: %res1 = v_add_f32 %a, %b *2102//! p_unit_test 1, %res1103tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);104writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));105106//! v1: %res2 = v_add_f32 %a, %b *4107//! p_unit_test 2, %res2108tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);109writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));110111//! v1: %res3 = v_add_f32 %a, %b clamp112//! p_unit_test 3, %res3113tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);114writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),115Operand::c32(0x3f800000u), tmp));116117//! v1: %res4 = v_add_f32 %a, %b *2 clamp118//! p_unit_test 4, %res4119tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);120tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);121writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),122Operand::c32(0x3f800000u), tmp));123124/* 16-bit modifiers */125126//! v2b: %res5 = v_add_f16 %a, %b *0.5127//! p_unit_test 5, %res5128tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);129writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));130131//! v2b: %res6 = v_add_f16 %a, %b *2132//! p_unit_test 6, %res6133tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);134writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));135136//! v2b: %res7 = v_add_f16 %a, %b *4137//! p_unit_test 7, %res7138tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);139writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));140141//! v2b: %res8 = v_add_f16 %a, %b clamp142//! p_unit_test 8, %res8143tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);144writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),145Operand::c16(0x3c00u), tmp));146147//! v2b: %res9 = v_add_f16 %a, %b *2 clamp148//! p_unit_test 9, %res9149tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);150tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);151writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),152Operand::c16(0x3c00u), tmp));153154/* clamping is done after omod */155156//! v1: %res10_tmp = v_add_f32 %a, %b clamp157//! v1: %res10 = v_mul_f32 2.0, %res10_tmp158//! p_unit_test 10, %res10159tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);160tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),161tmp);162writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));163164/* unsupported instructions */165166//! v1: %res11_tmp = v_xor_b32 %a, %b167//! v1: %res11 = v_mul_f32 2.0, %res11_tmp168//! p_unit_test 11, %res11169tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);170writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));171172/* several users */173174//! v1: %res12_tmp = v_add_f32 %a, %b175//! p_unit_test %res12_tmp176//! v1: %res12 = v_mul_f32 2.0, %res12_tmp177//! p_unit_test 12, %res12178tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);179bld.pseudo(aco_opcode::p_unit_test, tmp);180writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));181182//! v1: %res13 = v_add_f32 %a, %b183//! p_unit_test 13, %res13184tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);185bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);186writeout(13, tmp);187188/* omod has no effect if denormals are enabled but clamp is fine */189190//>> BB1191//! /* logical preds: / linear preds: / kind: uniform, */192program->next_fp_mode.denorm32 = fp_denorm_keep;193program->next_fp_mode.denorm16_64 = fp_denorm_flush;194bld.reset(program->create_and_insert_block());195196//! v1: %res14_tmp = v_add_f32 %a, %b197//! v1: %res14 = v_mul_f32 2.0, %res13_tmp198//! p_unit_test 14, %res14199tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);200writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));201202//! v1: %res15 = v_add_f32 %a, %b clamp203//! p_unit_test 15, %res15204tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);205writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),206Operand::c32(0x3f800000u), tmp));207208//>> BB2209//! /* logical preds: / linear preds: / kind: uniform, */210program->next_fp_mode.denorm32 = fp_denorm_flush;211program->next_fp_mode.denorm16_64 = fp_denorm_keep;212bld.reset(program->create_and_insert_block());213214//! v2b: %res16_tmp = v_add_f16 %a, %b215//! v2b: %res16 = v_mul_f16 2.0, %res15_tmp216//! p_unit_test 16, %res16217tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);218writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));219220//! v2b: %res17 = v_add_f16 %a, %b clamp221//! p_unit_test 17, %res17222tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);223writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),224Operand::c16(0x3c00u), tmp));225226/* omod flushes -0.0 to +0.0 */227228//>> BB3229//! /* logical preds: / linear preds: / kind: uniform, */230program->next_fp_mode.denorm32 = fp_denorm_keep;231program->next_fp_mode.denorm16_64 = fp_denorm_keep;232program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;233program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;234bld.reset(program->create_and_insert_block());235236//! v1: %res18_tmp = v_add_f32 %a, %b237//! v1: %res18 = v_mul_f32 2.0, %res18_tmp238//! p_unit_test 18, %res18239tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);240writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));241//! v1: %res19 = v_add_f32 %a, %b clamp242//! p_unit_test 19, %res19243tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);244writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),245Operand::c32(0x3f800000u), tmp));246247//>> BB4248//! /* logical preds: / linear preds: / kind: uniform, */249program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;250program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;251bld.reset(program->create_and_insert_block());252//! v2b: %res20_tmp = v_add_f16 %a, %b253//! v2b: %res20 = v_mul_f16 2.0, %res20_tmp254//! p_unit_test 20, %res20255tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);256writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));257//! v2b: %res21 = v_add_f16 %a, %b clamp258//! p_unit_test 21, %res21259tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);260writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),261Operand::c16(0x3c00u), tmp));262263finish_opt_test();264END_TEST265266Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)267{268return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);269}270271BEGIN_TEST(optimize.cndmask)272for (unsigned i = GFX9; i <= GFX10; i++) {273//>> v1: %a, s1: %b, s2: %c = p_startpgm274if (!setup_cs("v1 s1 s2", (chip_class)i))275continue;276277Temp subbrev;278279//! v1: %res0 = v_cndmask_b32 0, %a, %c280//! p_unit_test 0, %res0281subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));282writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));283284//! v1: %res1 = v_cndmask_b32 0, 42, %c285//! p_unit_test 1, %res1286subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));287writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));288289//~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c290//~gfx9! v1: %res2 = v_and_b32 %b, %subbrev291//~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c292//! p_unit_test 2, %res2293subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));294writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));295296//! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c297//! v1: %xor = v_xor_b32 %a, %subbrev1298//! v1: %res3 = v_cndmask_b32 0, %xor, %c299//! p_unit_test 3, %res3300subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));301Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);302writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));303304//! v1: %res4 = v_cndmask_b32 0, %a, %c305//! p_unit_test 4, %res4306Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),307Operand::c32(1u), Operand(inputs[2]));308Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);309writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));310311finish_opt_test();312}313END_TEST314315BEGIN_TEST(optimize.add_lshl)316for (unsigned i = GFX8; i <= GFX10; i++) {317//>> s1: %a, v1: %b = p_startpgm318if (!setup_cs("s1 v1", (chip_class)i))319continue;320321Temp shift;322323//~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3324//~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4325//~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4326//! p_unit_test 0, %res0327shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),328Operand::c32(3u));329writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,330Operand::c32(4u)));331332//~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3333//~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4334//~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b335//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1336//~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4337//~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b338//~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add339//! p_unit_test 1, %res1340shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),341Operand::c32(3u));342Temp sadd =343bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));344Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));345writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));346347//~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3348//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b349//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b350//! p_unit_test 2, %res2351Temp lshl =352bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));353writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));354355//~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7356//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b357//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b358//! p_unit_test 3, %res3359Operand a_24bit = Operand(inputs[0]);360a_24bit.set24bit(true);361lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));362writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));363364//! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3365//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b366//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b367//! p_unit_test 4, %carry368lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));369Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();370writeout(4, carry);371372//~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a373//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b374//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b375//! p_unit_test 5, %res5376lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);377writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));378379//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b380//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b381//! p_unit_test 6, %res6382lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));383writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));384385//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b386//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b387//! p_unit_test 7, %res7388Operand a_16bit = Operand(inputs[0]);389a_16bit.set16bit(true);390lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));391writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));392393finish_opt_test();394}395END_TEST396397Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)398{399a.set16bit(is16bit);400b.set16bit(is16bit);401402return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);403}404405BEGIN_TEST(optimize.mad_u32_u16)406for (unsigned i = GFX9; i <= GFX10; i++) {407//>> v1: %a, v1: %b, s1: %c = p_startpgm408if (!setup_cs("v1 v1 s1", (chip_class)i))409continue;410411//! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b412//! p_unit_test 0, %res0413writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand::zero()));414415//! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a416//! p_unit_test 1, %res1417writeout(1, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero()));418419//! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a420//! p_unit_test 2, %res2421writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand::c32(42u), Operand::zero()));422423//! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a424//! p_unit_test 3, %res3425writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand::zero()));426427//! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0428//! p_unit_test 4, %res4429writeout(4, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[2]), Operand::zero()));430431//! v1: %res5 = v_mad_u32_u16 42, %a, 0432//! p_unit_test 5, %res5433writeout(5,434create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero(), false));435436//~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b437//~gfx9! v1: %res6 = v_add_u32 %mul6, %b438//~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b439//~gfx10! v1: %res6 = v_add_u32 %mul6, %b440//! p_unit_test 6, %res6441Temp mul;442if (i >= GFX10) {443mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);444} else {445mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);446}447writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));448449//~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b450//~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b451//~gfx10! v1: %res7 = v_add_u32 %mul7, %b452//! p_unit_test 7, %res7453if (i >= GFX10) {454mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);455} else {456mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);457}458writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));459460finish_opt_test();461}462END_TEST463464BEGIN_TEST(optimize.bcnt)465for (unsigned i = GFX8; i <= GFX10; i++) {466//>> v1: %a, s1: %b = p_startpgm467if (!setup_cs("v1 s1", (chip_class)i))468continue;469470Temp bcnt;471472//! v1: %res0 = v_bcnt_u32_b32 %a, %a473//! p_unit_test 0, %res0474bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());475writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));476477//! v1: %res1 = v_bcnt_u32_b32 %a, %b478//! p_unit_test 1, %res1479bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());480writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));481482//! v1: %res2 = v_bcnt_u32_b32 %a, 42483//! p_unit_test 2, %res2484bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());485writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));486487//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0488//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a489//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a490//! p_unit_test 3, %res3491bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());492writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));493494//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0495//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a496//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a497//! p_unit_test 4, %carry498bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());499Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();500writeout(4, carry);501502finish_opt_test();503}504END_TEST505506struct clamp_config {507const char *name;508aco_opcode min, max, med3;509Operand lb, ub;510};511512static const clamp_config clamp_configs[] = {513/* 0.0, 4.0 */514{"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,515Operand::zero(), Operand::c32(0x40800000u)},516{"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,517Operand::c16(0u), Operand::c16(0x4400)},518/* -1.0, 0.0 */519{"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,520Operand::c32(0xbf800000u), Operand::zero()},521{"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,522Operand::c16(0xBC00), Operand::c16(0u)},523/* 0, 3 */524{"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,525Operand::zero(), Operand::c32(3u)},526{"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,527Operand::c16(0u), Operand::c16(3u)},528{"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,529Operand::zero(), Operand::c32(3u)},530{"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,531Operand::c16(0u), Operand::c16(3u)},532/* -5, 0 */533{"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,534Operand::c32(0xfffffffbu), Operand::zero()},535{"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,536Operand::c16(0xfffbu), Operand::c16(0u)},537};538539BEGIN_TEST(optimize.clamp)540for (clamp_config cfg : clamp_configs) {541if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))542continue;543544//! cfg: @match_func(min max med3 lb ub)545fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);546fprintf(output, "%s ", instr_info.name[(int)cfg.max]);547fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);548aco_print_operand(&cfg.lb, output);549fprintf(output, " ");550aco_print_operand(&cfg.ub, output);551fprintf(output, "\n");552553//>> v1: %a, v1: %b, v1: %c = p_startpgm554555//! v1: %res0 = @med3 @ub, @lb, %a556//! p_unit_test 0, %res0557writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,558bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));559560//! v1: %res1 = @med3 @lb, @ub, %a561//! p_unit_test 1, %res1562writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,563bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));564565/* min constant must be greater than max constant */566//! v1: %res2_tmp = @min @lb, %a567//! v1: %res2 = @max @ub, %res2_tmp568//! p_unit_test 2, %res2569writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,570bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));571572//! v1: %res3_tmp = @max @ub, %a573//! v1: %res3 = @min @lb, %res3_tmp574//! p_unit_test 3, %res3575writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,576bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));577578/* needs two constants */579580//! v1: %res4_tmp = @max @lb, %a581//! v1: %res4 = @min %b, %res4_tmp582//! p_unit_test 4, %res4583writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],584bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));585586//! v1: %res5_tmp = @max %b, %a587//! v1: %res5 = @min @ub, %res5_tmp588//! p_unit_test 5, %res5589writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,590bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));591592//! v1: %res6_tmp = @max %c, %a593//! v1: %res6 = @min %b, %res6_tmp594//! p_unit_test 6, %res6595writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],596bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));597598/* correct NaN behaviour with precise */599600//! v1: %res7 = @med3 @ub, @lb, %a601//! p_unit_test 7, %res7602Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);603max.def(0).setPrecise(true);604Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);605max.def(0).setPrecise(true);606writeout(7, min);607608//! v1: (precise)%res8_tmp = @min @ub, %a609//! v1: %res8 = @max @lb, %res8_tmp610//! p_unit_test 8, %res8611min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);612min.def(0).setPrecise(true);613writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));614615finish_opt_test();616}617END_TEST618619BEGIN_TEST(optimize.const_comparison_ordering)620//>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm621if (!setup_cs("v1 v1 v2 v1", GFX9))622return;623624/* optimize to unordered comparison */625//! s2: %res0 = v_cmp_nge_f32 4.0, %a626//! p_unit_test 0, %res0627writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),628bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),629bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),630Operand::c32(0x40800000u), inputs[0])));631632//! s2: %res1 = v_cmp_nge_f32 4.0, %a633//! p_unit_test 1, %res1634writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),635bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),636bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),637Operand::c32(0x40800000u), inputs[0])));638639//! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a640//! p_unit_test 2, %res2641writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),642bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),643bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),644bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));645646/* optimize to ordered comparison */647//! s2: %res3 = v_cmp_lt_f32 4.0, %a648//! p_unit_test 3, %res3649writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),650bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),651bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),652Operand::c32(0x40800000u), inputs[0])));653654//! s2: %res4 = v_cmp_lt_f32 4.0, %a655//! p_unit_test 4, %res4656writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),657bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),658bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),659Operand::c32(0x40800000u), inputs[0])));660661//! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a662//! p_unit_test 5, %res5663writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),664bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),665bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),666bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));667668/* similar but unoptimizable expressions */669//! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a670//! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a671//! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0672//! p_unit_test 6, %res6673Temp src1 =674bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);675Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);676writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));677678//! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a679//! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a680//! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0681//! p_unit_test 7, %res7682src1 =683bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);684src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);685writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));686687//! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d688//! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a689//! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0690//! p_unit_test 8, %res8691src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);692src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);693writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));694695//! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a696//! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d697//! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0698//! p_unit_test 9, %res9699src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);700src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);701writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));702703/* bit sizes */704//! s2: %res10 = v_cmp_nge_f16 4.0, %b705//! p_unit_test 10, %res10706Temp input1_16 =707bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());708writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),709bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),710bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),711input1_16)));712713//! s2: %res11 = v_cmp_nge_f64 4.0, %c714//! p_unit_test 11, %res11715writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),716bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),717bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),718Operand::c64(0x4010000000000000u), inputs[2])));719720/* NaN */721uint16_t nan16 = 0x7e00;722uint32_t nan32 = 0x7fc00000;723uint64_t nan64 = 0xffffffffffffffffllu;724725//! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a726//! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a727//! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0728//! p_unit_test 12, %res12729src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);730src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);731writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));732733//! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a734//! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a735//! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0736//! p_unit_test 13, %res13737src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);738src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);739writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));740741//! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a742//! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a743//! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0744//! p_unit_test 14, %res14745src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);746src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);747writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));748749finish_opt_test();750END_TEST751752BEGIN_TEST(optimize.add3)753//>> v1: %a, v1: %b, v1: %c = p_startpgm754if (!setup_cs("v1 v1 v1", GFX9))755return;756757//! v1: %res0 = v_add3_u32 %a, %b, %c758//! p_unit_test 0, %res0759Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);760writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));761762//! v1: %tmp1 = v_add_u32 %b, %c clamp763//! v1: %res1 = v_add_u32 %a, %tmp1764//! p_unit_test 1, %res1765tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);766tmp.instr->vop3().clamp = true;767writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));768769//! v1: %tmp2 = v_add_u32 %b, %c770//! v1: %res2 = v_add_u32 %a, %tmp2 clamp771//! p_unit_test 2, %res2772tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);773tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);774tmp.instr->vop3().clamp = true;775writeout(2, tmp);776777finish_opt_test();778END_TEST779780BEGIN_TEST(optimize.minmax)781for (unsigned i = GFX9; i <= GFX10; i++) {782//>> v1: %a = p_startpgm783if (!setup_cs("v1", (chip_class)i))784continue;785786//! v1: %res0 = v_max3_f32 0, -0, %a787//! p_unit_test 0, %res0788Temp xor0 = fneg(inputs[0]);789Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);790Temp xor1 = fneg(min);791writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));792793//! v1: %res1 = v_max3_f32 0, -0, -%a794//! p_unit_test 1, %res1795min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));796xor1 = fneg(min);797writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));798799finish_opt_test();800}801END_TEST802803BEGIN_TEST(optimize.mad_32_24)804for (unsigned i = GFX8; i <= GFX9; i++) {805//>> v1: %a, v1: %b, v1: %c = p_startpgm806if (!setup_cs("v1 v1 v1", (chip_class)i))807continue;808809//! v1: %res0 = v_mad_u32_u24 %b, %c, %a810//! p_unit_test 0, %res0811Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);812writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));813814//! v1: %res1_tmp = v_mul_u32_u24 %b, %c815//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp816//! p_unit_test 1, %res1817mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);818writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());819820finish_opt_test();821}822END_TEST823824BEGIN_TEST(optimize.add_lshlrev)825for (unsigned i = GFX8; i <= GFX10; i++) {826//>> v1: %a, v1: %b, s1: %c = p_startpgm827if (!setup_cs("v1 v1 s1", (chip_class)i))828continue;829830Temp lshl;831832//~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a833//~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b834//~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b835//! p_unit_test 0, %res0836lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));837writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));838839//~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a840//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b841//~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b842//! p_unit_test 1, %res1843Operand a_24bit = Operand(inputs[0]);844a_24bit.set24bit(true);845lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);846writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));847848//~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b849//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b850//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b851//! p_unit_test 2, %res2852Operand b_24bit = Operand(inputs[1]);853b_24bit.set24bit(true);854lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);855writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));856857//~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b858//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b859//! p_unit_test 3, %res3860lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);861writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));862863//~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b864//~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b865//! p_unit_test 4, %res4866Operand a_16bit = Operand(inputs[0]);867a_16bit.set16bit(true);868lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);869writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));870871//~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c872//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5873//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c874//! p_unit_test 5, %res5875Operand c_24bit = Operand(inputs[2]);876c_24bit.set24bit(true);877lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);878writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));879880finish_opt_test();881}882END_TEST883884enum denorm_op {885denorm_mul1 = 0,886denorm_fneg = 1,887denorm_fabs = 2,888denorm_fnegabs = 3,889};890891static const char *denorm_op_names[] = {892"mul1",893"fneg",894"fabs",895"fnegabs",896};897898struct denorm_config {899bool flush;900unsigned op;901aco_opcode src;902aco_opcode dest;903};904905static const char *srcdest_op_name(aco_opcode op)906{907switch (op) {908case aco_opcode::v_cndmask_b32:909return "cndmask";910case aco_opcode::v_min_f32:911return "min";912case aco_opcode::v_rcp_f32:913return "rcp";914default:915return "none";916}917}918919static Temp emit_denorm_srcdest(aco_opcode op, Temp val)920{921switch (op) {922case aco_opcode::v_cndmask_b32:923return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);924case aco_opcode::v_min_f32:925return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);926case aco_opcode::v_rcp_f32:927return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);928default:929return val;930}931}932933BEGIN_TEST(optimize.denorm_propagation)934for (unsigned i = GFX8; i <= GFX9; i++) {935std::vector<denorm_config> configs;936for (bool flush : {false, true}) {937for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})938configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});939940for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {941for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})942configs.push_back({flush, op, aco_opcode::num_opcodes, dest});943}944945for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {946for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})947configs.push_back({flush, op, src, aco_opcode::num_opcodes});948}949}950951for (denorm_config cfg : configs) {952char subvariant[128];953sprintf(subvariant, "_%s_%s_%s_%s",954cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),955denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));956if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))957continue;958959bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||960cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||961!cfg.flush;962963fprintf(output, "src, dest, op: %s %s %s\n",964srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);965fprintf(output, "can_propagate: %u\n", can_propagate);966//! src, dest, op: $src $dest $op967//! can_propagate: #can_propagate968//>> v1: %a, s2: %b = p_startpgm969970//; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',971//; 'min': 'v1: %{} = v_min_f32 0, {}',972//; 'rcp': 'v1: %{} = v_rcp_f32 {}'}973//; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',974//; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',975//; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',976//; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}977//; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}978979//; name = 'a'980//; if src != 'none':981//; insert_pattern(patterns[src].format('src_res', '%'+name))982//; name = 'src_res'983984//; if can_propagate:985//; name = inline_ops[op].format(name)986//; else:987//; insert_pattern(ops[op].format('op_res', name))988//; name = '%op_res'989990//; if dest != 'none':991//; insert_pattern(patterns[dest].format('dest_res', name))992//; name = '%dest_res'993994//; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))995//! p_unit_test 0, %res996997program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;998999Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);1000switch (cfg.op) {1001case denorm_mul1:1002val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);1003break;1004case denorm_fneg:1005val = fneg(val);1006break;1007case denorm_fabs:1008val = fabs(val);1009break;1010case denorm_fnegabs:1011val = fneg(fabs(val));1012break;1013}1014val = emit_denorm_srcdest(cfg.dest, val);1015writeout(10160, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));10171018finish_opt_test();1019}1020}1021END_TEST102210231024