Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/compiler/tests/test_optimizer.cpp
7097 views
1
/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
24
#include "helpers.h"
25
26
using namespace aco;
27
28
BEGIN_TEST(optimize.neg)
29
for (unsigned i = GFX9; i <= GFX10; i++) {
30
//>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
31
if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
32
continue;
33
34
//! v1: %res0 = v_mul_f32 %a, -%b
35
//! p_unit_test 0, %res0
36
Temp neg_b = fneg(inputs[1]);
37
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
38
39
//~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
40
//~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
41
//~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
42
//! p_unit_test 1, %res1
43
Temp neg_a = fneg(inputs[0]);
44
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
45
46
//! v1: %res2 = v_mul_f32 %a, %b
47
//! p_unit_test 2, %res2
48
Temp neg_neg_a = fneg(neg_a);
49
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
50
51
//! v1: %res3 = v_mul_f32 |%a|, %b
52
//! p_unit_test 3, %res3
53
Temp abs_neg_a = fabs(neg_a);
54
writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
55
56
//! v1: %res4 = v_mul_f32 -|%a|, %b
57
//! p_unit_test 4, %res4
58
Temp abs_a = fabs(inputs[0]);
59
Temp neg_abs_a = fneg(abs_a);
60
writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
61
62
//! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
63
//! p_unit_test 5, %res5
64
writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
65
66
//! v1: %res6 = v_subrev_f32 %a, %b
67
//! p_unit_test 6, %res6
68
writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
69
70
//! v1: %res7 = v_sub_f32 %b, %a
71
//! p_unit_test 7, %res7
72
writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
73
74
//! v1: %res8 = v_mul_f32 %a, -%c
75
//! p_unit_test 8, %res8
76
Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
77
writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
78
79
// //! v1: %res9 = v_mul_f32 |%neg_a|, %b
80
// //! p_unit_test 9, %res9
81
Temp abs_neg_abs_a = fabs(neg_abs_a);
82
writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
83
84
finish_opt_test();
85
}
86
END_TEST
87
88
BEGIN_TEST(optimize.output_modifiers)
89
//>> v1: %a, v1: %b = p_startpgm
90
if (!setup_cs("v1 v1", GFX9))
91
return;
92
93
program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
94
95
/* 32-bit modifiers */
96
97
//! v1: %res0 = v_add_f32 %a, %b *0.5
98
//! p_unit_test 0, %res0
99
Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
100
writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
101
102
//! v1: %res1 = v_add_f32 %a, %b *2
103
//! p_unit_test 1, %res1
104
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
105
writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
106
107
//! v1: %res2 = v_add_f32 %a, %b *4
108
//! p_unit_test 2, %res2
109
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
110
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
111
112
//! v1: %res3 = v_add_f32 %a, %b clamp
113
//! p_unit_test 3, %res3
114
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
115
writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
116
Operand::c32(0x3f800000u), tmp));
117
118
//! v1: %res4 = v_add_f32 %a, %b *2 clamp
119
//! p_unit_test 4, %res4
120
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
121
tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
122
writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
123
Operand::c32(0x3f800000u), tmp));
124
125
/* 16-bit modifiers */
126
127
//! v2b: %res5 = v_add_f16 %a, %b *0.5
128
//! p_unit_test 5, %res5
129
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
130
writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
131
132
//! v2b: %res6 = v_add_f16 %a, %b *2
133
//! p_unit_test 6, %res6
134
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
135
writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
136
137
//! v2b: %res7 = v_add_f16 %a, %b *4
138
//! p_unit_test 7, %res7
139
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
140
writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
141
142
//! v2b: %res8 = v_add_f16 %a, %b clamp
143
//! p_unit_test 8, %res8
144
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
145
writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
146
Operand::c16(0x3c00u), tmp));
147
148
//! v2b: %res9 = v_add_f16 %a, %b *2 clamp
149
//! p_unit_test 9, %res9
150
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
151
tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
152
writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
153
Operand::c16(0x3c00u), tmp));
154
155
/* clamping is done after omod */
156
157
//! v1: %res10_tmp = v_add_f32 %a, %b clamp
158
//! v1: %res10 = v_mul_f32 2.0, %res10_tmp
159
//! p_unit_test 10, %res10
160
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
161
tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
162
tmp);
163
writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
164
165
/* unsupported instructions */
166
167
//! v1: %res11_tmp = v_xor_b32 %a, %b
168
//! v1: %res11 = v_mul_f32 2.0, %res11_tmp
169
//! p_unit_test 11, %res11
170
tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
171
writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
172
173
/* several users */
174
175
//! v1: %res12_tmp = v_add_f32 %a, %b
176
//! p_unit_test %res12_tmp
177
//! v1: %res12 = v_mul_f32 2.0, %res12_tmp
178
//! p_unit_test 12, %res12
179
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
180
bld.pseudo(aco_opcode::p_unit_test, tmp);
181
writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
182
183
//! v1: %res13 = v_add_f32 %a, %b
184
//! p_unit_test 13, %res13
185
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
186
bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
187
writeout(13, tmp);
188
189
/* omod has no effect if denormals are enabled but clamp is fine */
190
191
//>> BB1
192
//! /* logical preds: / linear preds: / kind: uniform, */
193
program->next_fp_mode.denorm32 = fp_denorm_keep;
194
program->next_fp_mode.denorm16_64 = fp_denorm_flush;
195
bld.reset(program->create_and_insert_block());
196
197
//! v1: %res14_tmp = v_add_f32 %a, %b
198
//! v1: %res14 = v_mul_f32 2.0, %res13_tmp
199
//! p_unit_test 14, %res14
200
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
201
writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
202
203
//! v1: %res15 = v_add_f32 %a, %b clamp
204
//! p_unit_test 15, %res15
205
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
206
writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
207
Operand::c32(0x3f800000u), tmp));
208
209
//>> BB2
210
//! /* logical preds: / linear preds: / kind: uniform, */
211
program->next_fp_mode.denorm32 = fp_denorm_flush;
212
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
213
bld.reset(program->create_and_insert_block());
214
215
//! v2b: %res16_tmp = v_add_f16 %a, %b
216
//! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
217
//! p_unit_test 16, %res16
218
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
219
writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
220
221
//! v2b: %res17 = v_add_f16 %a, %b clamp
222
//! p_unit_test 17, %res17
223
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
224
writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
225
Operand::c16(0x3c00u), tmp));
226
227
/* omod flushes -0.0 to +0.0 */
228
229
//>> BB3
230
//! /* logical preds: / linear preds: / kind: uniform, */
231
program->next_fp_mode.denorm32 = fp_denorm_keep;
232
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
233
program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
234
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
235
bld.reset(program->create_and_insert_block());
236
237
//! v1: %res18_tmp = v_add_f32 %a, %b
238
//! v1: %res18 = v_mul_f32 2.0, %res18_tmp
239
//! p_unit_test 18, %res18
240
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
241
writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
242
//! v1: %res19 = v_add_f32 %a, %b clamp
243
//! p_unit_test 19, %res19
244
tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
245
writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
246
Operand::c32(0x3f800000u), tmp));
247
248
//>> BB4
249
//! /* logical preds: / linear preds: / kind: uniform, */
250
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
251
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
252
bld.reset(program->create_and_insert_block());
253
//! v2b: %res20_tmp = v_add_f16 %a, %b
254
//! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
255
//! p_unit_test 20, %res20
256
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
257
writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
258
//! v2b: %res21 = v_add_f16 %a, %b clamp
259
//! p_unit_test 21, %res21
260
tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
261
writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
262
Operand::c16(0x3c00u), tmp));
263
264
finish_opt_test();
265
END_TEST
266
267
/* Emits a VOP3-encoded v_subbrev_co_u32 with sources op0, op1 and op2.
 *
 * The instruction gets a v1 result definition and a second definition of
 * register class bld.lm that is hinted to VCC. The builder result is
 * returned converted to a Temp.
 */
Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)),
                       op0, op1, op2);
}
271
272
BEGIN_TEST(optimize.cndmask)
273
for (unsigned i = GFX9; i <= GFX10; i++) {
274
//>> v1: %a, s1: %b, s2: %c = p_startpgm
275
if (!setup_cs("v1 s1 s2", (chip_class)i))
276
continue;
277
278
Temp subbrev;
279
280
//! v1: %res0 = v_cndmask_b32 0, %a, %c
281
//! p_unit_test 0, %res0
282
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
283
writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
284
285
//! v1: %res1 = v_cndmask_b32 0, 42, %c
286
//! p_unit_test 1, %res1
287
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
288
writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
289
290
//~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
291
//~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
292
//~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
293
//! p_unit_test 2, %res2
294
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
295
writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
296
297
//! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
298
//! v1: %xor = v_xor_b32 %a, %subbrev1
299
//! v1: %res3 = v_cndmask_b32 0, %xor, %c
300
//! p_unit_test 3, %res3
301
subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
302
Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
303
writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
304
305
//! v1: %res4 = v_cndmask_b32 0, %a, %c
306
//! p_unit_test 4, %res4
307
Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
308
Operand::c32(1u), Operand(inputs[2]));
309
Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
310
writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
311
312
finish_opt_test();
313
}
314
END_TEST
315
316
BEGIN_TEST(optimize.add_lshl)
317
for (unsigned i = GFX8; i <= GFX10; i++) {
318
//>> s1: %a, v1: %b = p_startpgm
319
if (!setup_cs("s1 v1", (chip_class)i))
320
continue;
321
322
Temp shift;
323
324
//~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
325
//~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
326
//~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
327
//! p_unit_test 0, %res0
328
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
329
Operand::c32(3u));
330
writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
331
Operand::c32(4u)));
332
333
//~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
334
//~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
335
//~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
336
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
337
//~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
338
//~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
339
//~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
340
//! p_unit_test 1, %res1
341
shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
342
Operand::c32(3u));
343
Temp sadd =
344
bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
345
Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
346
writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
347
348
//~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
349
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
350
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
351
//! p_unit_test 2, %res2
352
Temp lshl =
353
bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
354
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
355
356
//~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
357
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
358
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
359
//! p_unit_test 3, %res3
360
Operand a_24bit = Operand(inputs[0]);
361
a_24bit.set24bit(true);
362
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
363
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
364
365
//! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
366
//~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
367
//~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
368
//! p_unit_test 4, %carry
369
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
370
Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
371
writeout(4, carry);
372
373
//~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
374
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
375
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
376
//! p_unit_test 5, %res5
377
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
378
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
379
380
//~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
381
//~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
382
//! p_unit_test 6, %res6
383
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
384
writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
385
386
//~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
387
//~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
388
//! p_unit_test 7, %res7
389
Operand a_16bit = Operand(inputs[0]);
390
a_16bit.set16bit(true);
391
lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
392
writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
393
394
finish_opt_test();
395
}
396
END_TEST
397
398
/* Builds v_mad_u32_u16 a, b, c with a v1 result and returns it as a Temp.
 *
 * The 16-bit flag of operands a and b is set to is16bit (default true)
 * before the instruction is created; operand c is passed through unchanged.
 */
Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
{
   a.set16bit(is16bit);
   b.set16bit(is16bit);

   return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
}
405
406
BEGIN_TEST(optimize.mad_u32_u16)
407
for (unsigned i = GFX9; i <= GFX10; i++) {
408
//>> v1: %a, v1: %b, s1: %c = p_startpgm
409
if (!setup_cs("v1 v1 s1", (chip_class)i))
410
continue;
411
412
//! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
413
//! p_unit_test 0, %res0
414
writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand::zero()));
415
416
//! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
417
//! p_unit_test 1, %res1
418
writeout(1, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero()));
419
420
//! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
421
//! p_unit_test 2, %res2
422
writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand::c32(42u), Operand::zero()));
423
424
//! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
425
//! p_unit_test 3, %res3
426
writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand::zero()));
427
428
//! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
429
//! p_unit_test 4, %res4
430
writeout(4, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[2]), Operand::zero()));
431
432
//! v1: %res5 = v_mad_u32_u16 42, %a, 0
433
//! p_unit_test 5, %res5
434
writeout(5,
435
create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero(), false));
436
437
//~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b
438
//~gfx9! v1: %res6 = v_add_u32 %mul6, %b
439
//~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b
440
//~gfx10! v1: %res6 = v_add_u32 %mul6, %b
441
//! p_unit_test 6, %res6
442
Temp mul;
443
if (i >= GFX10) {
444
mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
445
} else {
446
mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
447
}
448
writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));
449
450
//~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b
451
//~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b
452
//~gfx10! v1: %res7 = v_add_u32 %mul7, %b
453
//! p_unit_test 7, %res7
454
if (i >= GFX10) {
455
mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
456
} else {
457
mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
458
}
459
writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));
460
461
finish_opt_test();
462
}
463
END_TEST
464
465
BEGIN_TEST(optimize.bcnt)
466
for (unsigned i = GFX8; i <= GFX10; i++) {
467
//>> v1: %a, s1: %b = p_startpgm
468
if (!setup_cs("v1 s1", (chip_class)i))
469
continue;
470
471
Temp bcnt;
472
473
//! v1: %res0 = v_bcnt_u32_b32 %a, %a
474
//! p_unit_test 0, %res0
475
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
476
writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
477
478
//! v1: %res1 = v_bcnt_u32_b32 %a, %b
479
//! p_unit_test 1, %res1
480
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
481
writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
482
483
//! v1: %res2 = v_bcnt_u32_b32 %a, 42
484
//! p_unit_test 2, %res2
485
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
486
writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
487
488
//! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
489
//~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
490
//~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
491
//! p_unit_test 3, %res3
492
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
493
writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
494
495
//! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
496
//~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
497
//~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
498
//! p_unit_test 4, %carry
499
bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
500
Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
501
writeout(4, carry);
502
503
finish_opt_test();
504
}
505
END_TEST
506
507
struct clamp_config {
508
const char *name;
509
aco_opcode min, max, med3;
510
Operand lb, ub;
511
};
512
513
/* Table of configurations iterated by optimize.clamp: float, unsigned and
 * signed opcodes in 32-bit and 16-bit variants, each with a constant
 * (lower bound, upper bound) pair.
 */
static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(), Operand::c32(0x40800000u)},
   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u), Operand::c16(0x4400)},
   /* -1.0, 0.0 */
   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), Operand::zero()},
   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xBC00), Operand::c16(0u)},
   /* 0, 3 */
   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u), Operand::c16(3u)},
   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u), Operand::c16(3u)},
   /* -5, 0 */
   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), Operand::zero()},
   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), Operand::c16(0u)},
};
539
540
BEGIN_TEST(optimize.clamp)
541
for (clamp_config cfg : clamp_configs) {
542
if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
543
continue;
544
545
//! cfg: @match_func(min max med3 lb ub)
546
fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
547
fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
548
fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
549
aco_print_operand(&cfg.lb, output);
550
fprintf(output, " ");
551
aco_print_operand(&cfg.ub, output);
552
fprintf(output, "\n");
553
554
//>> v1: %a, v1: %b, v1: %c = p_startpgm
555
556
//! v1: %res0 = @med3 @ub, @lb, %a
557
//! p_unit_test 0, %res0
558
writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
559
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
560
561
//! v1: %res1 = @med3 @lb, @ub, %a
562
//! p_unit_test 1, %res1
563
writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
564
bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
565
566
/* min constant must be greater than max constant */
567
//! v1: %res2_tmp = @min @lb, %a
568
//! v1: %res2 = @max @ub, %res2_tmp
569
//! p_unit_test 2, %res2
570
writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
571
bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
572
573
//! v1: %res3_tmp = @max @ub, %a
574
//! v1: %res3 = @min @lb, %res3_tmp
575
//! p_unit_test 3, %res3
576
writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
577
bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
578
579
/* needs two constants */
580
581
//! v1: %res4_tmp = @max @lb, %a
582
//! v1: %res4 = @min %b, %res4_tmp
583
//! p_unit_test 4, %res4
584
writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
585
bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
586
587
//! v1: %res5_tmp = @max %b, %a
588
//! v1: %res5 = @min @ub, %res5_tmp
589
//! p_unit_test 5, %res5
590
writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
591
bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
592
593
//! v1: %res6_tmp = @max %c, %a
594
//! v1: %res6 = @min %b, %res6_tmp
595
//! p_unit_test 6, %res6
596
writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
597
bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
598
599
/* correct NaN behaviour with precise */
600
601
//! v1: %res7 = @med3 @ub, @lb, %a
602
//! p_unit_test 7, %res7
603
Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
604
max.def(0).setPrecise(true);
605
Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
606
max.def(0).setPrecise(true);
607
writeout(7, min);
608
609
//! v1: (precise)%res8_tmp = @min @ub, %a
610
//! v1: %res8 = @max @lb, %res8_tmp
611
//! p_unit_test 8, %res8
612
min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
613
min.def(0).setPrecise(true);
614
writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
615
616
finish_opt_test();
617
}
618
END_TEST
619
620
BEGIN_TEST(optimize.const_comparison_ordering)
621
//>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
622
if (!setup_cs("v1 v1 v2 v1", GFX9))
623
return;
624
625
/* optimize to unordered comparison */
626
//! s2: %res0 = v_cmp_nge_f32 4.0, %a
627
//! p_unit_test 0, %res0
628
writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
629
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
630
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
631
Operand::c32(0x40800000u), inputs[0])));
632
633
//! s2: %res1 = v_cmp_nge_f32 4.0, %a
634
//! p_unit_test 1, %res1
635
writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
636
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
637
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
638
Operand::c32(0x40800000u), inputs[0])));
639
640
//! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
641
//! p_unit_test 2, %res2
642
writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
643
bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
644
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
645
bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
646
647
/* optimize to ordered comparison */
648
//! s2: %res3 = v_cmp_lt_f32 4.0, %a
649
//! p_unit_test 3, %res3
650
writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
651
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
652
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
653
Operand::c32(0x40800000u), inputs[0])));
654
655
//! s2: %res4 = v_cmp_lt_f32 4.0, %a
656
//! p_unit_test 4, %res4
657
writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
658
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
659
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
660
Operand::c32(0x40800000u), inputs[0])));
661
662
//! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
663
//! p_unit_test 5, %res5
664
writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
665
bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
666
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
667
bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
668
669
/* similar but unoptimizable expressions */
670
//! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
671
//! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
672
//! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
673
//! p_unit_test 6, %res6
674
Temp src1 =
675
bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
676
Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
677
writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
678
679
//! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
680
//! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
681
//! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
682
//! p_unit_test 7, %res7
683
src1 =
684
bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
685
src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
686
writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
687
688
//! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
689
//! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
690
//! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
691
//! p_unit_test 8, %res8
692
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
693
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
694
writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
695
696
//! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
697
//! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
698
//! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
699
//! p_unit_test 9, %res9
700
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
701
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
702
writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
703
704
/* bit sizes */
705
//! s2: %res10 = v_cmp_nge_f16 4.0, %b
706
//! p_unit_test 10, %res10
707
Temp input1_16 =
708
bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
709
writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
710
bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
711
bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
712
input1_16)));
713
714
//! s2: %res11 = v_cmp_nge_f64 4.0, %c
715
//! p_unit_test 11, %res11
716
writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
717
bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
718
bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
719
Operand::c64(0x4010000000000000u), inputs[2])));
720
721
/* NaN */
722
uint16_t nan16 = 0x7e00;
723
uint32_t nan32 = 0x7fc00000;
724
uint64_t nan64 = 0xffffffffffffffffllu;
725
726
//! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
727
//! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
728
//! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
729
//! p_unit_test 12, %res12
730
src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
731
src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
732
writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
733
734
//! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
735
//! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
736
//! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
737
//! p_unit_test 13, %res13
738
src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
739
src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
740
writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
741
742
//! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
743
//! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
744
//! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
745
//! p_unit_test 14, %res14
746
src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
747
src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
748
writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
749
750
finish_opt_test();
751
END_TEST
752
753
BEGIN_TEST(optimize.add3)
754
//>> v1: %a, v1: %b, v1: %c = p_startpgm
755
if (!setup_cs("v1 v1 v1", GFX9))
756
return;
757
758
//! v1: %res0 = v_add3_u32 %a, %b, %c
759
//! p_unit_test 0, %res0
760
Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
761
writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
762
763
//! v1: %tmp1 = v_add_u32 %b, %c clamp
764
//! v1: %res1 = v_add_u32 %a, %tmp1
765
//! p_unit_test 1, %res1
766
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
767
tmp.instr->vop3().clamp = true;
768
writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
769
770
//! v1: %tmp2 = v_add_u32 %b, %c
771
//! v1: %res2 = v_add_u32 %a, %tmp2 clamp
772
//! p_unit_test 2, %res2
773
tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
774
tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
775
tmp.instr->vop3().clamp = true;
776
writeout(2, tmp);
777
778
finish_opt_test();
779
END_TEST
780
781
BEGIN_TEST(optimize.minmax)
782
for (unsigned i = GFX9; i <= GFX10; i++) {
783
//>> v1: %a = p_startpgm
784
if (!setup_cs("v1", (chip_class)i))
785
continue;
786
787
//! v1: %res0 = v_max3_f32 0, -0, %a
788
//! p_unit_test 0, %res0
789
Temp xor0 = fneg(inputs[0]);
790
Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
791
Temp xor1 = fneg(min);
792
writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
793
794
//! v1: %res1 = v_max3_f32 0, -0, -%a
795
//! p_unit_test 1, %res1
796
min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
797
xor1 = fneg(min);
798
writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
799
800
finish_opt_test();
801
}
802
END_TEST
803
804
BEGIN_TEST(optimize.mad_32_24)
805
for (unsigned i = GFX8; i <= GFX9; i++) {
806
//>> v1: %a, v1: %b, v1: %c = p_startpgm
807
if (!setup_cs("v1 v1 v1", (chip_class)i))
808
continue;
809
810
//! v1: %res0 = v_mad_u32_u24 %b, %c, %a
811
//! p_unit_test 0, %res0
812
Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
813
writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
814
815
//! v1: %res1_tmp = v_mul_u32_u24 %b, %c
816
//! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
817
//! p_unit_test 1, %res1
818
mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
819
writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
820
821
finish_opt_test();
822
}
823
END_TEST
824
825
BEGIN_TEST(optimize.add_lshlrev)
826
for (unsigned i = GFX8; i <= GFX10; i++) {
827
//>> v1: %a, v1: %b, s1: %c = p_startpgm
828
if (!setup_cs("v1 v1 s1", (chip_class)i))
829
continue;
830
831
Temp lshl;
832
833
//~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
834
//~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
835
//~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
836
//! p_unit_test 0, %res0
837
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
838
writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
839
840
//~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
841
//~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
842
//~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
843
//! p_unit_test 1, %res1
844
Operand a_24bit = Operand(inputs[0]);
845
a_24bit.set24bit(true);
846
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
847
writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
848
849
//~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
850
//~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
851
//~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
852
//! p_unit_test 2, %res2
853
Operand b_24bit = Operand(inputs[1]);
854
b_24bit.set24bit(true);
855
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
856
writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
857
858
//~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
859
//~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
860
//! p_unit_test 3, %res3
861
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
862
writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
863
864
//~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
865
//~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
866
//! p_unit_test 4, %res4
867
Operand a_16bit = Operand(inputs[0]);
868
a_16bit.set16bit(true);
869
lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
870
writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
871
872
//~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c
873
//~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5
874
//~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
875
//! p_unit_test 5, %res5
876
Operand c_24bit = Operand(inputs[2]);
877
c_24bit.set24bit(true);
878
lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
879
writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
880
881
finish_opt_test();
882
}
883
END_TEST
884
885
/* Operation applied to the value in optimize.denorm_propagation. The numeric
 * values are used as indices into denorm_op_names, so they must stay dense and
 * start at zero. */
enum denorm_op {
   denorm_mul1 = 0,    /* v_mul_f32 by 1.0 */
   denorm_fneg = 1,    /* fneg(val) */
   denorm_fabs = 2,    /* fabs(val) */
   denorm_fnegabs = 3, /* fneg(fabs(val)) */
};
891
892
/* Printable name for each denorm_op, indexed by the enumerator's value. The
 * names are used both in the test subvariant string and as keys of the Python
 * dictionaries in the optimize.denorm_propagation check lines. */
static const char *denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
898
899
struct denorm_config {
900
bool flush;
901
unsigned op;
902
aco_opcode src;
903
aco_opcode dest;
904
};
905
906
/* Returns the short name used for a src/dest opcode in the test subvariant
 * string and in the printed "src, dest, op" line: "cndmask", "min" or "rcp"
 * for the three handled opcodes, and "none" for every other value. */
static const char *srcdest_op_name(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return "cndmask";
   case aco_opcode::v_min_f32:
      return "min";
   case aco_opcode::v_rcp_f32:
      return "rcp";
   default:
      return "none";
   }
}
919
920
/* Emits the instruction selected by `op` with `val` as its input and returns
 * the result, using the file-scope `bld` and `inputs` (declared outside this
 * block):
 *   v_cndmask_b32: operands are zero, val and inputs[1]
 *   v_min_f32:     operands are zero and val
 *   v_rcp_f32:     single operand val
 * Any other opcode emits nothing and returns `val` unchanged. */
static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
   case aco_opcode::v_min_f32:
      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
   case aco_opcode::v_rcp_f32:
      return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
   default:
      return val;
   }
}
933
934
BEGIN_TEST(optimize.denorm_propagation)
935
for (unsigned i = GFX8; i <= GFX9; i++) {
936
std::vector<denorm_config> configs;
937
for (bool flush : {false, true}) {
938
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
939
configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});
940
941
for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
942
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
943
configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
944
}
945
946
for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
947
for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
948
configs.push_back({flush, op, src, aco_opcode::num_opcodes});
949
}
950
}
951
952
for (denorm_config cfg : configs) {
953
char subvariant[128];
954
sprintf(subvariant, "_%s_%s_%s_%s",
955
cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
956
denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
957
if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))
958
continue;
959
960
bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
961
cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
962
!cfg.flush;
963
964
fprintf(output, "src, dest, op: %s %s %s\n",
965
srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
966
fprintf(output, "can_propagate: %u\n", can_propagate);
967
//! src, dest, op: $src $dest $op
968
//! can_propagate: #can_propagate
969
//>> v1: %a, s2: %b = p_startpgm
970
971
//; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
972
//; 'min': 'v1: %{} = v_min_f32 0, {}',
973
//; 'rcp': 'v1: %{} = v_rcp_f32 {}'}
974
//; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
975
//; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
976
//; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
977
//; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
978
//; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}
979
980
//; name = 'a'
981
//; if src != 'none':
982
//; insert_pattern(patterns[src].format('src_res', '%'+name))
983
//; name = 'src_res'
984
985
//; if can_propagate:
986
//; name = inline_ops[op].format(name)
987
//; else:
988
//; insert_pattern(ops[op].format('op_res', name))
989
//; name = '%op_res'
990
991
//; if dest != 'none':
992
//; insert_pattern(patterns[dest].format('dest_res', name))
993
//; name = '%dest_res'
994
995
//; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
996
//! p_unit_test 0, %res
997
998
program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;
999
1000
Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
1001
switch (cfg.op) {
1002
case denorm_mul1:
1003
val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
1004
break;
1005
case denorm_fneg:
1006
val = fneg(val);
1007
break;
1008
case denorm_fabs:
1009
val = fabs(val);
1010
break;
1011
case denorm_fnegabs:
1012
val = fneg(fabs(val));
1013
break;
1014
}
1015
val = emit_denorm_srcdest(cfg.dest, val);
1016
writeout(
1017
0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
1018
1019
finish_opt_test();
1020
}
1021
}
1022
END_TEST
1023
1024