/* Captured from a web code viewer.
 * GitHub repository: PojavLauncherTeam/mesa
 * Path: blob/21.2-virgl/src/amd/compiler/aco_optimizer.cpp
 */
/*
2
* Copyright © 2018 Valve Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*
23
*/
24
25
#include "aco_ir.h"
26
27
#include "util/half_float.h"
28
#include "util/memstream.h"
29
30
#include <algorithm>
31
#include <array>
32
#include <vector>
33
34
namespace aco {
35
36
#ifndef NDEBUG
37
void
38
perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
39
{
40
if (cond) {
41
char* out;
42
size_t outsize;
43
struct u_memstream mem;
44
u_memstream_open(&mem, &out, &outsize);
45
FILE* const memf = u_memstream_get(&mem);
46
47
fprintf(memf, "%s: ", msg);
48
aco_print_instr(instr, memf);
49
u_memstream_close(&mem);
50
51
aco_perfwarn(program, out);
52
free(out);
53
54
if (debug_flags & DEBUG_PERFWARN)
55
exit(1);
56
}
57
}
58
#endif
59
60
/**
 * The optimizer works in 4 phases:
 * (1) The first pass collects information for each SSA definition and
 *     propagates reg->reg operands of the same type, inline constants
 *     and neg/abs input modifiers.
 * (2) The second pass combines instructions like mad, omod and clamp, and
 *     propagates SGPRs on VALU instructions.
 *     This pass depends on information collected in the first pass.
 * (3) The third pass goes backwards and selects instructions,
 *     i.e. decides whether a mad instruction is profitable, and eliminates
 *     dead code.
 * (4) The fourth pass cleans up the sequence: literals get applied and dead
 *     instructions are removed from the sequence.
 */
73
74
struct mad_info {
75
aco_ptr<Instruction> add_instr;
76
uint32_t mul_temp_id;
77
uint16_t literal_idx;
78
bool check_literal;
79
80
mad_info(aco_ptr<Instruction> instr, uint32_t id)
81
: add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false)
82
{}
83
};
84
85
/* Bit flags stored in ssa_info::label. Values above bit 31 exist, so any mask
 * built from these flags has to be 64 bits wide. Bits 11 and 13 are unused. */
enum Label {
   label_vec = 1 << 0,
   label_constant_32bit = 1 << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
    * 32-bit operations but this shouldn't cause any issues because we don't
    * look through any conversions */
   label_abs = 1 << 2,
   label_neg = 1 << 3,
   label_mul = 1 << 4,
   label_temp = 1 << 5,
   label_literal = 1 << 6,
   label_mad = 1 << 7,
   label_omod2 = 1 << 8,
   label_omod4 = 1 << 9,
   label_omod5 = 1 << 10,
   label_clamp = 1 << 12,
   label_undefined = 1 << 14,
   label_vcc = 1 << 15,
   label_b2f = 1 << 16,
   label_add_sub = 1 << 17,
   label_bitwise = 1 << 18,
   label_minmax = 1 << 19,
   label_vopc = 1 << 20,
   label_uniform_bool = 1 << 21,
   label_constant_64bit = 1 << 22,
   label_uniform_bitwise = 1 << 23,
   label_scc_invert = 1 << 24,
   label_vcc_hint = 1 << 25,
   label_scc_needed = 1 << 26,
   label_b2i = 1 << 27,
   label_fcanonicalize = 1 << 28,
   label_constant_16bit = 1 << 29,
   label_usedef = 1 << 30,   /* generic label */
   label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */
   label_canonicalized = 1ull << 32,
   label_extract = 1ull << 33,
   label_insert = 1ull << 34,
};

/* Labels whose payload is ssa_info::instr and refers to the defining instruction. */
static constexpr uint64_t instr_usedef_labels =
   label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise |
   label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract;
/* Labels whose payload is ssa_info::instr but that are kept apart from the
 * usedef group. */
static constexpr uint64_t instr_mod_labels =
   label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert;

static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;
/* Labels whose payload is ssa_info::temp. */
static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
                                        label_uniform_bool | label_scc_invert | label_b2i |
                                        label_fcanonicalize;
/* Labels whose payload is ssa_info::val.
 * This mask must be 64 bits wide like the others: with a 32-bit type,
 * "label &= ~val_labels" zero-extends the complement and wipes every label
 * above bit 31 (e.g. label_canonicalized) as a side effect. */
static constexpr uint64_t val_labels =
   label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;

static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
140
141
struct ssa_info {
142
uint64_t label;
143
union {
144
uint32_t val;
145
Temp temp;
146
Instruction* instr;
147
};
148
149
ssa_info() : label(0) {}
150
151
void add_label(Label new_label)
152
{
153
/* Since all the instr_usedef_labels use instr for the same thing
154
* (indicating the defining instruction), there is usually no need to
155
* clear any other instr labels. */
156
if (new_label & instr_usedef_labels)
157
label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */
158
159
if (new_label & instr_mod_labels) {
160
label &= ~instr_labels;
161
label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
162
}
163
164
if (new_label & temp_labels) {
165
label &= ~temp_labels;
166
label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
167
}
168
169
uint32_t const_labels =
170
label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
171
if (new_label & const_labels) {
172
label &= ~val_labels | const_labels;
173
label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
174
} else if (new_label & val_labels) {
175
label &= ~val_labels;
176
label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
177
}
178
179
label |= new_label;
180
}
181
182
void set_vec(Instruction* vec)
183
{
184
add_label(label_vec);
185
instr = vec;
186
}
187
188
bool is_vec() { return label & label_vec; }
189
190
void set_constant(chip_class chip, uint64_t constant)
191
{
192
Operand op16 = Operand::c16(constant);
193
Operand op32 = Operand::get_const(chip, constant, 4);
194
add_label(label_literal);
195
val = constant;
196
197
/* check that no upper bits are lost in case of packed 16bit constants */
198
if (chip >= GFX8 && !op16.isLiteral() && op16.constantValue64() == constant)
199
add_label(label_constant_16bit);
200
201
if (!op32.isLiteral())
202
add_label(label_constant_32bit);
203
204
if (Operand::is_constant_representable(constant, 8))
205
add_label(label_constant_64bit);
206
207
if (label & label_constant_64bit) {
208
val = Operand::c64(constant).constantValue();
209
if (val != constant)
210
label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
211
}
212
}
213
214
bool is_constant(unsigned bits)
215
{
216
switch (bits) {
217
case 8: return label & label_literal;
218
case 16: return label & label_constant_16bit;
219
case 32: return label & label_constant_32bit;
220
case 64: return label & label_constant_64bit;
221
}
222
return false;
223
}
224
225
bool is_literal(unsigned bits)
226
{
227
bool is_lit = label & label_literal;
228
switch (bits) {
229
case 8: return false;
230
case 16: return is_lit && ~(label & label_constant_16bit);
231
case 32: return is_lit && ~(label & label_constant_32bit);
232
case 64: return false;
233
}
234
return false;
235
}
236
237
bool is_constant_or_literal(unsigned bits)
238
{
239
if (bits == 64)
240
return label & label_constant_64bit;
241
else
242
return label & label_literal;
243
}
244
245
void set_abs(Temp abs_temp)
246
{
247
add_label(label_abs);
248
temp = abs_temp;
249
}
250
251
bool is_abs() { return label & label_abs; }
252
253
void set_neg(Temp neg_temp)
254
{
255
add_label(label_neg);
256
temp = neg_temp;
257
}
258
259
bool is_neg() { return label & label_neg; }
260
261
void set_neg_abs(Temp neg_abs_temp)
262
{
263
add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
264
temp = neg_abs_temp;
265
}
266
267
void set_mul(Instruction* mul)
268
{
269
add_label(label_mul);
270
instr = mul;
271
}
272
273
bool is_mul() { return label & label_mul; }
274
275
void set_temp(Temp tmp)
276
{
277
add_label(label_temp);
278
temp = tmp;
279
}
280
281
bool is_temp() { return label & label_temp; }
282
283
void set_mad(Instruction* mad, uint32_t mad_info_idx)
284
{
285
add_label(label_mad);
286
mad->pass_flags = mad_info_idx;
287
instr = mad;
288
}
289
290
bool is_mad() { return label & label_mad; }
291
292
void set_omod2(Instruction* mul)
293
{
294
add_label(label_omod2);
295
instr = mul;
296
}
297
298
bool is_omod2() { return label & label_omod2; }
299
300
void set_omod4(Instruction* mul)
301
{
302
add_label(label_omod4);
303
instr = mul;
304
}
305
306
bool is_omod4() { return label & label_omod4; }
307
308
void set_omod5(Instruction* mul)
309
{
310
add_label(label_omod5);
311
instr = mul;
312
}
313
314
bool is_omod5() { return label & label_omod5; }
315
316
void set_clamp(Instruction* med3)
317
{
318
add_label(label_clamp);
319
instr = med3;
320
}
321
322
bool is_clamp() { return label & label_clamp; }
323
324
void set_undefined() { add_label(label_undefined); }
325
326
bool is_undefined() { return label & label_undefined; }
327
328
void set_vcc(Temp vcc_val)
329
{
330
add_label(label_vcc);
331
temp = vcc_val;
332
}
333
334
bool is_vcc() { return label & label_vcc; }
335
336
void set_b2f(Temp b2f_val)
337
{
338
add_label(label_b2f);
339
temp = b2f_val;
340
}
341
342
bool is_b2f() { return label & label_b2f; }
343
344
void set_add_sub(Instruction* add_sub_instr)
345
{
346
add_label(label_add_sub);
347
instr = add_sub_instr;
348
}
349
350
bool is_add_sub() { return label & label_add_sub; }
351
352
void set_bitwise(Instruction* bitwise_instr)
353
{
354
add_label(label_bitwise);
355
instr = bitwise_instr;
356
}
357
358
bool is_bitwise() { return label & label_bitwise; }
359
360
void set_uniform_bitwise() { add_label(label_uniform_bitwise); }
361
362
bool is_uniform_bitwise() { return label & label_uniform_bitwise; }
363
364
void set_minmax(Instruction* minmax_instr)
365
{
366
add_label(label_minmax);
367
instr = minmax_instr;
368
}
369
370
bool is_minmax() { return label & label_minmax; }
371
372
void set_vopc(Instruction* vopc_instr)
373
{
374
add_label(label_vopc);
375
instr = vopc_instr;
376
}
377
378
bool is_vopc() { return label & label_vopc; }
379
380
void set_scc_needed() { add_label(label_scc_needed); }
381
382
bool is_scc_needed() { return label & label_scc_needed; }
383
384
void set_scc_invert(Temp scc_inv)
385
{
386
add_label(label_scc_invert);
387
temp = scc_inv;
388
}
389
390
bool is_scc_invert() { return label & label_scc_invert; }
391
392
void set_uniform_bool(Temp uniform_bool)
393
{
394
add_label(label_uniform_bool);
395
temp = uniform_bool;
396
}
397
398
bool is_uniform_bool() { return label & label_uniform_bool; }
399
400
void set_vcc_hint() { add_label(label_vcc_hint); }
401
402
bool is_vcc_hint() { return label & label_vcc_hint; }
403
404
void set_b2i(Temp b2i_val)
405
{
406
add_label(label_b2i);
407
temp = b2i_val;
408
}
409
410
bool is_b2i() { return label & label_b2i; }
411
412
void set_usedef(Instruction* label_instr)
413
{
414
add_label(label_usedef);
415
instr = label_instr;
416
}
417
418
bool is_usedef() { return label & label_usedef; }
419
420
void set_vop3p(Instruction* vop3p_instr)
421
{
422
add_label(label_vop3p);
423
instr = vop3p_instr;
424
}
425
426
bool is_vop3p() { return label & label_vop3p; }
427
428
void set_fcanonicalize(Temp tmp)
429
{
430
add_label(label_fcanonicalize);
431
temp = tmp;
432
}
433
434
bool is_fcanonicalize() { return label & label_fcanonicalize; }
435
436
void set_canonicalized() { add_label(label_canonicalized); }
437
438
bool is_canonicalized() { return label & label_canonicalized; }
439
440
void set_extract(Instruction* extract)
441
{
442
add_label(label_extract);
443
instr = extract;
444
}
445
446
bool is_extract() { return label & label_extract; }
447
448
void set_insert(Instruction* insert)
449
{
450
add_label(label_insert);
451
instr = insert;
452
}
453
454
bool is_insert() { return label & label_insert; }
455
};
456
457
struct opt_ctx {
458
Program* program;
459
float_mode fp_mode;
460
std::vector<aco_ptr<Instruction>> instructions;
461
ssa_info* info;
462
std::pair<uint32_t, Temp> last_literal;
463
std::vector<mad_info> mad_infos;
464
std::vector<uint16_t> uses;
465
};
466
467
struct CmpInfo {
468
aco_opcode ordered;
469
aco_opcode unordered;
470
aco_opcode ordered_swapped;
471
aco_opcode unordered_swapped;
472
aco_opcode inverse;
473
aco_opcode f32;
474
unsigned size;
475
};
476
477
ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo* info);
478
479
bool
480
can_swap_operands(aco_ptr<Instruction>& instr)
481
{
482
if (instr->operands[0].isConstant() ||
483
(instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr))
484
return false;
485
486
switch (instr->opcode) {
487
case aco_opcode::v_add_u32:
488
case aco_opcode::v_add_co_u32:
489
case aco_opcode::v_add_co_u32_e64:
490
case aco_opcode::v_add_i32:
491
case aco_opcode::v_add_f16:
492
case aco_opcode::v_add_f32:
493
case aco_opcode::v_mul_f16:
494
case aco_opcode::v_mul_f32:
495
case aco_opcode::v_or_b32:
496
case aco_opcode::v_and_b32:
497
case aco_opcode::v_xor_b32:
498
case aco_opcode::v_max_f16:
499
case aco_opcode::v_max_f32:
500
case aco_opcode::v_min_f16:
501
case aco_opcode::v_min_f32:
502
case aco_opcode::v_max_i32:
503
case aco_opcode::v_min_i32:
504
case aco_opcode::v_max_u32:
505
case aco_opcode::v_min_u32:
506
case aco_opcode::v_max_i16:
507
case aco_opcode::v_min_i16:
508
case aco_opcode::v_max_u16:
509
case aco_opcode::v_min_u16:
510
case aco_opcode::v_max_i16_e64:
511
case aco_opcode::v_min_i16_e64:
512
case aco_opcode::v_max_u16_e64:
513
case aco_opcode::v_min_u16_e64: return true;
514
case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; return true;
515
case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true;
516
case aco_opcode::v_sub_co_u32: instr->opcode = aco_opcode::v_subrev_co_u32; return true;
517
case aco_opcode::v_sub_u16: instr->opcode = aco_opcode::v_subrev_u16; return true;
518
case aco_opcode::v_sub_u32: instr->opcode = aco_opcode::v_subrev_u32; return true;
519
default: {
520
CmpInfo info;
521
get_cmp_info(instr->opcode, &info);
522
if (info.ordered == instr->opcode) {
523
instr->opcode = info.ordered_swapped;
524
return true;
525
}
526
if (info.unordered == instr->opcode) {
527
instr->opcode = info.unordered_swapped;
528
return true;
529
}
530
return false;
531
}
532
}
533
}
534
535
bool
536
can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
537
{
538
if (instr->isVOP3())
539
return true;
540
541
if (instr->isVOP3P())
542
return false;
543
544
if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
545
return false;
546
547
if (instr->isDPP() || instr->isSDWA())
548
return false;
549
550
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
551
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
552
instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
553
instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
554
instr->opcode != aco_opcode::v_readlane_b32 &&
555
instr->opcode != aco_opcode::v_writelane_b32 &&
556
instr->opcode != aco_opcode::v_readfirstlane_b32;
557
}
558
559
bool
560
pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
561
{
562
if (instr->definitions.empty())
563
return false;
564
565
const bool vgpr =
566
instr->opcode == aco_opcode::p_as_uniform ||
567
std::all_of(instr->definitions.begin(), instr->definitions.end(),
568
[](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
569
570
/* don't propagate VGPRs into SGPR instructions */
571
if (temp.type() == RegType::vgpr && !vgpr)
572
return false;
573
574
bool can_accept_sgpr =
575
ctx.program->chip_class >= GFX9 ||
576
std::none_of(instr->definitions.begin(), instr->definitions.end(),
577
[](const Definition& def) { return def.regClass().is_subdword(); });
578
579
switch (instr->opcode) {
580
case aco_opcode::p_phi:
581
case aco_opcode::p_linear_phi:
582
case aco_opcode::p_parallelcopy:
583
case aco_opcode::p_create_vector:
584
if (temp.bytes() != instr->operands[index].bytes())
585
return false;
586
break;
587
case aco_opcode::p_extract_vector:
588
if (temp.type() == RegType::sgpr && !can_accept_sgpr)
589
return false;
590
break;
591
case aco_opcode::p_split_vector: {
592
if (temp.type() == RegType::sgpr && !can_accept_sgpr)
593
return false;
594
/* don't increase the vector size */
595
if (temp.bytes() > instr->operands[index].bytes())
596
return false;
597
/* We can decrease the vector size as smaller temporaries are only
598
* propagated by p_as_uniform instructions.
599
* If this propagation leads to invalid IR or hits the assertion below,
600
* it means that some undefined bytes within a dword are begin accessed
601
* and a bug in instruction_selection is likely. */
602
int decrease = instr->operands[index].bytes() - temp.bytes();
603
while (decrease > 0) {
604
decrease -= instr->definitions.back().bytes();
605
instr->definitions.pop_back();
606
}
607
assert(decrease == 0);
608
break;
609
}
610
case aco_opcode::p_as_uniform:
611
if (temp.regClass() == instr->definitions[0].regClass())
612
instr->opcode = aco_opcode::p_parallelcopy;
613
break;
614
default: return false;
615
}
616
617
instr->operands[index].setTemp(temp);
618
return true;
619
}
620
621
bool
622
can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
623
{
624
if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP())
625
return false;
626
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
627
instr->opcode != aco_opcode::v_readlane_b32 &&
628
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
629
instr->opcode != aco_opcode::v_writelane_b32 &&
630
instr->opcode != aco_opcode::v_writelane_b32_e64 &&
631
instr->opcode != aco_opcode::v_permlane16_b32 &&
632
instr->opcode != aco_opcode::v_permlanex16_b32;
633
}
634
635
void
636
to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
637
{
638
if (instr->isVOP3())
639
return;
640
641
aco_ptr<Instruction> tmp = std::move(instr);
642
Format format = asVOP3(tmp->format);
643
instr.reset(create_instruction<VOP3_instruction>(tmp->opcode, format, tmp->operands.size(),
644
tmp->definitions.size()));
645
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
646
for (unsigned i = 0; i < instr->definitions.size(); i++) {
647
instr->definitions[i] = tmp->definitions[i];
648
if (instr->definitions[i].isTemp()) {
649
ssa_info& info = ctx.info[instr->definitions[i].tempId()];
650
if (info.label & instr_usedef_labels && info.instr == tmp.get())
651
info.instr = instr.get();
652
}
653
}
654
/* we don't need to update any instr_mod_labels because they either haven't
655
* been applied yet or this instruction isn't dead and so they've been ignored */
656
}
657
658
bool
659
is_operand_vgpr(Operand op)
660
{
661
return op.isTemp() && op.getTemp().type() == RegType::vgpr;
662
}
663
664
void
665
to_SDWA(opt_ctx& ctx, aco_ptr<Instruction>& instr)
666
{
667
aco_ptr<Instruction> tmp = convert_to_SDWA(ctx.program->chip_class, instr);
668
if (!tmp)
669
return;
670
671
for (unsigned i = 0; i < instr->definitions.size(); i++) {
672
ssa_info& info = ctx.info[instr->definitions[i].tempId()];
673
if (info.label & instr_labels && info.instr == tmp.get())
674
info.instr = instr.get();
675
}
676
}
677
678
/* only covers special cases */
679
bool
680
alu_can_accept_constant(aco_opcode opcode, unsigned operand)
681
{
682
switch (opcode) {
683
case aco_opcode::v_interp_p2_f32:
684
case aco_opcode::v_mac_f32:
685
case aco_opcode::v_writelane_b32:
686
case aco_opcode::v_writelane_b32_e64:
687
case aco_opcode::v_cndmask_b32: return operand != 2;
688
case aco_opcode::s_addk_i32:
689
case aco_opcode::s_mulk_i32:
690
case aco_opcode::p_wqm:
691
case aco_opcode::p_extract_vector:
692
case aco_opcode::p_split_vector:
693
case aco_opcode::v_readlane_b32:
694
case aco_opcode::v_readlane_b32_e64:
695
case aco_opcode::v_readfirstlane_b32:
696
case aco_opcode::p_extract:
697
case aco_opcode::p_insert: return operand != 0;
698
default: return true;
699
}
700
}
701
702
bool
703
valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
704
{
705
if (instr->opcode == aco_opcode::v_readlane_b32 ||
706
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
707
instr->opcode == aco_opcode::v_writelane_b32 ||
708
instr->opcode == aco_opcode::v_writelane_b32_e64)
709
return operand != 1;
710
if (instr->opcode == aco_opcode::v_permlane16_b32 ||
711
instr->opcode == aco_opcode::v_permlanex16_b32)
712
return operand == 0;
713
return true;
714
}
715
716
/* check constant bus and literal limitations */
717
bool
718
check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
719
{
720
int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
721
Operand literal32(s1);
722
Operand literal64(s2);
723
unsigned num_sgprs = 0;
724
unsigned sgpr[] = {0, 0};
725
726
for (unsigned i = 0; i < num_operands; i++) {
727
Operand op = operands[i];
728
729
if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
730
/* two reads of the same SGPR count as 1 to the limit */
731
if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
732
if (num_sgprs < 2)
733
sgpr[num_sgprs++] = op.tempId();
734
limit--;
735
if (limit < 0)
736
return false;
737
}
738
} else if (op.isLiteral()) {
739
if (ctx.program->chip_class < GFX10)
740
return false;
741
742
if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
743
return false;
744
if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
745
return false;
746
747
/* Any number of 32-bit literals counts as only 1 to the limit. Same
748
* (but separately) for 64-bit literals. */
749
if (op.size() == 1 && literal32.isUndefined()) {
750
limit--;
751
literal32 = op;
752
} else if (op.size() == 2 && literal64.isUndefined()) {
753
limit--;
754
literal64 = op;
755
}
756
757
if (limit < 0)
758
return false;
759
}
760
}
761
762
return true;
763
}
764
765
bool
766
parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
767
bool prevent_overflow)
768
{
769
Operand op = instr->operands[op_index];
770
771
if (!op.isTemp())
772
return false;
773
Temp tmp = op.getTemp();
774
if (!ctx.info[tmp.id()].is_add_sub())
775
return false;
776
777
Instruction* add_instr = ctx.info[tmp.id()].instr;
778
779
switch (add_instr->opcode) {
780
case aco_opcode::v_add_u32:
781
case aco_opcode::v_add_co_u32:
782
case aco_opcode::v_add_co_u32_e64:
783
case aco_opcode::s_add_i32:
784
case aco_opcode::s_add_u32: break;
785
default: return false;
786
}
787
if (prevent_overflow && !add_instr->definitions[0].isNUW())
788
return false;
789
790
if (add_instr->usesModifiers())
791
return false;
792
793
for (unsigned i = 0; i < 2; i++) {
794
if (add_instr->operands[i].isConstant()) {
795
*offset = add_instr->operands[i].constantValue();
796
} else if (add_instr->operands[i].isTemp() &&
797
ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
798
*offset = ctx.info[add_instr->operands[i].tempId()].val;
799
} else {
800
continue;
801
}
802
if (!add_instr->operands[!i].isTemp())
803
continue;
804
805
uint32_t offset2 = 0;
806
if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
807
*offset += offset2;
808
} else {
809
*base = add_instr->operands[!i].getTemp();
810
}
811
return true;
812
}
813
814
return false;
815
}
816
817
unsigned
818
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
819
{
820
if (instr->isPseudo())
821
return instr->operands[index].bytes() * 8u;
822
else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
823
instr->opcode == aco_opcode::v_mad_i64_i32)
824
return index == 2 ? 64 : 32;
825
else if (instr->isVALU() || instr->isSALU())
826
return instr_info.operand_size[(int)instr->opcode];
827
else
828
return 0;
829
}
830
831
/* Builds a constant Operand of the requested bit width from info.val. */
Operand
get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
{
   const bool wants_64bit = bits == 64;
   return wants_64bit ? Operand::c32_or_c64(info.val, true)
                      : Operand::get_const(ctx.program->chip_class, info.val, bits / 8u);
}
838
839
bool
840
fixed_to_exec(Operand op)
841
{
842
return op.isFixed() && op.physReg() == exec;
843
}
844
845
int
846
parse_extract(Instruction* instr)
847
{
848
if (instr->opcode == aco_opcode::p_extract) {
849
bool is_byte = instr->operands[2].constantEquals(8);
850
unsigned index = instr->operands[1].constantValue();
851
unsigned sel = (is_byte ? sdwa_ubyte0 : sdwa_uword0) + index;
852
if (!instr->operands[3].constantEquals(0))
853
sel |= sdwa_sext;
854
return sel;
855
} else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
856
return instr->operands[2].constantEquals(8) ? sdwa_ubyte0 : sdwa_uword0;
857
} else {
858
return -1;
859
}
860
}
861
862
int
863
parse_insert(Instruction* instr)
864
{
865
if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
866
instr->operands[1].constantEquals(0)) {
867
return instr->operands[2].constantEquals(8) ? sdwa_ubyte0 : sdwa_uword0;
868
} else if (instr->opcode == aco_opcode::p_insert) {
869
bool is_byte = instr->operands[2].constantEquals(8);
870
unsigned index = instr->operands[1].constantValue();
871
unsigned sel = (is_byte ? sdwa_ubyte0 : sdwa_uword0) + index;
872
return sel;
873
} else {
874
return -1;
875
}
876
}
877
878
bool
879
can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
880
{
881
if (idx >= 2)
882
return false;
883
884
Temp tmp = info.instr->operands[0].getTemp();
885
unsigned sel = parse_extract(info.instr);
886
887
if (sel == sdwa_udword || sel == sdwa_sdword) {
888
return true;
889
} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) {
890
return true;
891
} else if (can_use_SDWA(ctx.program->chip_class, instr, true) &&
892
(tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {
893
if (instr->isSDWA() &&
894
(static_cast<SDWA_instruction*>(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword)
895
return false;
896
return true;
897
} else if (instr->isVOP3() && (sel & sdwa_isword) &&
898
can_use_opsel(ctx.program->chip_class, instr->opcode, idx, (sel & sdwa_wordnum)) &&
899
!(instr->vop3().opsel & (1 << idx))) {
900
return true;
901
} else {
902
return false;
903
}
904
}
905
906
/* Combine an p_extract (or p_insert, in some cases) instruction with instr.
907
* instr(p_extract(...)) -> instr()
908
*/
909
void
910
apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
911
{
912
Temp tmp = info.instr->operands[0].getTemp();
913
unsigned sel = parse_extract(info.instr);
914
915
if (sel == sdwa_udword || sel == sdwa_sdword) {
916
} else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) {
917
switch (sel) {
918
case sdwa_ubyte0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
919
case sdwa_ubyte1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
920
case sdwa_ubyte2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
921
case sdwa_ubyte3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
922
}
923
} else if (can_use_SDWA(ctx.program->chip_class, instr, true) &&
924
(tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {
925
to_SDWA(ctx, instr);
926
static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;
927
} else if (instr->isVOP3()) {
928
if (sel & sdwa_wordnum)
929
instr->vop3().opsel |= 1 << idx;
930
}
931
932
ctx.info[tmp.id()].label &= ~label_insert;
933
/* label_vopc seems to be the only one worth keeping at the moment */
934
for (Definition& def : instr->definitions)
935
ctx.info[def.tempId()].label &= label_vopc;
936
}
937
938
void
939
check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
940
{
941
/* only VALU can use SDWA */
942
if (!instr->isVALU())
943
return;
944
945
for (unsigned i = 0; i < instr->operands.size(); i++) {
946
Operand op = instr->operands[i];
947
if (!op.isTemp())
948
continue;
949
ssa_info& info = ctx.info[op.tempId()];
950
if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
951
op.getTemp().type() == RegType::sgpr)) {
952
if (!can_apply_extract(ctx, instr, i, info))
953
info.label &= ~label_extract;
954
}
955
}
956
}
957
958
bool
959
does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
960
{
961
if (ctx.program->chip_class <= GFX8) {
962
switch (op) {
963
case aco_opcode::v_min_f32:
964
case aco_opcode::v_max_f32:
965
case aco_opcode::v_med3_f32:
966
case aco_opcode::v_min3_f32:
967
case aco_opcode::v_max3_f32:
968
case aco_opcode::v_min_f16:
969
case aco_opcode::v_max_f16: return false;
970
default: break;
971
}
972
}
973
return op != aco_opcode::v_cndmask_b32;
974
}
975
976
bool
977
can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp)
978
{
979
float_mode* fp = &ctx.fp_mode;
980
if (ctx.info[tmp.id()].is_canonicalized() ||
981
(tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
982
return true;
983
984
aco_opcode op = instr->opcode;
985
return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op);
986
}
987
988
bool
989
is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info)
990
{
991
return info.is_temp() ||
992
(info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp));
993
}
994
995
bool
996
is_op_canonicalized(opt_ctx& ctx, Operand op)
997
{
998
float_mode* fp = &ctx.fp_mode;
999
if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
1000
(op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
1001
return true;
1002
1003
if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
1004
uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
1005
if (op.bytes() == 2)
1006
return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
1007
else if (op.bytes() == 4)
1008
return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
1009
}
1010
return false;
1011
}
1012
1013
/* Labelling step for a single instruction.
 *
 * First, every temporary operand is rewritten using the ssa_info already
 * recorded for it in ctx.info: undef propagation into phis, copy propagation,
 * inline constants for SALU/pseudo instructions, neg/abs modifiers and
 * constants for VALU, constant and base+offset folding for MUBUF/DS/SMEM, and
 * scc_invert folding for branches.
 * Then, if the instruction has definitions, labels describing what it computes
 * are stored in ctx.info for its definitions (and for some opcodes for its
 * operands).
 *
 * `instr` is taken by reference because some paths replace the instruction
 * with a newly created one (SMEM offset folding, p_create_vector expansion,
 * p_parallelcopy of a vector).
 */
void
1014
label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1015
{
1016
if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
1017
/* NOTE(review): all_const starts as false and is only ever AND-ed below, so it
 * stays false and this perfwarn cannot trigger as written. */
ASSERTED bool all_const = false;
1018
for (Operand& op : instr->operands)
1019
all_const =
1020
all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
1021
perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
1022
1023
ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
1024
instr->opcode == aco_opcode::s_mov_b64 ||
1025
instr->opcode == aco_opcode::v_mov_b32;
1026
perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
1027
instr.get());
1028
}
1029
1030
/* Operand pass: rewrite each temporary operand based on the label recorded for it.
 * A `continue` inside the loop means the operand was fully handled. */
for (unsigned i = 0; i < instr->operands.size(); i++) {
1031
if (!instr->operands[i].isTemp())
1032
continue;
1033
1034
ssa_info info = ctx.info[instr->operands[i].tempId()];
1035
/* propagate undef */
1036
if (info.is_undefined() && is_phi(instr))
1037
instr->operands[i] = Operand(instr->operands[i].regClass());
1038
/* propagate reg->reg of same type */
1039
while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
1040
instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
1041
info = ctx.info[info.temp.id()];
1042
}
1043
1044
/* PSEUDO: propagate temporaries */
1045
if (instr->isPseudo()) {
1046
while (info.is_temp()) {
1047
pseudo_propagate_temp(ctx, instr, info.temp, i);
1048
info = ctx.info[info.temp.id()];
1049
}
1050
}
1051
1052
/* SALU / PSEUDO: propagate inline constants */
1053
if (instr->isSALU() || instr->isPseudo()) {
1054
unsigned bits = get_operand_size(instr, i);
1055
if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
1056
!instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
1057
instr->operands[i] = get_constant_op(ctx, info, bits);
1058
continue;
1059
}
1060
}
1061
1062
/* VALU: propagate neg, abs & inline constants */
1063
else if (instr->isVALU()) {
1064
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr &&
1065
valu_can_accept_vgpr(instr, i)) {
1066
instr->operands[i].setTemp(info.temp);
1067
info = ctx.info[info.temp.id()];
1068
}
1069
/* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
1070
if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
1071
instr->operands.size() == 1) {
1072
instr->operands[i].setTemp(info.temp);
1073
info = ctx.info[info.temp.id()];
1074
}
1075
1076
/* for instructions other than v_cndmask_b32, the size of the instruction should match the
1077
* operand size */
1078
unsigned can_use_mod =
1079
instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
1080
can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];
1081
1082
if (instr->isSDWA())
1083
can_use_mod = can_use_mod && (instr->sdwa().sel[i] & sdwa_asuint) == sdwa_udword;
1084
else
1085
can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr));
1086
1087
/* a negated operand of an add is folded by switching the opcode to sub/subrev */
if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
1088
instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
1089
instr->operands[i].setTemp(info.temp);
1090
} else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
1091
instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
1092
instr->operands[i].setTemp(info.temp);
1093
} else if (info.is_neg() && can_use_mod &&
1094
can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
1095
if (!instr->isDPP() && !instr->isSDWA())
1096
to_VOP3(ctx, instr);
1097
instr->operands[i].setTemp(info.temp);
1098
if (instr->isDPP() && !instr->dpp().abs[i])
1099
instr->dpp().neg[i] = true;
1100
else if (instr->isSDWA() && !instr->sdwa().abs[i])
1101
instr->sdwa().neg[i] = true;
1102
else if (instr->isVOP3() && !instr->vop3().abs[i])
1103
instr->vop3().neg[i] = true;
1104
}
1105
if (info.is_abs() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
1106
if (!instr->isDPP() && !instr->isSDWA())
1107
to_VOP3(ctx, instr);
1108
instr->operands[i] = Operand(info.temp);
1109
if (instr->isDPP())
1110
instr->dpp().abs[i] = true;
1111
else if (instr->isSDWA())
1112
instr->sdwa().abs[i] = true;
1113
else
1114
instr->vop3().abs[i] = true;
1115
continue;
1116
}
1117
unsigned bits = get_operand_size(instr, i);
1118
if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
1119
(!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
1120
Operand op = get_constant_op(ctx, info, bits);
1121
perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
1122
"v_cndmask_b32 with a constant selector", instr.get());
1123
if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||
1124
instr->opcode == aco_opcode::v_readlane_b32 ||
1125
instr->opcode == aco_opcode::v_writelane_b32) {
1126
instr->operands[i] = op;
1127
continue;
1128
} else if (!instr->isVOP3() && can_swap_operands(instr)) {
1129
instr->operands[i] = instr->operands[0];
1130
instr->operands[0] = op;
1131
continue;
1132
} else if (can_use_VOP3(ctx, instr)) {
1133
to_VOP3(ctx, instr);
1134
instr->operands[i] = op;
1135
continue;
1136
}
1137
}
1138
}
1139
1140
/* MUBUF: propagate constants and combine additions */
1141
else if (instr->isMUBUF()) {
1142
MUBUF_instruction& mubuf = instr->mubuf();
1143
Temp base;
1144
uint32_t offset;
1145
while (info.is_temp())
1146
info = ctx.info[info.temp.id()];
1147
1148
/* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
1149
* overflow for scratch accesses works only on GFX9+ and saddr overflow
1150
* never works. Since swizzling is the only thing that separates
1151
* scratch accesses and other accesses and swizzling changing how
1152
* addressing works significantly, this probably applies to swizzled
1153
* MUBUF accesses. */
1154
bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->chip_class < GFX9;
1155
bool saddr_prevent_overflow = mubuf.swizzled;
1156
1157
if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
1158
mubuf.offset + info.val < 4096) {
1159
assert(!mubuf.idxen);
1160
instr->operands[1] = Operand(v1);
1161
mubuf.offset += info.val;
1162
mubuf.offen = false;
1163
continue;
1164
} else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
1165
instr->operands[2] = Operand::c32(0);
1166
mubuf.offset += info.val;
1167
continue;
1168
} else if (mubuf.offen && i == 1 &&
1169
parse_base_offset(ctx, instr.get(), i, &base, &offset,
1170
vaddr_prevent_overflow) &&
1171
base.regClass() == v1 && mubuf.offset + offset < 4096) {
1172
assert(!mubuf.idxen);
1173
instr->operands[1].setTemp(base);
1174
mubuf.offset += offset;
1175
continue;
1176
} else if (i == 2 &&
1177
parse_base_offset(ctx, instr.get(), i, &base, &offset,
1178
saddr_prevent_overflow) &&
1179
base.regClass() == s1 && mubuf.offset + offset < 4096) {
1180
instr->operands[i].setTemp(base);
1181
mubuf.offset += offset;
1182
continue;
1183
}
1184
}
1185
1186
/* DS: combine additions */
1187
else if (instr->isDS()) {
1188
1189
DS_instruction& ds = instr->ds();
1190
Temp base;
1191
uint32_t offset;
1192
bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
1193
if (has_usable_ds_offset && i == 0 &&
1194
parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1195
base.regClass() == instr->operands[i].regClass() &&
1196
instr->opcode != aco_opcode::ds_swizzle_b32) {
1197
if (instr->opcode == aco_opcode::ds_write2_b32 ||
1198
instr->opcode == aco_opcode::ds_read2_b32 ||
1199
instr->opcode == aco_opcode::ds_write2_b64 ||
1200
instr->opcode == aco_opcode::ds_read2_b64) {
1201
/* for the read2/write2 forms the byte offset must be a multiple of the element
 * size (4 or 8 bytes) and is added to both offsets in element units */
unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 ||
1202
instr->opcode == aco_opcode::ds_read2_b64)
1203
? 0x7
1204
: 0x3;
1205
unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 ||
1206
instr->opcode == aco_opcode::ds_read2_b64)
1207
? 3
1208
: 2;
1209
1210
if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
1211
ds.offset1 + (offset >> shifts) <= 255) {
1212
instr->operands[i].setTemp(base);
1213
ds.offset0 += offset >> shifts;
1214
ds.offset1 += offset >> shifts;
1215
}
1216
} else {
1217
if (ds.offset0 + offset <= 65535) {
1218
instr->operands[i].setTemp(base);
1219
ds.offset0 += offset;
1220
}
1221
}
1222
}
1223
}
1224
1225
/* SMEM: propagate constants and combine additions */
1226
else if (instr->isSMEM()) {
1227
1228
SMEM_instruction& smem = instr->smem();
1229
Temp base;
1230
uint32_t offset;
1231
bool prevent_overflow = smem.operands[0].size() > 2 || smem.prevent_overflow;
1232
if (i == 1 && info.is_constant_or_literal(32) &&
1233
((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
1234
(ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
1235
(ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
1236
instr->operands[i] = Operand::c32(info.val);
1237
continue;
1238
} else if (i == 1 &&
1239
parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) &&
1240
base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
1241
bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
1242
if (soe && (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) ||
1243
ctx.info[smem.operands.back().tempId()].val != 0)) {
1244
continue;
1245
}
1246
if (soe) {
1247
smem.operands[1] = Operand::c32(offset);
1248
smem.operands.back() = Operand(base);
1249
} else {
1250
/* no trailing offset operand yet: build a copy of the instruction with one more
 * operand, holding the constant in operand 1 and the base as last operand */
SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
1251
smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
1252
new_instr->operands[0] = smem.operands[0];
1253
new_instr->operands[1] = Operand::c32(offset);
1254
if (smem.definitions.empty())
1255
new_instr->operands[2] = smem.operands[2];
1256
new_instr->operands.back() = Operand(base);
1257
if (!smem.definitions.empty())
1258
new_instr->definitions[0] = smem.definitions[0];
1259
new_instr->sync = smem.sync;
1260
new_instr->glc = smem.glc;
1261
new_instr->dlc = smem.dlc;
1262
new_instr->nv = smem.nv;
1263
new_instr->disable_wqm = smem.disable_wqm;
1264
instr.reset(new_instr);
1265
}
1266
continue;
1267
}
1268
}
1269
1270
else if (instr->isBranch()) {
1271
if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1272
/* Flip the branch instruction to get rid of the scc_invert instruction */
1273
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
1274
: aco_opcode::p_cbranch_z;
1275
instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
1276
}
1277
}
1278
}
1279
1280
/* if this instruction doesn't define anything, return */
1281
if (instr->definitions.empty()) {
1282
check_sdwa_extract(ctx, instr);
1283
return;
1284
}
1285
1286
if (instr->isVALU() || instr->isVINTRP()) {
1287
if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
1288
instr->opcode == aco_opcode::v_cndmask_b32) {
1289
/* the result is labelled canonicalized if the opcode flushes denorms or if all
 * checked operands are themselves canonicalized */
bool canonicalized = true;
1290
if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
1291
unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
1292
for (unsigned i = 0; canonicalized && (i < ops); i++)
1293
canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
1294
}
1295
if (canonicalized)
1296
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1297
}
1298
1299
if (instr->isVOPC()) {
1300
ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
1301
check_sdwa_extract(ctx, instr);
1302
return;
1303
}
1304
if (instr->isVOP3P()) {
1305
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
1306
return;
1307
}
1308
}
1309
1310
/* Definition pass: record labels depending on the opcode. */
switch (instr->opcode) {
1311
case aco_opcode::p_create_vector: {
1312
bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
1313
instr->operands[0].regClass() == instr->definitions[0].regClass();
1314
if (copy_prop) {
1315
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1316
break;
1317
}
1318
1319
/* expand vector operands */
1320
std::vector<Operand> ops;
1321
unsigned offset = 0;
1322
for (const Operand& op : instr->operands) {
1323
/* ensure that any expanded operands are properly aligned */
1324
bool aligned = offset % 4 == 0 || op.bytes() < 4;
1325
offset += op.bytes();
1326
if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
1327
Instruction* vec = ctx.info[op.tempId()].instr;
1328
for (const Operand& vec_op : vec->operands)
1329
ops.emplace_back(vec_op);
1330
} else {
1331
ops.emplace_back(op);
1332
}
1333
}
1334
1335
/* combine expanded operands to new vector */
1336
if (ops.size() != instr->operands.size()) {
1337
assert(ops.size() > instr->operands.size());
1338
Definition def = instr->definitions[0];
1339
instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
1340
Format::PSEUDO, ops.size(), 1));
1341
for (unsigned i = 0; i < ops.size(); i++) {
1342
if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
1343
ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
1344
ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
1345
instr->operands[i] = ops[i];
1346
}
1347
instr->definitions[0] = def;
1348
} else {
1349
for (unsigned i = 0; i < ops.size(); i++) {
1350
assert(instr->operands[i] == ops[i]);
1351
}
1352
}
1353
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1354
break;
1355
}
1356
case aco_opcode::p_split_vector: {
1357
ssa_info& info = ctx.info[instr->operands[0].tempId()];
1358
1359
if (info.is_constant_or_literal(32)) {
1360
/* split the constant: each definition gets its own slice, lowest bytes first */
uint32_t val = info.val;
1361
for (Definition def : instr->definitions) {
1362
uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
1363
ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask);
1364
val >>= def.bytes() * 8u;
1365
}
1366
break;
1367
} else if (!info.is_vec()) {
1368
break;
1369
}
1370
1371
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1372
unsigned split_offset = 0;
1373
unsigned vec_offset = 0;
1374
unsigned vec_index = 0;
1375
/* walk definitions and vector operands in parallel by byte offset; a definition is
 * only labelled when it lines up exactly with one vector operand of the same size */
for (unsigned i = 0; i < instr->definitions.size();
1376
split_offset += instr->definitions[i++].bytes()) {
1377
while (vec_offset < split_offset && vec_index < vec->operands.size())
1378
vec_offset += vec->operands[vec_index++].bytes();
1379
1380
if (vec_offset != split_offset ||
1381
vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
1382
continue;
1383
1384
Operand vec_op = vec->operands[vec_index];
1385
if (vec_op.isConstant()) {
1386
ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class,
1387
vec_op.constantValue64());
1388
} else if (vec_op.isUndefined()) {
1389
ctx.info[instr->definitions[i].tempId()].set_undefined();
1390
} else {
1391
assert(vec_op.isTemp());
1392
ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1393
}
1394
}
1395
break;
1396
}
1397
case aco_opcode::p_extract_vector: { /* mov */
1398
ssa_info& info = ctx.info[instr->operands[0].tempId()];
1399
const unsigned index = instr->operands[1].constantValue();
1400
const unsigned dst_offset = index * instr->definitions[0].bytes();
1401
1402
if (info.is_vec()) {
1403
/* check if we index directly into a vector element */
1404
Instruction* vec = info.instr;
1405
unsigned offset = 0;
1406
1407
for (const Operand& op : vec->operands) {
1408
if (offset < dst_offset) {
1409
offset += op.bytes();
1410
continue;
1411
} else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1412
break;
1413
}
1414
instr->operands[0] = op;
1415
break;
1416
}
1417
} else if (info.is_constant_or_literal(32)) {
1418
/* propagate constants */
1419
uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
1420
uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
1421
instr->operands[0] =
1422
Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());
1423
;
1424
} else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) {
1425
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1426
}
1427
1428
if (instr->operands[0].bytes() != instr->definitions[0].bytes())
1429
break;
1430
1431
/* convert this extract into a copy instruction */
1432
instr->opcode = aco_opcode::p_parallelcopy;
1433
instr->operands.pop_back();
1434
FALLTHROUGH;
1435
}
1436
case aco_opcode::p_parallelcopy: /* propagate */
1437
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
1438
instr->operands[0].regClass() != instr->definitions[0].regClass()) {
1439
/* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
1440
* duplicate the vector instead.
1441
*/
1442
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1443
aco_ptr<Instruction> old_copy = std::move(instr);
1444
1445
instr.reset(create_instruction<Pseudo_instruction>(
1446
aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
1447
instr->definitions[0] = old_copy->definitions[0];
1448
std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
1449
for (unsigned i = 0; i < vec->operands.size(); i++) {
1450
Operand& op = instr->operands[i];
1451
if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
1452
ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
1453
op.setTemp(ctx.info[op.tempId()].temp);
1454
}
1455
ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1456
break;
1457
}
1458
FALLTHROUGH;
1459
case aco_opcode::p_as_uniform:
1460
if (instr->definitions[0].isFixed()) {
1461
/* don't copy-propagate copies into fixed registers */
1462
} else if (instr->usesModifiers()) {
1463
// TODO
1464
} else if (instr->operands[0].isConstant()) {
1465
ctx.info[instr->definitions[0].tempId()].set_constant(
1466
ctx.program->chip_class, instr->operands[0].constantValue64());
1467
} else if (instr->operands[0].isTemp()) {
1468
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1469
if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
1470
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1471
} else {
1472
assert(instr->operands[0].isFixed());
1473
}
1474
break;
1475
case aco_opcode::p_is_helper:
1476
if (!ctx.program->needs_wqm)
1477
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
1478
break;
1479
case aco_opcode::v_mul_f16:
1480
case aco_opcode::v_mul_f32: { /* omod */
1481
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
1482
1483
/* TODO: try to move the negate/abs modifier to the consumer instead */
1484
bool uses_mods = instr->usesModifiers();
1485
bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1486
1487
/* look for a (temp, constant) operand pair in either order; operands[!i] is the
 * constant and operands[i] the temporary */
for (unsigned i = 0; i < 2; i++) {
1488
if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1489
if (!instr->isDPP() && !instr->isSDWA() &&
1490
(instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */
1491
instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
1492
bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
1493
1494
VOP3_instruction* vop3 = instr->isVOP3() ? &instr->vop3() : NULL;
1495
if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
1496
continue;
1497
1498
bool abs = vop3 && vop3->abs[i];
1499
bool neg = neg1 ^ (vop3 && vop3->neg[i]);
1500
1501
Temp other = instr->operands[i].getTemp();
1502
if (abs && neg && other.type() == RegType::vgpr)
1503
ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
1504
else if (abs && !neg && other.type() == RegType::vgpr)
1505
ctx.info[instr->definitions[0].tempId()].set_abs(other);
1506
else if (!abs && neg && other.type() == RegType::vgpr)
1507
ctx.info[instr->definitions[0].tempId()].set_neg(other);
1508
else if (!abs && !neg)
1509
ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
1510
} else if (uses_mods) {
1511
continue;
1512
} else if (instr->operands[!i].constantValue() ==
1513
(fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1514
ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
1515
} else if (instr->operands[!i].constantValue() ==
1516
(fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1517
ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
1518
} else if (instr->operands[!i].constantValue() ==
1519
(fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
1520
ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
1521
} else if (instr->operands[!i].constantValue() == 0u &&
1522
!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
1523
: ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */
1524
ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
1525
} else {
1526
continue;
1527
}
1528
/* this pair was handled: stop looking at the other operand order */
break;
1529
}
1530
}
1531
break;
1532
}
1533
case aco_opcode::v_mul_lo_u16:
1534
if (instr->definitions[0].isNUW()) {
1535
/* Most of 16-bit mul optimizations are only valid if no overflow. */
1536
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
1537
}
1538
break;
1539
case aco_opcode::v_mul_u32_u24:
1540
ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
1541
break;
1542
case aco_opcode::v_med3_f16:
1543
case aco_opcode::v_med3_f32: { /* clamp */
1544
VOP3_instruction& vop3 = instr->vop3();
1545
if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] ||
1546
vop3.omod != 0 || vop3.opsel != 0)
1547
break;
1548
1549
/* med3(x, 0.0, 1.0) in any operand order: idx ends up as the operand that is
 * neither the constant 0 nor the constant 1.0 */
unsigned idx = 0;
1550
bool found_zero = false, found_one = false;
1551
bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
1552
for (unsigned i = 0; i < 3; i++) {
1553
if (instr->operands[i].constantEquals(0))
1554
found_zero = true;
1555
else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
1556
found_one = true;
1557
else
1558
idx = i;
1559
}
1560
if (found_zero && found_one && instr->operands[idx].isTemp())
1561
ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
1562
break;
1563
}
1564
case aco_opcode::v_cndmask_b32:
1565
if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
1566
ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1567
else if (instr->operands[0].constantEquals(0) &&
1568
instr->operands[1].constantEquals(0x3f800000u))
1569
ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1570
else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
1571
ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1572
1573
ctx.info[instr->operands[2].tempId()].set_vcc_hint();
1574
break;
1575
case aco_opcode::v_cmp_lg_u32:
1576
if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1577
instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
1578
ctx.info[instr->operands[1].tempId()].is_vcc())
1579
ctx.info[instr->definitions[0].tempId()].set_temp(
1580
ctx.info[instr->operands[1].tempId()].temp);
1581
break;
1582
case aco_opcode::p_linear_phi: {
1583
/* lower_bool_phis() can create phis like this */
1584
bool all_same_temp = instr->operands[0].isTemp();
1585
/* this check is needed when moving uniform loop counters out of a divergent loop */
1586
if (all_same_temp)
1587
all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1588
for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1589
if (!instr->operands[i].isTemp() ||
1590
instr->operands[i].tempId() != instr->operands[0].tempId())
1591
all_same_temp = false;
1592
}
1593
if (all_same_temp) {
1594
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1595
} else {
1596
bool all_undef = instr->operands[0].isUndefined();
1597
for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1598
if (!instr->operands[i].isUndefined())
1599
all_undef = false;
1600
}
1601
if (all_undef)
1602
ctx.info[instr->definitions[0].tempId()].set_undefined();
1603
}
1604
break;
1605
}
1606
case aco_opcode::v_add_u32:
1607
case aco_opcode::v_add_co_u32:
1608
case aco_opcode::v_add_co_u32_e64:
1609
case aco_opcode::s_add_i32:
1610
case aco_opcode::s_add_u32:
1611
case aco_opcode::v_subbrev_co_u32:
1612
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1613
break;
1614
case aco_opcode::s_not_b32:
1615
case aco_opcode::s_not_b64:
1616
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1617
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1618
ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1619
ctx.info[instr->operands[0].tempId()].temp);
1620
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1621
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1622
ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1623
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1624
}
1625
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1626
break;
1627
case aco_opcode::s_and_b32:
1628
case aco_opcode::s_and_b64:
1629
if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
1630
if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1631
/* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
1632
* uniform bool into divergent */
1633
ctx.info[instr->definitions[1].tempId()].set_temp(
1634
ctx.info[instr->operands[0].tempId()].temp);
1635
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
1636
ctx.info[instr->operands[0].tempId()].temp);
1637
break;
1638
} else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1639
/* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
1640
* already produces the same SCC */
1641
ctx.info[instr->definitions[1].tempId()].set_temp(
1642
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1643
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
1644
ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1645
break;
1646
} else if (ctx.info[instr->operands[0].tempId()].is_vopc()) {
1647
Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr;
1648
/* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
1649
* already produces the same result */
1650
if (vopc_instr->pass_flags == instr->pass_flags) {
1651
assert(instr->pass_flags > 0);
1652
ctx.info[instr->definitions[0].tempId()].set_temp(
1653
vopc_instr->definitions[0].getTemp());
1654
break;
1655
}
1656
}
1657
}
1658
FALLTHROUGH;
1659
case aco_opcode::s_or_b32:
1660
case aco_opcode::s_or_b64:
1661
case aco_opcode::s_xor_b32:
1662
case aco_opcode::s_xor_b64:
1663
if (std::all_of(instr->operands.begin(), instr->operands.end(),
1664
[&ctx](const Operand& op)
1665
{
1666
return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
1667
ctx.info[op.tempId()].is_uniform_bitwise());
1668
})) {
1669
ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1670
}
1671
FALLTHROUGH;
1672
case aco_opcode::s_lshl_b32:
1673
case aco_opcode::v_or_b32:
1674
case aco_opcode::v_lshlrev_b32:
1675
case aco_opcode::v_bcnt_u32_b32:
1676
case aco_opcode::v_and_b32:
1677
case aco_opcode::v_xor_b32:
1678
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1679
break;
1680
case aco_opcode::v_min_f32:
1681
case aco_opcode::v_min_f16:
1682
case aco_opcode::v_min_u32:
1683
case aco_opcode::v_min_i32:
1684
case aco_opcode::v_min_u16:
1685
case aco_opcode::v_min_i16:
1686
case aco_opcode::v_max_f32:
1687
case aco_opcode::v_max_f16:
1688
case aco_opcode::v_max_u32:
1689
case aco_opcode::v_max_i32:
1690
case aco_opcode::v_max_u16:
1691
case aco_opcode::v_max_i16:
1692
ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
1693
break;
1694
case aco_opcode::s_cselect_b64:
1695
case aco_opcode::s_cselect_b32:
1696
if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
1697
/* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
1698
ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
1699
}
1700
if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
1701
/* Flip the operands to get rid of the scc_invert instruction */
1702
std::swap(instr->operands[0], instr->operands[1]);
1703
instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
1704
}
1705
break;
1706
case aco_opcode::p_wqm:
1707
if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1708
ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1709
}
1710
break;
1711
case aco_opcode::s_mul_i32:
1712
/* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
1713
* This pattern is created from a uniform nir_op_b2f. */
1714
if (instr->operands[0].constantEquals(0x3f800000u))
1715
ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1716
break;
1717
case aco_opcode::p_extract: {
1718
if (instr->definitions[0].bytes() == 4) {
1719
ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
1720
if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()) >= 0)
1721
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
1722
}
1723
break;
1724
}
1725
case aco_opcode::p_insert: {
1726
if (instr->operands[0].bytes() == 4) {
1727
if (instr->operands[0].regClass() == v1)
1728
ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
1729
if (parse_extract(instr.get()) >= 0)
1730
ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
1731
ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1732
}
1733
break;
1734
}
1735
default: break;
1736
}
1737
1738
/* Don't remove label_extract if we can't apply the extract to
1739
* neg/abs instructions because we'll likely combine it into another valu. */
1740
if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
1741
check_sdwa_extract(ctx, instr);
1742
}
1743
1744
/* Describes the floating-point comparison opcode `op` in `*info`.
 *
 * The ordered, unordered, ordered_swapped and unordered_swapped members are
 * always reset to aco_opcode::num_opcodes first.
 *
 * For the lt/eq/le/gt/lg/ge comparisons and their "n"-prefixed (unordered)
 * forms, at 16, 32 and 64 bits, every member is filled in:
 *  - ordered / unordered: the ordered opcode of the pair and the opcode that
 *    additionally passes for unordered inputs,
 *  - ordered_swapped / unordered_swapped: the equivalent opcodes when the two
 *    source operands are exchanged (e.g. a > b  <=>  b < a, and
 *    !(a <= b)  <=>  !(b >= a)),
 *  - inverse: the opcode producing the negated result of `op`,
 *  - f32: the 32-bit form of `op`,
 *  - size: the operand size in bits.
 *
 * For v_cmp_u / v_cmp_o only inverse, f32 and size are written, so the
 * ordered/unordered members keep the num_opcodes sentinel.
 *
 * Returns true if `op` is one of the opcodes above. On false, inverse, f32 and
 * size are left untouched and must not be read.
 */
ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->ordered = aco_opcode::num_opcodes;
   info->unordered = aco_opcode::num_opcodes;
   info->ordered_swapped = aco_opcode::num_opcodes;
   info->unordered_swapped = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->ordered = aco_opcode::v_cmp_##ord##_f##sz;                                             \
      info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;                                        \
      info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;                                \
      info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;                           \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                                : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32            \
                                                         : aco_opcode::v_cmp_n##unord##_f32;       \
      info->size = sz;                                                                             \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      /* swapping the operands of "gt or unordered" (nle) gives "not ge", mirroring the lt row */
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_u_f32;                                                         \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->size = sz;                                                                             \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->f32 = aco_opcode::v_cmp_o_f32;                                                         \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->size = sz;                                                                             \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
      // clang-format on
   default: return false;
   }
}
1797
1798
/* Returns the ordered comparison opcode recorded by get_cmp_info() for `op`,
 * or aco_opcode::num_opcodes if get_cmp_info() does not handle `op`. */
aco_opcode
get_ordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.ordered;
}
1804
1805
/* Returns the unordered comparison opcode recorded by get_cmp_info() for `op`,
 * or aco_opcode::num_opcodes if get_cmp_info() does not handle `op`. */
aco_opcode
get_unordered(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.unordered;
}
1811
1812
/* Returns the inverse opcode recorded by get_cmp_info() for `op`,
 * or aco_opcode::num_opcodes if get_cmp_info() does not handle `op`. */
aco_opcode
get_inverse(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.inverse;
}
1818
1819
/* Returns the 32-bit form of the comparison `op` as recorded by get_cmp_info(),
 * or aco_opcode::num_opcodes if get_cmp_info() does not handle `op`. */
aco_opcode
get_f32_cmp(aco_opcode op)
{
   CmpInfo cmp;
   if (!get_cmp_info(op, &cmp))
      return aco_opcode::num_opcodes;
   return cmp.f32;
}
1825
1826
unsigned
1827
get_cmp_bitsize(aco_opcode op)
1828
{
1829
CmpInfo info;
1830
return get_cmp_info(op, &info) ? info.size : 0;
1831
}
1832
1833
bool
1834
is_cmp(aco_opcode op)
1835
{
1836
CmpInfo info;
1837
return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes;
1838
}
1839
1840
unsigned
1841
original_temp_id(opt_ctx& ctx, Temp tmp)
1842
{
1843
if (ctx.info[tmp.id()].is_temp())
1844
return ctx.info[tmp.id()].temp.id();
1845
else
1846
return tmp.id();
1847
}
1848
1849
void
1850
decrease_uses(opt_ctx& ctx, Instruction* instr)
1851
{
1852
if (!--ctx.uses[instr->definitions[0].tempId()]) {
1853
for (const Operand& op : instr->operands) {
1854
if (op.isTemp())
1855
ctx.uses[op.tempId()]--;
1856
}
1857
}
1858
}
1859
1860
/* Returns the instruction recorded for the temporary `op`, or nullptr if it
 * cannot be used.
 *
 * nullptr is returned when `op` is not a temporary, when its label is not one
 * of instr_usedef_labels, when it has more than one use (unless `ignore_uses`
 * is set), or when the recorded instruction has two definitions and the second
 * one is a temporary that is still used.
 */
Instruction*
follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
{
   if (!op.isTemp())
      return nullptr;

   const unsigned id = op.tempId();
   if ((ctx.info[id].label & instr_usedef_labels) == 0)
      return nullptr;
   if (ctx.uses[id] > 1 && !ignore_uses)
      return nullptr;

   Instruction* def_instr = ctx.info[id].instr;
   if (def_instr->definitions.size() != 2)
      return def_instr;

   assert(def_instr->definitions[0].isTemp() && def_instr->definitions[0].tempId() == op.tempId());
   /* a second definition that is still used prevents reuse of the instruction */
   const Definition& second_def = def_instr->definitions[1];
   if (second_def.isTemp() && ctx.uses[second_def.tempId()])
      return nullptr;

   return def_instr;
}
1878
1879
/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
1880
* s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
1881
bool
1882
combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1883
{
1884
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
1885
return false;
1886
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1887
return false;
1888
1889
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
1890
1891
bool neg[2] = {false, false};
1892
bool abs[2] = {false, false};
1893
uint8_t opsel = 0;
1894
Instruction* op_instr[2];
1895
Temp op[2];
1896
1897
unsigned bitsize = 0;
1898
for (unsigned i = 0; i < 2; i++) {
1899
op_instr[i] = follow_operand(ctx, instr->operands[i], true);
1900
if (!op_instr[i])
1901
return false;
1902
1903
aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
1904
unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
1905
1906
if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
1907
return false;
1908
if (bitsize && op_bitsize != bitsize)
1909
return false;
1910
if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
1911
return false;
1912
1913
if (op_instr[i]->isVOP3()) {
1914
VOP3_instruction& vop3 = op_instr[i]->vop3();
1915
if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
1916
vop3.opsel == 2)
1917
return false;
1918
neg[i] = vop3.neg[0];
1919
abs[i] = vop3.abs[0];
1920
opsel |= (vop3.opsel & 1) << i;
1921
} else if (op_instr[i]->isSDWA()) {
1922
return false;
1923
}
1924
1925
Temp op0 = op_instr[i]->operands[0].getTemp();
1926
Temp op1 = op_instr[i]->operands[1].getTemp();
1927
if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
1928
return false;
1929
1930
op[i] = op1;
1931
bitsize = op_bitsize;
1932
}
1933
1934
if (op[1].type() == RegType::sgpr)
1935
std::swap(op[0], op[1]);
1936
unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
1937
if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
1938
return false;
1939
1940
ctx.uses[op[0].id()]++;
1941
ctx.uses[op[1].id()]++;
1942
decrease_uses(ctx, op_instr[0]);
1943
decrease_uses(ctx, op_instr[1]);
1944
1945
aco_opcode new_op = aco_opcode::num_opcodes;
1946
switch (bitsize) {
1947
case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
1948
case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
1949
case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
1950
}
1951
Instruction* new_instr;
1952
if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
1953
VOP3_instruction* vop3 =
1954
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
1955
for (unsigned i = 0; i < 2; i++) {
1956
vop3->neg[i] = neg[i];
1957
vop3->abs[i] = abs[i];
1958
}
1959
vop3->opsel = opsel;
1960
new_instr = static_cast<Instruction*>(vop3);
1961
} else {
1962
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
1963
instr->definitions[0].setHint(vcc);
1964
}
1965
new_instr->operands[0] = Operand(op[0]);
1966
new_instr->operands[1] = Operand(op[1]);
1967
new_instr->definitions[0] = instr->definitions[0];
1968
1969
ctx.info[instr->definitions[0].tempId()].label = 0;
1970
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
1971
1972
instr.reset(new_instr);
1973
1974
return true;
1975
}
1976
1977
/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
1978
* s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
1979
bool
1980
combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1981
{
1982
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
1983
return false;
1984
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
1985
return false;
1986
1987
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
1988
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
1989
1990
Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
1991
Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
1992
if (!nan_test || !cmp)
1993
return false;
1994
if (nan_test->isSDWA() || cmp->isSDWA())
1995
return false;
1996
1997
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
1998
std::swap(nan_test, cmp);
1999
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2000
return false;
2001
2002
if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
2003
return false;
2004
2005
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2006
return false;
2007
if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
2008
return false;
2009
2010
unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
2011
unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
2012
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2013
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2014
if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
2015
return false;
2016
if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
2017
return false;
2018
2019
ctx.uses[cmp->operands[0].tempId()]++;
2020
ctx.uses[cmp->operands[1].tempId()]++;
2021
decrease_uses(ctx, nan_test);
2022
decrease_uses(ctx, cmp);
2023
2024
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2025
Instruction* new_instr;
2026
if (cmp->isVOP3()) {
2027
VOP3_instruction* new_vop3 =
2028
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
2029
VOP3_instruction& cmp_vop3 = cmp->vop3();
2030
memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
2031
memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
2032
new_vop3->clamp = cmp_vop3.clamp;
2033
new_vop3->omod = cmp_vop3.omod;
2034
new_vop3->opsel = cmp_vop3.opsel;
2035
new_instr = new_vop3;
2036
} else {
2037
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
2038
instr->definitions[0].setHint(vcc);
2039
}
2040
new_instr->operands[0] = cmp->operands[0];
2041
new_instr->operands[1] = cmp->operands[1];
2042
new_instr->definitions[0] = instr->definitions[0];
2043
2044
ctx.info[instr->definitions[0].tempId()].label = 0;
2045
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2046
2047
instr.reset(new_instr);
2048
2049
return true;
2050
}
2051
2052
bool
2053
is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
2054
{
2055
if (op.isConstant()) {
2056
*value = op.constantValue64();
2057
return true;
2058
} else if (op.isTemp()) {
2059
unsigned id = original_temp_id(ctx, op.getTemp());
2060
if (!ctx.info[id].is_constant_or_literal(bit_size))
2061
return false;
2062
*value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
2063
return true;
2064
}
2065
return false;
2066
}
2067
2068
/* Tests whether the bit pattern `value` encodes a NaN: all exponent bits set
 * and a non-zero mantissa. A `bit_size` of 16 uses a 5-bit exponent over a
 * 10-bit mantissa, 32 uses 8 over 23, and every other size is treated as
 * 11 over 52. */
bool
is_constant_nan(uint64_t value, unsigned bit_size)
{
   unsigned mantissa_bits;
   unsigned exponent_bits;
   switch (bit_size) {
   case 16:
      mantissa_bits = 10;
      exponent_bits = 5;
      break;
   case 32:
      mantissa_bits = 23;
      exponent_bits = 8;
      break;
   default:
      mantissa_bits = 52;
      exponent_bits = 11;
      break;
   }

   const uint64_t exponent_mask = (UINT64_C(1) << exponent_bits) - 1;
   const uint64_t mantissa_mask = (UINT64_C(1) << mantissa_bits) - 1;

   const bool exponent_all_ones = ((value >> mantissa_bits) & exponent_mask) == exponent_mask;
   const bool mantissa_nonzero = (value & mantissa_mask) != 0;
   return exponent_all_ones && mantissa_nonzero;
}
2078
2079
/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
2080
* s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
2081
bool
2082
combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2083
{
2084
if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2085
return false;
2086
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2087
return false;
2088
2089
bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2090
2091
Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2092
Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2093
2094
if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA())
2095
return false;
2096
if (nan_test->isSDWA() || cmp->isSDWA())
2097
return false;
2098
2099
aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2100
if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2101
std::swap(nan_test, cmp);
2102
else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2103
return false;
2104
2105
unsigned bit_size = get_cmp_bitsize(cmp->opcode);
2106
if (!is_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
2107
return false;
2108
2109
if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2110
return false;
2111
if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
2112
return false;
2113
2114
unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2115
unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2116
if (prop_nan0 != prop_nan1)
2117
return false;
2118
2119
if (nan_test->isVOP3()) {
2120
VOP3_instruction& vop3 = nan_test->vop3();
2121
if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
2122
vop3.opsel == 2)
2123
return false;
2124
}
2125
2126
int constant_operand = -1;
2127
for (unsigned i = 0; i < 2; i++) {
2128
if (cmp->operands[i].isTemp() &&
2129
original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
2130
constant_operand = !i;
2131
break;
2132
}
2133
}
2134
if (constant_operand == -1)
2135
return false;
2136
2137
uint64_t constant_value;
2138
if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
2139
return false;
2140
if (is_constant_nan(constant_value, bit_size))
2141
return false;
2142
2143
if (cmp->operands[0].isTemp())
2144
ctx.uses[cmp->operands[0].tempId()]++;
2145
if (cmp->operands[1].isTemp())
2146
ctx.uses[cmp->operands[1].tempId()]++;
2147
decrease_uses(ctx, nan_test);
2148
decrease_uses(ctx, cmp);
2149
2150
aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2151
Instruction* new_instr;
2152
if (cmp->isVOP3()) {
2153
VOP3_instruction* new_vop3 =
2154
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
2155
VOP3_instruction& cmp_vop3 = cmp->vop3();
2156
memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
2157
memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
2158
new_vop3->clamp = cmp_vop3.clamp;
2159
new_vop3->omod = cmp_vop3.omod;
2160
new_vop3->opsel = cmp_vop3.opsel;
2161
new_instr = new_vop3;
2162
} else {
2163
new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
2164
instr->definitions[0].setHint(vcc);
2165
}
2166
new_instr->operands[0] = cmp->operands[0];
2167
new_instr->operands[1] = cmp->operands[1];
2168
new_instr->definitions[0] = instr->definitions[0];
2169
2170
ctx.info[instr->definitions[0].tempId()].label = 0;
2171
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2172
2173
instr.reset(new_instr);
2174
2175
return true;
2176
}
2177
2178
/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */
2179
bool
2180
combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2181
{
2182
if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)
2183
return false;
2184
if (ctx.uses[instr->definitions[1].tempId()])
2185
return false;
2186
2187
Instruction* cmp = follow_operand(ctx, instr->operands[1]);
2188
if (!cmp)
2189
return false;
2190
2191
aco_opcode new_opcode = get_inverse(cmp->opcode);
2192
if (new_opcode == aco_opcode::num_opcodes)
2193
return false;
2194
2195
if (cmp->operands[0].isTemp())
2196
ctx.uses[cmp->operands[0].tempId()]++;
2197
if (cmp->operands[1].isTemp())
2198
ctx.uses[cmp->operands[1].tempId()]++;
2199
decrease_uses(ctx, cmp);
2200
2201
/* This creates a new instruction instead of modifying the existing
2202
* comparison so that the comparison is done with the correct exec mask. */
2203
Instruction* new_instr;
2204
if (cmp->isVOP3()) {
2205
VOP3_instruction* new_vop3 =
2206
create_instruction<VOP3_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
2207
VOP3_instruction& cmp_vop3 = cmp->vop3();
2208
memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
2209
memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
2210
new_vop3->clamp = cmp_vop3.clamp;
2211
new_vop3->omod = cmp_vop3.omod;
2212
new_vop3->opsel = cmp_vop3.opsel;
2213
new_instr = new_vop3;
2214
} else if (cmp->isSDWA()) {
2215
SDWA_instruction* new_sdwa = create_instruction<SDWA_instruction>(
2216
new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1);
2217
SDWA_instruction& cmp_sdwa = cmp->sdwa();
2218
memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs));
2219
memcpy(new_sdwa->sel, cmp_sdwa.sel, sizeof(new_sdwa->sel));
2220
memcpy(new_sdwa->neg, cmp_sdwa.neg, sizeof(new_sdwa->neg));
2221
new_sdwa->dst_sel = cmp_sdwa.dst_sel;
2222
new_sdwa->dst_preserve = cmp_sdwa.dst_preserve;
2223
new_sdwa->clamp = cmp_sdwa.clamp;
2224
new_sdwa->omod = cmp_sdwa.omod;
2225
new_instr = new_sdwa;
2226
} else {
2227
new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
2228
instr->definitions[0].setHint(vcc);
2229
}
2230
new_instr->operands[0] = cmp->operands[0];
2231
new_instr->operands[1] = cmp->operands[1];
2232
new_instr->definitions[0] = instr->definitions[0];
2233
2234
ctx.info[instr->definitions[0].tempId()].label = 0;
2235
ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2236
2237
instr.reset(new_instr);
2238
2239
return true;
2240
}
2241
2242
/* op1(op2(1, 2), 0) if swap = false
2243
* op1(0, op2(1, 2)) if swap = true */
2244
bool
2245
match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
2246
const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3],
2247
uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
2248
bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
2249
{
2250
/* checks */
2251
if (op1_instr->opcode != op1)
2252
return false;
2253
2254
Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
2255
if (!op2_instr || op2_instr->opcode != op2)
2256
return false;
2257
if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
2258
return false;
2259
2260
VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL;
2261
VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL;
2262
2263
if (op1_instr->isSDWA() || op2_instr->isSDWA())
2264
return false;
2265
2266
/* don't support inbetween clamp/omod */
2267
if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
2268
return false;
2269
2270
/* get operands and modifiers and check inbetween modifiers */
2271
*op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
2272
*op1_omod = op1_vop3 ? op1_vop3->omod : 0u;
2273
2274
if (inbetween_neg)
2275
*inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
2276
else if (op1_vop3 && op1_vop3->neg[swap])
2277
return false;
2278
2279
if (inbetween_abs)
2280
*inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
2281
else if (op1_vop3 && op1_vop3->abs[swap])
2282
return false;
2283
2284
if (inbetween_opsel)
2285
*inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false;
2286
else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap))
2287
return false;
2288
2289
*precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
2290
2291
int shuffle[3];
2292
shuffle[shuffle_str[0] - '0'] = 0;
2293
shuffle[shuffle_str[1] - '0'] = 1;
2294
shuffle[shuffle_str[2] - '0'] = 2;
2295
2296
operands[shuffle[0]] = op1_instr->operands[!swap];
2297
neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
2298
abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
2299
if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap)))
2300
*opsel |= 1 << shuffle[0];
2301
2302
for (unsigned i = 0; i < 2; i++) {
2303
operands[shuffle[i + 1]] = op2_instr->operands[i];
2304
neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
2305
abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
2306
if (op2_vop3 && op2_vop3->opsel & (1 << i))
2307
*opsel |= 1 << shuffle[i + 1];
2308
}
2309
2310
/* check operands */
2311
if (!check_vop3_operands(ctx, 3, operands))
2312
return false;
2313
2314
return true;
2315
}
2316
2317
void
2318
create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
2319
Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp,
2320
unsigned omod)
2321
{
2322
VOP3_instruction* new_instr = create_instruction<VOP3_instruction>(opcode, Format::VOP3, 3, 1);
2323
memcpy(new_instr->abs, abs, sizeof(bool[3]));
2324
memcpy(new_instr->neg, neg, sizeof(bool[3]));
2325
new_instr->clamp = clamp;
2326
new_instr->omod = omod;
2327
new_instr->opsel = opsel;
2328
new_instr->operands[0] = operands[0];
2329
new_instr->operands[1] = operands[1];
2330
new_instr->operands[2] = operands[2];
2331
new_instr->definitions[0] = instr->definitions[0];
2332
ctx.info[instr->definitions[0].tempId()].label = 0;
2333
2334
instr.reset(new_instr);
2335
}
2336
2337
bool
2338
combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
2339
const char* shuffle, uint8_t ops)
2340
{
2341
for (unsigned swap = 0; swap < 2; swap++) {
2342
if (!((1 << swap) & ops))
2343
continue;
2344
2345
Operand operands[3];
2346
bool neg[3], abs[3], clamp, precise;
2347
uint8_t opsel = 0, omod = 0;
2348
if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
2349
abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
2350
ctx.uses[instr->operands[swap].tempId()]--;
2351
create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
2352
return true;
2353
}
2354
}
2355
return false;
2356
}
2357
2358
/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
2359
bool
2360
combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2361
{
2362
bool is_or = instr->opcode == aco_opcode::v_or_b32;
2363
aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
2364
2365
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
2366
"120", 1 | 2))
2367
return true;
2368
if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
2369
"120", 1 | 2))
2370
return true;
2371
if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
2372
return true;
2373
if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
2374
return true;
2375
2376
if (instr->isSDWA())
2377
return false;
2378
2379
/* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2380
* v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2381
* v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
2382
* v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
2383
*/
2384
for (unsigned i = 0; i < 2; i++) {
2385
Instruction* extins = follow_operand(ctx, instr->operands[i]);
2386
if (!extins)
2387
continue;
2388
2389
aco_opcode op;
2390
Operand operands[3];
2391
2392
if (extins->opcode == aco_opcode::p_insert &&
2393
(extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
2394
op = new_op_lshl;
2395
operands[1] =
2396
Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
2397
} else if (is_or &&
2398
(extins->opcode == aco_opcode::p_insert ||
2399
(extins->opcode == aco_opcode::p_extract &&
2400
extins->operands[3].constantEquals(0))) &&
2401
extins->operands[1].constantEquals(0)) {
2402
op = aco_opcode::v_and_or_b32;
2403
operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
2404
} else {
2405
continue;
2406
}
2407
2408
operands[0] = extins->operands[0];
2409
operands[2] = instr->operands[!i];
2410
2411
if (!check_vop3_operands(ctx, 3, operands))
2412
continue;
2413
2414
bool neg[3] = {}, abs[3] = {};
2415
uint8_t opsel = 0, omod = 0;
2416
bool clamp = false;
2417
if (instr->isVOP3())
2418
clamp = instr->vop3().clamp;
2419
2420
ctx.uses[instr->operands[i].tempId()]--;
2421
create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
2422
return true;
2423
}
2424
2425
return false;
2426
}
2427
2428
bool
2429
combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
2430
{
2431
/* TODO: this can handle SDWA min/max instructions by using opsel */
2432
if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
2433
return true;
2434
2435
/* min(-max(a, b), c) -> min3(c, -a, -b) *
2436
* max(-min(a, b), c) -> max3(c, -a, -b) */
2437
for (unsigned swap = 0; swap < 2; swap++) {
2438
Operand operands[3];
2439
bool neg[3], abs[3], clamp, precise;
2440
uint8_t opsel = 0, omod = 0;
2441
bool inbetween_neg;
2442
if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg,
2443
abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
2444
inbetween_neg) {
2445
ctx.uses[instr->operands[swap].tempId()]--;
2446
neg[1] = !neg[1];
2447
neg[2] = !neg[2];
2448
create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
2449
return true;
2450
}
2451
}
2452
return false;
2453
}
2454
2455
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
2456
* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
2457
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
2458
* s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
2459
* s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
2460
* s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
2461
bool
2462
combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2463
{
2464
/* checks */
2465
if (!instr->operands[0].isTemp())
2466
return false;
2467
if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2468
return false;
2469
2470
Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
2471
if (!op2_instr)
2472
return false;
2473
switch (op2_instr->opcode) {
2474
case aco_opcode::s_and_b32:
2475
case aco_opcode::s_or_b32:
2476
case aco_opcode::s_xor_b32:
2477
case aco_opcode::s_and_b64:
2478
case aco_opcode::s_or_b64:
2479
case aco_opcode::s_xor_b64: break;
2480
default: return false;
2481
}
2482
2483
/* create instruction */
2484
std::swap(instr->definitions[0], op2_instr->definitions[0]);
2485
std::swap(instr->definitions[1], op2_instr->definitions[1]);
2486
ctx.uses[instr->operands[0].tempId()]--;
2487
ctx.info[op2_instr->definitions[0].tempId()].label = 0;
2488
2489
switch (op2_instr->opcode) {
2490
case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
2491
case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
2492
case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
2493
case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
2494
case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
2495
case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
2496
default: break;
2497
}
2498
2499
return true;
2500
}
2501
2502
/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
2503
* s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
2504
* s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
2505
* s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
2506
bool
2507
combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2508
{
2509
if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
2510
return false;
2511
2512
for (unsigned i = 0; i < 2; i++) {
2513
Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
2514
if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
2515
op2_instr->opcode != aco_opcode::s_not_b64))
2516
continue;
2517
if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
2518
continue;
2519
2520
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2521
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2522
continue;
2523
2524
ctx.uses[instr->operands[i].tempId()]--;
2525
instr->operands[0] = instr->operands[!i];
2526
instr->operands[1] = op2_instr->operands[0];
2527
ctx.info[instr->definitions[0].tempId()].label = 0;
2528
2529
switch (instr->opcode) {
2530
case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
2531
case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
2532
case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
2533
case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
2534
default: break;
2535
}
2536
2537
return true;
2538
}
2539
return false;
2540
}
2541
2542
/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
2543
bool
2544
combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2545
{
2546
if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
2547
return false;
2548
2549
for (unsigned i = 0; i < 2; i++) {
2550
Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
2551
if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
2552
ctx.uses[op2_instr->definitions[1].tempId()])
2553
continue;
2554
if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
2555
continue;
2556
2557
uint32_t shift = op2_instr->operands[1].constantValue();
2558
if (shift < 1 || shift > 4)
2559
continue;
2560
2561
if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2562
instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2563
continue;
2564
2565
ctx.uses[instr->operands[i].tempId()]--;
2566
instr->operands[1] = instr->operands[!i];
2567
instr->operands[0] = op2_instr->operands[0];
2568
ctx.info[instr->definitions[0].tempId()].label = 0;
2569
2570
instr->opcode = std::array<aco_opcode, 4>{
2571
aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
2572
aco_opcode::s_lshl4_add_u32}[shift - 1];
2573
2574
return true;
2575
}
2576
return false;
2577
}
2578
2579
bool
2580
combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
2581
{
2582
if (instr->usesModifiers())
2583
return false;
2584
2585
for (unsigned i = 0; i < 2; i++) {
2586
if (!((1 << i) & ops))
2587
continue;
2588
if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
2589
ctx.uses[instr->operands[i].tempId()] == 1) {
2590
2591
aco_ptr<Instruction> new_instr;
2592
if (instr->operands[!i].isTemp() &&
2593
instr->operands[!i].getTemp().type() == RegType::vgpr) {
2594
new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
2595
} else if (ctx.program->chip_class >= GFX10 ||
2596
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
2597
new_instr.reset(
2598
create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
2599
} else {
2600
return false;
2601
}
2602
ctx.uses[instr->operands[i].tempId()]--;
2603
new_instr->definitions[0] = instr->definitions[0];
2604
if (instr->definitions.size() == 2) {
2605
new_instr->definitions[1] = instr->definitions[1];
2606
} else {
2607
new_instr->definitions[1] =
2608
Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
2609
/* Make sure the uses vector is large enough and the number of
2610
* uses properly initialized to 0.
2611
*/
2612
ctx.uses.push_back(0);
2613
}
2614
new_instr->definitions[1].setHint(vcc);
2615
new_instr->operands[0] = Operand::zero();
2616
new_instr->operands[1] = instr->operands[!i];
2617
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
2618
instr = std::move(new_instr);
2619
ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
2620
return true;
2621
}
2622
}
2623
2624
return false;
2625
}
2626
2627
bool
2628
combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2629
{
2630
if (instr->usesModifiers())
2631
return false;
2632
2633
for (unsigned i = 0; i < 2; i++) {
2634
Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
2635
if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
2636
op_instr->operands[0].isTemp() &&
2637
op_instr->operands[0].getTemp().type() == RegType::vgpr &&
2638
op_instr->operands[1].constantEquals(0)) {
2639
aco_ptr<Instruction> new_instr{
2640
create_instruction<VOP3_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
2641
ctx.uses[instr->operands[i].tempId()]--;
2642
new_instr->operands[0] = op_instr->operands[0];
2643
new_instr->operands[1] = instr->operands[!i];
2644
new_instr->definitions[0] = instr->definitions[0];
2645
instr = std::move(new_instr);
2646
ctx.info[instr->definitions[0].tempId()].label = 0;
2647
2648
return true;
2649
}
2650
}
2651
2652
return false;
2653
}
2654
2655
bool
2656
get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
2657
aco_opcode* med3, bool* some_gfx9_only)
2658
{
2659
switch (op) {
2660
#define MINMAX(type, gfx9) \
2661
case aco_opcode::v_min_##type: \
2662
case aco_opcode::v_max_##type: \
2663
case aco_opcode::v_med3_##type: \
2664
*min = aco_opcode::v_min_##type; \
2665
*max = aco_opcode::v_max_##type; \
2666
*med3 = aco_opcode::v_med3_##type; \
2667
*min3 = aco_opcode::v_min3_##type; \
2668
*max3 = aco_opcode::v_max3_##type; \
2669
*some_gfx9_only = gfx9; \
2670
return true;
2671
MINMAX(f32, false)
2672
MINMAX(u32, false)
2673
MINMAX(i32, false)
2674
MINMAX(f16, true)
2675
MINMAX(u16, true)
2676
MINMAX(i16, true)
2677
#undef MINMAX
2678
default: return false;
2679
}
2680
}
2681
2682
/* when ub > lb:
2683
* v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
2684
* v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
2685
*/
2686
bool
2687
combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
2688
aco_opcode med)
2689
{
2690
/* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
2691
* FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
2692
* minVal > maxVal, which means we can always select it to a v_med3_f32 */
2693
aco_opcode other_op;
2694
if (instr->opcode == min)
2695
other_op = max;
2696
else if (instr->opcode == max)
2697
other_op = min;
2698
else
2699
return false;
2700
2701
for (unsigned swap = 0; swap < 2; swap++) {
2702
Operand operands[3];
2703
bool neg[3], abs[3], clamp, precise;
2704
uint8_t opsel = 0, omod = 0;
2705
if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
2706
abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
2707
/* max(min(src, upper), lower) returns upper if src is NaN, but
2708
* med3(src, lower, upper) returns lower.
2709
*/
2710
if (precise && instr->opcode != min)
2711
continue;
2712
2713
int const0_idx = -1, const1_idx = -1;
2714
uint32_t const0 = 0, const1 = 0;
2715
for (int i = 0; i < 3; i++) {
2716
uint32_t val;
2717
if (operands[i].isConstant()) {
2718
val = operands[i].constantValue();
2719
} else if (operands[i].isTemp() &&
2720
ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
2721
val = ctx.info[operands[i].tempId()].val;
2722
} else {
2723
continue;
2724
}
2725
if (const0_idx >= 0) {
2726
const1_idx = i;
2727
const1 = val;
2728
} else {
2729
const0_idx = i;
2730
const0 = val;
2731
}
2732
}
2733
if (const0_idx < 0 || const1_idx < 0)
2734
continue;
2735
2736
if (opsel & (1 << const0_idx))
2737
const0 >>= 16;
2738
if (opsel & (1 << const1_idx))
2739
const1 >>= 16;
2740
2741
int lower_idx = const0_idx;
2742
switch (min) {
2743
case aco_opcode::v_min_f32:
2744
case aco_opcode::v_min_f16: {
2745
float const0_f, const1_f;
2746
if (min == aco_opcode::v_min_f32) {
2747
memcpy(&const0_f, &const0, 4);
2748
memcpy(&const1_f, &const1, 4);
2749
} else {
2750
const0_f = _mesa_half_to_float(const0);
2751
const1_f = _mesa_half_to_float(const1);
2752
}
2753
if (abs[const0_idx])
2754
const0_f = fabsf(const0_f);
2755
if (abs[const1_idx])
2756
const1_f = fabsf(const1_f);
2757
if (neg[const0_idx])
2758
const0_f = -const0_f;
2759
if (neg[const1_idx])
2760
const1_f = -const1_f;
2761
lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
2762
break;
2763
}
2764
case aco_opcode::v_min_u32: {
2765
lower_idx = const0 < const1 ? const0_idx : const1_idx;
2766
break;
2767
}
2768
case aco_opcode::v_min_u16: {
2769
lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
2770
break;
2771
}
2772
case aco_opcode::v_min_i32: {
2773
int32_t const0_i =
2774
const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
2775
int32_t const1_i =
2776
const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
2777
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
2778
break;
2779
}
2780
case aco_opcode::v_min_i16: {
2781
int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
2782
int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
2783
lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
2784
break;
2785
}
2786
default: break;
2787
}
2788
int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
2789
2790
if (instr->opcode == min) {
2791
if (upper_idx != 0 || lower_idx == 0)
2792
return false;
2793
} else {
2794
if (upper_idx == 0 || lower_idx != 0)
2795
return false;
2796
}
2797
2798
ctx.uses[instr->operands[swap].tempId()]--;
2799
create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
2800
2801
return true;
2802
}
2803
}
2804
2805
return false;
2806
}
2807
2808
void
2809
apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2810
{
2811
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
2812
instr->opcode == aco_opcode::v_lshrrev_b64 ||
2813
instr->opcode == aco_opcode::v_ashrrev_i64;
2814
2815
/* find candidates and create the set of sgprs already read */
2816
unsigned sgpr_ids[2] = {0, 0};
2817
uint32_t operand_mask = 0;
2818
bool has_literal = false;
2819
for (unsigned i = 0; i < instr->operands.size(); i++) {
2820
if (instr->operands[i].isLiteral())
2821
has_literal = true;
2822
if (!instr->operands[i].isTemp())
2823
continue;
2824
if (instr->operands[i].getTemp().type() == RegType::sgpr) {
2825
if (instr->operands[i].tempId() != sgpr_ids[0])
2826
sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
2827
}
2828
ssa_info& info = ctx.info[instr->operands[i].tempId()];
2829
if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)
2830
operand_mask |= 1u << i;
2831
if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
2832
operand_mask |= 1u << i;
2833
}
2834
unsigned max_sgprs = 1;
2835
if (ctx.program->chip_class >= GFX10 && !is_shift64)
2836
max_sgprs = 2;
2837
if (has_literal)
2838
max_sgprs--;
2839
2840
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
2841
2842
/* keep on applying sgprs until there is nothing left to be done */
2843
while (operand_mask) {
2844
uint32_t sgpr_idx = 0;
2845
uint32_t sgpr_info_id = 0;
2846
uint32_t mask = operand_mask;
2847
/* choose a sgpr */
2848
while (mask) {
2849
unsigned i = u_bit_scan(&mask);
2850
uint16_t uses = ctx.uses[instr->operands[i].tempId()];
2851
if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
2852
sgpr_idx = i;
2853
sgpr_info_id = instr->operands[i].tempId();
2854
}
2855
}
2856
operand_mask &= ~(1u << sgpr_idx);
2857
2858
ssa_info& info = ctx.info[sgpr_info_id];
2859
2860
/* Applying two sgprs require making it VOP3, so don't do it unless it's
2861
* definitively beneficial.
2862
* TODO: this is too conservative because later the use count could be reduced to 1 */
2863
if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
2864
!instr->isSDWA() && instr->format != Format::VOP3P)
2865
break;
2866
2867
Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
2868
bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
2869
if (new_sgpr && num_sgprs >= max_sgprs)
2870
continue;
2871
2872
if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
2873
info.is_extract()) {
2874
/* can_apply_extract() checks SGPR encoding restrictions */
2875
if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
2876
apply_extract(ctx, instr, sgpr_idx, info);
2877
else if (info.is_extract())
2878
continue;
2879
instr->operands[sgpr_idx] = Operand(sgpr);
2880
} else if (can_swap_operands(instr)) {
2881
instr->operands[sgpr_idx] = instr->operands[0];
2882
instr->operands[0] = Operand(sgpr);
2883
/* swap bits using a 4-entry LUT */
2884
uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
2885
operand_mask = (operand_mask & ~0x3) | swapped;
2886
} else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
2887
to_VOP3(ctx, instr);
2888
instr->operands[sgpr_idx] = Operand(sgpr);
2889
} else {
2890
continue;
2891
}
2892
2893
if (new_sgpr)
2894
sgpr_ids[num_sgprs++] = sgpr.id();
2895
ctx.uses[sgpr_info_id]--;
2896
ctx.uses[sgpr.id()]++;
2897
2898
/* TODO: handle when it's a VGPR */
2899
if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
2900
ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
2901
operand_mask |= 1u << sgpr_idx;
2902
}
2903
}
2904
2905
template <typename T>
2906
bool
2907
apply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info)
2908
{
2909
if (!def_info.is_clamp() && (instr->clamp || instr->omod))
2910
return false;
2911
2912
if (def_info.is_omod2())
2913
instr->omod = 1;
2914
else if (def_info.is_omod4())
2915
instr->omod = 2;
2916
else if (def_info.is_omod5())
2917
instr->omod = 3;
2918
else if (def_info.is_clamp())
2919
instr->clamp = true;
2920
2921
return true;
2922
}
2923
2924
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
2925
bool
2926
apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2927
{
2928
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
2929
!instr_info.can_use_output_modifiers[(int)instr->opcode])
2930
return false;
2931
2932
bool can_vop3 = can_use_VOP3(ctx, instr);
2933
if (!instr->isSDWA() && !can_vop3)
2934
return false;
2935
2936
/* omod flushes -0 to +0 and has no effect if denormals are enabled */
2937
bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */
2938
if (instr->definitions[0].bytes() == 4)
2939
can_use_omod =
2940
can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;
2941
else
2942
can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
2943
!ctx.fp_mode.preserve_signed_zero_inf_nan16_64;
2944
2945
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
2946
2947
uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
2948
if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
2949
return false;
2950
/* if the omod/clamp instruction is dead, then the single user of this
2951
* instruction is a different instruction */
2952
if (!ctx.uses[def_info.instr->definitions[0].tempId()])
2953
return false;
2954
2955
/* MADs/FMAs are created later, so we don't have to update the original add */
2956
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
2957
2958
if (instr->isSDWA()) {
2959
if (!apply_omod_clamp_helper(ctx, &instr->sdwa(), def_info))
2960
return false;
2961
} else {
2962
to_VOP3(ctx, instr);
2963
if (!apply_omod_clamp_helper(ctx, &instr->vop3(), def_info))
2964
return false;
2965
}
2966
2967
instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
2968
ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert;
2969
ctx.uses[def_info.instr->definitions[0].tempId()]--;
2970
2971
return true;
2972
}
2973
2974
/* Combine an p_insert (or p_extract, in some cases) instruction with instr.
2975
* p_insert(instr(...)) -> instr_insert().
2976
*/
2977
bool
2978
apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2979
{
2980
if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
2981
return false;
2982
2983
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
2984
if (!def_info.is_insert())
2985
return false;
2986
/* if the insert instruction is dead, then the single user of this
2987
* instruction is a different instruction */
2988
if (!ctx.uses[def_info.instr->definitions[0].tempId()])
2989
return false;
2990
2991
/* MADs/FMAs are created later, so we don't have to update the original add */
2992
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
2993
2994
unsigned sel = parse_insert(def_info.instr);
2995
2996
if (instr->isVOP3() && (sel & sdwa_isword) && !(sel & sdwa_sext) &&
2997
can_use_opsel(ctx.program->chip_class, instr->opcode, 3, (sel & sdwa_wordnum))) {
2998
if (instr->vop3().opsel & (1 << 3))
2999
return false;
3000
if (sel & sdwa_wordnum)
3001
instr->vop3().opsel |= 1 << 3;
3002
} else {
3003
if (!can_use_SDWA(ctx.program->chip_class, instr, true))
3004
return false;
3005
3006
to_SDWA(ctx, instr);
3007
if ((static_cast<SDWA_instruction*>(instr.get())->dst_sel & sdwa_asuint) != sdwa_udword)
3008
return false;
3009
static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;
3010
}
3011
3012
instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3013
ctx.info[instr->definitions[0].tempId()].label = 0;
3014
ctx.uses[def_info.instr->definitions[0].tempId()]--;
3015
3016
return true;
3017
}
3018
3019
/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
3020
bool
3021
combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3022
{
3023
if (instr->usesModifiers())
3024
return false;
3025
3026
for (unsigned i = 0; i < 2; i++) {
3027
Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
3028
if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
3029
op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
3030
!op_instr->usesModifiers()) {
3031
3032
aco_ptr<Instruction> new_instr;
3033
if (instr->operands[!i].isTemp() &&
3034
instr->operands[!i].getTemp().type() == RegType::vgpr) {
3035
new_instr.reset(
3036
create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
3037
} else if (ctx.program->chip_class >= GFX10 ||
3038
(instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3039
new_instr.reset(create_instruction<VOP3_instruction>(aco_opcode::v_cndmask_b32,
3040
asVOP3(Format::VOP2), 3, 1));
3041
} else {
3042
return false;
3043
}
3044
3045
ctx.uses[instr->operands[i].tempId()]--;
3046
if (ctx.uses[instr->operands[i].tempId()])
3047
ctx.uses[op_instr->operands[2].tempId()]++;
3048
3049
new_instr->operands[0] = Operand::zero();
3050
new_instr->operands[1] = instr->operands[!i];
3051
new_instr->operands[2] = Operand(op_instr->operands[2]);
3052
new_instr->definitions[0] = instr->definitions[0];
3053
instr = std::move(new_instr);
3054
ctx.info[instr->definitions[0].tempId()].label = 0;
3055
return true;
3056
}
3057
}
3058
3059
return false;
3060
}
3061
3062
/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
3063
* v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c) */
3064
bool
3065
combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3066
{
3067
if (instr->usesModifiers())
3068
return false;
3069
3070
for (unsigned i = 0; i < 2; i++) {
3071
Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
3072
if (!op_instr)
3073
continue;
3074
3075
if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
3076
op_instr->opcode != aco_opcode::v_lshlrev_b32)
3077
continue;
3078
3079
if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && op_instr->operands[1].isTemp() &&
3080
op_instr->operands[1].getTemp().type() == RegType::sgpr && instr->operands[!i].isTemp() &&
3081
instr->operands[!i].getTemp().type() == RegType::sgpr)
3082
return false;
3083
3084
int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
3085
if (op_instr->operands[shift_op_idx].isConstant() &&
3086
op_instr->operands[shift_op_idx].constantValue() <= 6 && /* no literals */
3087
(op_instr->operands[!shift_op_idx].is24bit() ||
3088
op_instr->operands[!shift_op_idx].is16bit())) {
3089
uint32_t multiplier = 1 << op_instr->operands[shift_op_idx].constantValue();
3090
3091
ctx.uses[instr->operands[i].tempId()]--;
3092
3093
aco_ptr<VOP3_instruction> new_instr{
3094
create_instruction<VOP3_instruction>(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)};
3095
new_instr->operands[0] = op_instr->operands[!shift_op_idx];
3096
new_instr->operands[1] = Operand::c32(multiplier);
3097
new_instr->operands[2] = instr->operands[!i];
3098
new_instr->definitions[0] = instr->definitions[0];
3099
instr = std::move(new_instr);
3100
ctx.info[instr->definitions[0].tempId()].label = 0;
3101
return true;
3102
}
3103
}
3104
3105
return false;
3106
}
3107
3108
void
3109
propagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi)
3110
{
3111
/* propagate swizzles which apply to a result down to the instruction's operands:
3112
* result = a.xy + b.xx -> result.yx = a.yx + b.xx */
3113
assert((opsel_lo & 1) == opsel_lo);
3114
assert((opsel_hi & 1) == opsel_hi);
3115
uint8_t tmp_lo = instr->opsel_lo;
3116
uint8_t tmp_hi = instr->opsel_hi;
3117
bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]};
3118
bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]};
3119
if (opsel_lo == 1) {
3120
instr->opsel_lo = tmp_hi;
3121
for (unsigned i = 0; i < 3; i++)
3122
instr->neg_lo[i] = neg_hi[i];
3123
}
3124
if (opsel_hi == 0) {
3125
instr->opsel_hi = tmp_lo;
3126
for (unsigned i = 0; i < 3; i++)
3127
instr->neg_hi[i] = neg_lo[i];
3128
}
3129
}
3130
3131
void
3132
combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3133
{
3134
VOP3P_instruction* vop3p = &instr->vop3p();
3135
3136
/* apply clamp */
3137
if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
3138
vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1) {
3139
3140
ssa_info& info = ctx.info[instr->operands[0].tempId()];
3141
if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
3142
VOP3P_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->vop3p();
3143
candidate->clamp = true;
3144
propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi);
3145
instr->definitions[0].swapTemp(candidate->definitions[0]);
3146
ctx.info[candidate->definitions[0].tempId()].instr = candidate;
3147
ctx.uses[instr->definitions[0].tempId()]--;
3148
return;
3149
}
3150
}
3151
3152
/* check for fneg modifiers */
3153
if (instr_info.can_use_input_modifiers[(int)instr->opcode]) {
3154
/* at this point, we only have 2-operand instructions */
3155
assert(instr->operands.size() == 2);
3156
for (unsigned i = 0; i < 2; i++) {
3157
Operand& op = instr->operands[i];
3158
if (!op.isTemp())
3159
continue;
3160
3161
ssa_info& info = ctx.info[op.tempId()];
3162
if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
3163
info.instr->operands[1].constantEquals(0xBC00)) {
3164
Operand ops[2] = {instr->operands[!i], info.instr->operands[0]};
3165
if (!check_vop3_operands(ctx, 2, ops))
3166
continue;
3167
3168
VOP3P_instruction* fneg = &info.instr->vop3p();
3169
if (fneg->clamp)
3170
continue;
3171
instr->operands[i] = fneg->operands[0];
3172
3173
/* opsel_lo/hi is either 0 or 1:
3174
* if 0 - pick selection from fneg->lo
3175
* if 1 - pick selection from fneg->hi
3176
*/
3177
bool opsel_lo = vop3p->opsel_lo & (1 << i);
3178
bool opsel_hi = vop3p->opsel_hi & (1 << i);
3179
vop3p->neg_lo[i] ^= true ^ (opsel_lo ? fneg->neg_hi[0] : fneg->neg_lo[0]);
3180
vop3p->neg_hi[i] ^= true ^ (opsel_hi ? fneg->neg_hi[0] : fneg->neg_lo[0]);
3181
vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
3182
vop3p->opsel_hi ^= ((opsel_hi ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
3183
3184
if (--ctx.uses[fneg->definitions[0].tempId()])
3185
ctx.uses[fneg->operands[0].tempId()]++;
3186
}
3187
}
3188
}
3189
3190
if (instr->opcode == aco_opcode::v_pk_add_f16) {
3191
if (instr->definitions[0].isPrecise())
3192
return;
3193
3194
Instruction* mul_instr = nullptr;
3195
unsigned add_op_idx = 0;
3196
uint8_t opsel_lo = 0, opsel_hi = 0;
3197
uint32_t uses = UINT32_MAX;
3198
3199
/* find the 'best' mul instruction to combine with the add */
3200
for (unsigned i = 0; i < 2; i++) {
3201
if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p())
3202
continue;
3203
ssa_info& info = ctx.info[instr->operands[i].tempId()];
3204
if (info.instr->opcode != aco_opcode::v_pk_mul_f16 ||
3205
info.instr->definitions[0].isPrecise())
3206
continue;
3207
3208
Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
3209
if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3210
continue;
3211
3212
/* no clamp allowed between mul and add */
3213
if (info.instr->vop3p().clamp)
3214
continue;
3215
3216
mul_instr = info.instr;
3217
add_op_idx = 1 - i;
3218
opsel_lo = (vop3p->opsel_lo >> i) & 1;
3219
opsel_hi = (vop3p->opsel_hi >> i) & 1;
3220
uses = ctx.uses[instr->operands[i].tempId()];
3221
}
3222
3223
if (!mul_instr)
3224
return;
3225
3226
/* convert to mad */
3227
Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};
3228
ctx.uses[mul_instr->definitions[0].tempId()]--;
3229
if (ctx.uses[mul_instr->definitions[0].tempId()]) {
3230
if (op[0].isTemp())
3231
ctx.uses[op[0].tempId()]++;
3232
if (op[1].isTemp())
3233
ctx.uses[op[1].tempId()]++;
3234
}
3235
3236
/* turn packed mul+add into v_pk_fma_f16 */
3237
assert(mul_instr->isVOP3P());
3238
aco_ptr<VOP3P_instruction> fma{
3239
create_instruction<VOP3P_instruction>(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)};
3240
VOP3P_instruction* mul = &mul_instr->vop3p();
3241
for (unsigned i = 0; i < 2; i++) {
3242
fma->operands[i] = op[i];
3243
fma->neg_lo[i] = mul->neg_lo[i];
3244
fma->neg_hi[i] = mul->neg_hi[i];
3245
}
3246
fma->operands[2] = op[2];
3247
fma->clamp = vop3p->clamp;
3248
fma->opsel_lo = mul->opsel_lo;
3249
fma->opsel_hi = mul->opsel_hi;
3250
propagate_swizzles(fma.get(), opsel_lo, opsel_hi);
3251
fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4;
3252
fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4;
3253
fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
3254
fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
3255
fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
3256
fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
3257
fma->definitions[0] = instr->definitions[0];
3258
instr.reset(fma.release());
3259
ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
3260
return;
3261
}
3262
}
3263
3264
// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
3265
// this would mean that we'd have to fix the instruction uses while value propagation
3266
3267
void
3268
combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3269
{
3270
if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
3271
return;
3272
3273
if (instr->isVALU()) {
3274
/* Apply SDWA. Do this after label_instruction() so it can remove
3275
* label_extract if not all instructions can take SDWA. */
3276
for (unsigned i = 0; i < instr->operands.size(); i++) {
3277
Operand& op = instr->operands[i];
3278
if (!op.isTemp())
3279
continue;
3280
ssa_info& info = ctx.info[op.tempId()];
3281
if (info.is_extract() &&
3282
(info.instr->operands[0].getTemp().type() == RegType::vgpr ||
3283
instr->operands[i].getTemp().type() == RegType::sgpr) &&
3284
can_apply_extract(ctx, instr, i, info)) {
3285
apply_extract(ctx, instr, i, info);
3286
ctx.uses[instr->operands[i].tempId()]--;
3287
instr->operands[i].setTemp(info.instr->operands[0].getTemp());
3288
}
3289
}
3290
3291
if (can_apply_sgprs(ctx, instr))
3292
apply_sgprs(ctx, instr);
3293
while (apply_omod_clamp(ctx, instr))
3294
;
3295
apply_insert(ctx, instr);
3296
}
3297
3298
if (instr->isVOP3P())
3299
return combine_vop3p(ctx, instr);
3300
3301
if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
3302
instr->definitions[0].setHint(vcc);
3303
}
3304
3305
if (instr->isSDWA())
3306
return;
3307
3308
/* TODO: There are still some peephole optimizations that could be done:
3309
* - abs(a - b) -> s_absdiff_i32
3310
* - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
3311
* - patterns for v_alignbit_b32 and v_alignbyte_b32
3312
* These aren't probably too interesting though.
3313
* There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
3314
* probably more useful than the previously mentioned optimizations.
3315
* The various comparison optimizations also currently only work with 32-bit
3316
* floats. */
3317
3318
/* neg(mul(a, b)) -> mul(neg(a), b) */
3319
if (ctx.info[instr->definitions[0].tempId()].is_neg() &&
3320
ctx.uses[instr->operands[1].tempId()] == 1) {
3321
Temp val = ctx.info[instr->definitions[0].tempId()].temp;
3322
3323
if (!ctx.info[val.id()].is_mul())
3324
return;
3325
3326
Instruction* mul_instr = ctx.info[val.id()].instr;
3327
3328
if (mul_instr->operands[0].isLiteral())
3329
return;
3330
if (mul_instr->isVOP3() && mul_instr->vop3().clamp)
3331
return;
3332
if (mul_instr->isSDWA())
3333
return;
3334
3335
/* convert to mul(neg(a), b) */
3336
ctx.uses[mul_instr->definitions[0].tempId()]--;
3337
Definition def = instr->definitions[0];
3338
/* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
3339
bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
3340
instr.reset(
3341
create_instruction<VOP3_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
3342
instr->operands[0] = mul_instr->operands[0];
3343
instr->operands[1] = mul_instr->operands[1];
3344
instr->definitions[0] = def;
3345
VOP3_instruction& new_mul = instr->vop3();
3346
if (mul_instr->isVOP3()) {
3347
VOP3_instruction& mul = mul_instr->vop3();
3348
new_mul.neg[0] = mul.neg[0] && !is_abs;
3349
new_mul.neg[1] = mul.neg[1] && !is_abs;
3350
new_mul.abs[0] = mul.abs[0] || is_abs;
3351
new_mul.abs[1] = mul.abs[1] || is_abs;
3352
new_mul.omod = mul.omod;
3353
}
3354
new_mul.neg[0] ^= true;
3355
new_mul.clamp = false;
3356
3357
ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
3358
return;
3359
}
3360
3361
/* combine mul+add -> mad */
3362
bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
3363
instr->opcode == aco_opcode::v_subrev_f32;
3364
bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
3365
instr->opcode == aco_opcode::v_subrev_f16;
3366
if (mad16 || mad32) {
3367
bool need_fma = mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3)
3368
: (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
3369
if (need_fma && instr->definitions[0].isPrecise())
3370
return;
3371
if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32)
3372
return;
3373
3374
Instruction* mul_instr = nullptr;
3375
unsigned add_op_idx = 0;
3376
uint32_t uses = UINT32_MAX;
3377
/* find the 'best' mul instruction to combine with the add */
3378
for (unsigned i = 0; i < 2; i++) {
3379
if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
3380
continue;
3381
/* check precision requirements */
3382
ssa_info& info = ctx.info[instr->operands[i].tempId()];
3383
if (need_fma && info.instr->definitions[0].isPrecise())
3384
continue;
3385
3386
/* no clamp/omod allowed between mul and add */
3387
if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod))
3388
continue;
3389
3390
Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
3391
if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) ||
3392
ctx.uses[instr->operands[i].tempId()] >= uses)
3393
continue;
3394
3395
mul_instr = info.instr;
3396
add_op_idx = 1 - i;
3397
uses = ctx.uses[instr->operands[i].tempId()];
3398
}
3399
3400
if (mul_instr) {
3401
/* turn mul+add into v_mad/v_fma */
3402
Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
3403
instr->operands[add_op_idx]};
3404
ctx.uses[mul_instr->definitions[0].tempId()]--;
3405
if (ctx.uses[mul_instr->definitions[0].tempId()]) {
3406
if (op[0].isTemp())
3407
ctx.uses[op[0].tempId()]++;
3408
if (op[1].isTemp())
3409
ctx.uses[op[1].tempId()]++;
3410
}
3411
3412
bool neg[3] = {false, false, false};
3413
bool abs[3] = {false, false, false};
3414
unsigned omod = 0;
3415
bool clamp = false;
3416
3417
if (mul_instr->isVOP3()) {
3418
VOP3_instruction& vop3 = mul_instr->vop3();
3419
neg[0] = vop3.neg[0];
3420
neg[1] = vop3.neg[1];
3421
abs[0] = vop3.abs[0];
3422
abs[1] = vop3.abs[1];
3423
}
3424
3425
if (instr->isVOP3()) {
3426
VOP3_instruction& vop3 = instr->vop3();
3427
neg[2] = vop3.neg[add_op_idx];
3428
abs[2] = vop3.abs[add_op_idx];
3429
omod = vop3.omod;
3430
clamp = vop3.clamp;
3431
/* abs of the multiplication result */
3432
if (vop3.abs[1 - add_op_idx]) {
3433
neg[0] = false;
3434
neg[1] = false;
3435
abs[0] = true;
3436
abs[1] = true;
3437
}
3438
/* neg of the multiplication result */
3439
neg[1] = neg[1] ^ vop3.neg[1 - add_op_idx];
3440
}
3441
if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
3442
neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
3443
else if (instr->opcode == aco_opcode::v_subrev_f32 ||
3444
instr->opcode == aco_opcode::v_subrev_f16)
3445
neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
3446
3447
aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
3448
if (mad16)
3449
mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16
3450
: aco_opcode::v_fma_f16)
3451
: (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16
3452
: aco_opcode::v_mad_f16);
3453
3454
aco_ptr<VOP3_instruction> mad{
3455
create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
3456
for (unsigned i = 0; i < 3; i++) {
3457
mad->operands[i] = op[i];
3458
mad->neg[i] = neg[i];
3459
mad->abs[i] = abs[i];
3460
}
3461
mad->omod = omod;
3462
mad->clamp = clamp;
3463
mad->definitions[0] = instr->definitions[0];
3464
3465
/* mark this ssa_def to be re-checked for profitability and literals */
3466
ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
3467
ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
3468
instr.reset(mad.release());
3469
return;
3470
}
3471
}
3472
/* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
3473
else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
3474
for (unsigned i = 0; i < 2; i++) {
3475
if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
3476
ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
3477
instr->operands[!i].getTemp().type() == RegType::vgpr) {
3478
ctx.uses[instr->operands[i].tempId()]--;
3479
ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
3480
3481
aco_ptr<VOP2_instruction> new_instr{
3482
create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
3483
new_instr->operands[0] = Operand::zero();
3484
new_instr->operands[1] = instr->operands[!i];
3485
new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
3486
new_instr->definitions[0] = instr->definitions[0];
3487
instr.reset(new_instr.release());
3488
ctx.info[instr->definitions[0].tempId()].label = 0;
3489
return;
3490
}
3491
}
3492
} else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
3493
if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
3494
1 | 2)) {
3495
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
3496
"012", 1 | 2)) {
3497
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
3498
}
3499
} else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
3500
if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
3501
1 | 2)) {
3502
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
3503
"012", 1 | 2)) {
3504
}
3505
} else if (instr->opcode == aco_opcode::v_add_u32) {
3506
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
3507
} else if (combine_add_bcnt(ctx, instr)) {
3508
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
3509
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
3510
} else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) {
3511
if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
3512
1 | 2)) {
3513
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
3514
"120", 1 | 2)) {
3515
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
3516
"012", 1 | 2)) {
3517
} else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
3518
"012", 1 | 2)) {
3519
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
3520
"012", 1 | 2)) {
3521
} else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16,
3522
aco_opcode::v_mad_u32_u16, "120", 1 | 2)) {
3523
} else if (combine_add_or_then_and_lshl(ctx, instr)) {
3524
}
3525
}
3526
} else if (instr->opcode == aco_opcode::v_add_co_u32 ||
3527
instr->opcode == aco_opcode::v_add_co_u32_e64) {
3528
bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
3529
if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
3530
} else if (!carry_out && combine_add_bcnt(ctx, instr)) {
3531
} else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
3532
aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
3533
} else if (!carry_out && combine_add_lshl(ctx, instr)) {
3534
}
3535
} else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
3536
instr->opcode == aco_opcode::v_sub_co_u32_e64) {
3537
combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2);
3538
} else if (instr->opcode == aco_opcode::v_subrev_u32 ||
3539
instr->opcode == aco_opcode::v_subrev_co_u32 ||
3540
instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
3541
combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
3542
} else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
3543
combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
3544
2);
3545
} else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
3546
ctx.program->chip_class >= GFX9) {
3547
combine_salu_lshl_add(ctx, instr);
3548
} else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
3549
combine_salu_not_bitwise(ctx, instr);
3550
} else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
3551
instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
3552
if (combine_ordering_test(ctx, instr)) {
3553
} else if (combine_comparison_ordering(ctx, instr)) {
3554
} else if (combine_constant_comparison_ordering(ctx, instr)) {
3555
} else if (combine_salu_n2(ctx, instr)) {
3556
}
3557
} else if (instr->opcode == aco_opcode::v_and_b32) {
3558
combine_and_subbrev(ctx, instr);
3559
} else {
3560
aco_opcode min, max, min3, max3, med3;
3561
bool some_gfx9_only;
3562
if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
3563
(!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
3564
if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
3565
instr->opcode == min ? min3 : max3)) {
3566
} else {
3567
combine_clamp(ctx, instr, min, max, med3);
3568
}
3569
}
3570
}
3571
3572
/* do this after combine_salu_n2() */
3573
if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)
3574
combine_inverse_comparison(ctx, instr);
3575
}
3576
3577
bool
3578
to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3579
{
3580
switch (instr->opcode) {
3581
case aco_opcode::s_and_b32:
3582
case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
3583
case aco_opcode::s_or_b32:
3584
case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
3585
case aco_opcode::s_xor_b32:
3586
case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
3587
default:
3588
/* Don't transform other instructions. They are very unlikely to appear here. */
3589
return false;
3590
}
3591
3592
for (Operand& op : instr->operands) {
3593
ctx.uses[op.tempId()]--;
3594
3595
if (ctx.info[op.tempId()].is_uniform_bool()) {
3596
/* Just use the uniform boolean temp. */
3597
op.setTemp(ctx.info[op.tempId()].temp);
3598
} else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
3599
/* Use the SCC definition of the predecessor instruction.
3600
* This allows the predecessor to get picked up by the same optimization (if it has no
3601
* divergent users), and it also makes sure that the current instruction will keep working
3602
* even if the predecessor won't be transformed.
3603
*/
3604
Instruction* pred_instr = ctx.info[op.tempId()].instr;
3605
assert(pred_instr->definitions.size() >= 2);
3606
assert(pred_instr->definitions[1].isFixed() &&
3607
pred_instr->definitions[1].physReg() == scc);
3608
op.setTemp(pred_instr->definitions[1].getTemp());
3609
} else {
3610
unreachable("Invalid operand on uniform bitwise instruction.");
3611
}
3612
3613
ctx.uses[op.tempId()]++;
3614
}
3615
3616
instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
3617
assert(instr->operands[0].regClass() == s1);
3618
assert(instr->operands[1].regClass() == s1);
3619
return true;
3620
}
3621
3622
void
3623
select_mul_u32_u24(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3624
{
3625
if (instr->usesModifiers())
3626
return;
3627
3628
/* Only valid if the accumulator is zero (this is selected by isel to
3629
* combine more v_add_u32+v_mad_u32_u16 together), but the optimizer
3630
* fallbacks here when not possible.
3631
*/
3632
if (!instr->operands[2].constantEquals(0))
3633
return;
3634
3635
/* Only valid if the upper 16-bits of both operands are zero (because
3636
* v_mul_u32_u24 doesn't mask them).
3637
*/
3638
for (unsigned i = 0; i < 2; i++) {
3639
if (instr->operands[i].isTemp() && !instr->operands[i].is16bit())
3640
return;
3641
}
3642
3643
bool swap = false;
3644
3645
/* VOP2 instructions can only take constants/sgprs in operand 0. */
3646
if ((instr->operands[1].isConstant() ||
3647
(instr->operands[1].hasRegClass() &&
3648
instr->operands[1].regClass().type() == RegType::sgpr))) {
3649
swap = true;
3650
if ((instr->operands[0].isConstant() ||
3651
(instr->operands[0].hasRegClass() &&
3652
instr->operands[0].regClass().type() == RegType::sgpr))) {
3653
/* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because
3654
* v_mul_u32_u24 has no advantages.
3655
*/
3656
return;
3657
}
3658
}
3659
3660
VOP2_instruction* new_instr =
3661
create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
3662
new_instr->operands[0] = instr->operands[swap];
3663
new_instr->operands[1] = instr->operands[!swap];
3664
new_instr->definitions[0] = instr->definitions[0];
3665
instr.reset(new_instr);
3666
}
3667
3668
void
3669
select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3670
{
3671
const uint32_t threshold = 4;
3672
3673
if (is_dead(ctx.uses, instr.get())) {
3674
instr.reset();
3675
return;
3676
}
3677
3678
/* convert split_vector into a copy or extract_vector if only one definition is ever used */
3679
if (instr->opcode == aco_opcode::p_split_vector) {
3680
unsigned num_used = 0;
3681
unsigned idx = 0;
3682
unsigned split_offset = 0;
3683
for (unsigned i = 0, offset = 0; i < instr->definitions.size();
3684
offset += instr->definitions[i++].bytes()) {
3685
if (ctx.uses[instr->definitions[i].tempId()]) {
3686
num_used++;
3687
idx = i;
3688
split_offset = offset;
3689
}
3690
}
3691
bool done = false;
3692
if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
3693
ctx.uses[instr->operands[0].tempId()] == 1) {
3694
Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
3695
3696
unsigned off = 0;
3697
Operand op;
3698
for (Operand& vec_op : vec->operands) {
3699
if (off == split_offset) {
3700
op = vec_op;
3701
break;
3702
}
3703
off += vec_op.bytes();
3704
}
3705
if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
3706
ctx.uses[instr->operands[0].tempId()]--;
3707
for (Operand& vec_op : vec->operands) {
3708
if (vec_op.isTemp())
3709
ctx.uses[vec_op.tempId()]--;
3710
}
3711
if (op.isTemp())
3712
ctx.uses[op.tempId()]++;
3713
3714
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
3715
aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
3716
extract->operands[0] = op;
3717
extract->definitions[0] = instr->definitions[idx];
3718
instr.reset(extract.release());
3719
3720
done = true;
3721
}
3722
}
3723
3724
if (!done && num_used == 1 &&
3725
instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
3726
split_offset % instr->definitions[idx].bytes() == 0) {
3727
aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
3728
aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
3729
extract->operands[0] = instr->operands[0];
3730
extract->operands[1] =
3731
Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
3732
extract->definitions[0] = instr->definitions[idx];
3733
instr.reset(extract.release());
3734
}
3735
}
3736
3737
mad_info* mad_info = NULL;
3738
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
3739
mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
3740
/* re-check mad instructions */
3741
if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
3742
ctx.uses[mad_info->mul_temp_id]++;
3743
if (instr->operands[0].isTemp())
3744
ctx.uses[instr->operands[0].tempId()]--;
3745
if (instr->operands[1].isTemp())
3746
ctx.uses[instr->operands[1].tempId()]--;
3747
instr.swap(mad_info->add_instr);
3748
mad_info = NULL;
3749
}
3750
/* check literals */
3751
else if (!instr->usesModifiers()) {
3752
/* FMA can only take literals on GFX10+ */
3753
if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
3754
ctx.program->chip_class < GFX10)
3755
return;
3756
/* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
3757
* literals (GFX10+), these instructions don't exist.
3758
*/
3759
if (instr->opcode == aco_opcode::v_fma_legacy_f16)
3760
return;
3761
3762
bool sgpr_used = false;
3763
uint32_t literal_idx = 0;
3764
uint32_t literal_uses = UINT32_MAX;
3765
for (unsigned i = 0; i < instr->operands.size(); i++) {
3766
if (instr->operands[i].isConstant() && i > 0) {
3767
literal_uses = UINT32_MAX;
3768
break;
3769
}
3770
if (!instr->operands[i].isTemp())
3771
continue;
3772
unsigned bits = get_operand_size(instr, i);
3773
/* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10
3774
* or operands other than the 1st */
3775
if (instr->operands[i].getTemp().type() == RegType::sgpr &&
3776
(i > 0 || ctx.program->chip_class < GFX10)) {
3777
if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) {
3778
literal_uses = ctx.uses[instr->operands[i].tempId()];
3779
literal_idx = i;
3780
} else {
3781
literal_uses = UINT32_MAX;
3782
}
3783
sgpr_used = true;
3784
/* don't break because we still need to check constants */
3785
} else if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits) &&
3786
ctx.uses[instr->operands[i].tempId()] < literal_uses) {
3787
literal_uses = ctx.uses[instr->operands[i].tempId()];
3788
literal_idx = i;
3789
}
3790
}
3791
3792
/* Limit the number of literals to apply to not increase the code
3793
* size too much, but always apply literals for v_mad->v_madak
3794
* because both instructions are 64-bit and this doesn't increase
3795
* code size.
3796
* TODO: try to apply the literals earlier to lower the number of
3797
* uses below threshold
3798
*/
3799
if (literal_uses < threshold || literal_idx == 2) {
3800
ctx.uses[instr->operands[literal_idx].tempId()]--;
3801
mad_info->check_literal = true;
3802
mad_info->literal_idx = literal_idx;
3803
return;
3804
}
3805
}
3806
}
3807
3808
/* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
3809
* when it isn't beneficial */
3810
if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
3811
instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
3812
ctx.info[instr->operands[0].tempId()].set_scc_needed();
3813
return;
3814
} else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
3815
instr->opcode == aco_opcode::s_cselect_b32) &&
3816
instr->operands[2].isTemp()) {
3817
ctx.info[instr->operands[2].tempId()].set_scc_needed();
3818
} else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
3819
ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
3820
/* Propagate label so it is correctly detected by the uniform bool transform */
3821
ctx.info[instr->operands[0].tempId()].set_scc_needed();
3822
3823
/* Fix definition to SCC, this will prevent RA from adding superfluous moves */
3824
instr->definitions[0].setFixed(scc);
3825
}
3826
3827
/* check for literals */
3828
if (!instr->isSALU() && !instr->isVALU())
3829
return;
3830
3831
/* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
3832
if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
3833
ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
3834
bool transform_done = to_uniform_bool_instr(ctx, instr);
3835
3836
if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
3837
/* Swap the two definition IDs in order to avoid overusing the SCC.
3838
* This reduces extra moves generated by RA. */
3839
uint32_t def0_id = instr->definitions[0].getTemp().id();
3840
uint32_t def1_id = instr->definitions[1].getTemp().id();
3841
instr->definitions[0].setTemp(Temp(def1_id, s1));
3842
instr->definitions[1].setTemp(Temp(def0_id, s1));
3843
}
3844
3845
return;
3846
}
3847
3848
if (instr->opcode == aco_opcode::v_mad_u32_u16)
3849
select_mul_u32_u24(ctx, instr);
3850
3851
if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
3852
(instr->isVOP3P() && ctx.program->chip_class < GFX10))
3853
return; /* some encodings can't ever take literals */
3854
3855
/* we do not apply the literals yet as we don't know if it is profitable */
3856
Operand current_literal(s1);
3857
3858
unsigned literal_id = 0;
3859
unsigned literal_uses = UINT32_MAX;
3860
Operand literal(s1);
3861
unsigned num_operands = 1;
3862
if (instr->isSALU() ||
3863
(ctx.program->chip_class >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P())))
3864
num_operands = instr->operands.size();
3865
/* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
3866
else if (instr->isVALU() && instr->operands.size() >= 3)
3867
return;
3868
3869
unsigned sgpr_ids[2] = {0, 0};
3870
bool is_literal_sgpr = false;
3871
uint32_t mask = 0;
3872
3873
/* choose a literal to apply */
3874
for (unsigned i = 0; i < num_operands; i++) {
3875
Operand op = instr->operands[i];
3876
unsigned bits = get_operand_size(instr, i);
3877
3878
if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
3879
op.tempId() != sgpr_ids[0])
3880
sgpr_ids[!!sgpr_ids[0]] = op.tempId();
3881
3882
if (op.isLiteral()) {
3883
current_literal = op;
3884
continue;
3885
} else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
3886
continue;
3887
}
3888
3889
if (!alu_can_accept_constant(instr->opcode, i))
3890
continue;
3891
3892
if (ctx.uses[op.tempId()] < literal_uses) {
3893
is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
3894
mask = 0;
3895
literal = Operand::c32(ctx.info[op.tempId()].val);
3896
literal_uses = ctx.uses[op.tempId()];
3897
literal_id = op.tempId();
3898
}
3899
3900
mask |= (op.tempId() == literal_id) << i;
3901
}
3902
3903
/* don't go over the constant bus limit */
3904
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
3905
instr->opcode == aco_opcode::v_lshrrev_b64 ||
3906
instr->opcode == aco_opcode::v_ashrrev_i64;
3907
unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
3908
if (ctx.program->chip_class >= GFX10 && !is_shift64)
3909
const_bus_limit = 2;
3910
3911
unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
3912
if (num_sgprs == const_bus_limit && !is_literal_sgpr)
3913
return;
3914
3915
if (literal_id && literal_uses < threshold &&
3916
(current_literal.isUndefined() ||
3917
(current_literal.size() == literal.size() &&
3918
current_literal.constantValue() == literal.constantValue()))) {
3919
/* mark the literal to be applied */
3920
while (mask) {
3921
unsigned i = u_bit_scan(&mask);
3922
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
3923
ctx.uses[instr->operands[i].tempId()]--;
3924
}
3925
}
3926
}
3927
3928
void
3929
apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3930
{
3931
/* Cleanup Dead Instructions */
3932
if (!instr)
3933
return;
3934
3935
/* apply literals on MAD */
3936
if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
3937
mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
3938
if (info->check_literal &&
3939
(ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
3940
aco_ptr<Instruction> new_mad;
3941
3942
aco_opcode new_op =
3943
info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
3944
if (instr->opcode == aco_opcode::v_fma_f32)
3945
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
3946
else if (instr->opcode == aco_opcode::v_mad_f16 ||
3947
instr->opcode == aco_opcode::v_mad_legacy_f16)
3948
new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
3949
else if (instr->opcode == aco_opcode::v_fma_f16)
3950
new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
3951
3952
new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
3953
if (info->literal_idx == 2) { /* add literal -> madak */
3954
new_mad->operands[0] = instr->operands[0];
3955
new_mad->operands[1] = instr->operands[1];
3956
} else { /* mul literal -> madmk */
3957
new_mad->operands[0] = instr->operands[1 - info->literal_idx];
3958
new_mad->operands[1] = instr->operands[2];
3959
}
3960
new_mad->operands[2] =
3961
Operand::c32(ctx.info[instr->operands[info->literal_idx].tempId()].val);
3962
new_mad->definitions[0] = instr->definitions[0];
3963
ctx.instructions.emplace_back(std::move(new_mad));
3964
return;
3965
}
3966
}
3967
3968
/* apply literals on other SALU/VALU */
3969
if (instr->isSALU() || instr->isVALU()) {
3970
for (unsigned i = 0; i < instr->operands.size(); i++) {
3971
Operand op = instr->operands[i];
3972
unsigned bits = get_operand_size(instr, i);
3973
if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
3974
Operand literal = Operand::c32(ctx.info[op.tempId()].val);
3975
if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
3976
to_VOP3(ctx, instr);
3977
instr->operands[i] = literal;
3978
}
3979
}
3980
}
3981
3982
ctx.instructions.emplace_back(std::move(instr));
3983
}
3984
3985
void
3986
optimize(Program* program)
3987
{
3988
opt_ctx ctx;
3989
ctx.program = program;
3990
std::vector<ssa_info> info(program->peekAllocationId());
3991
ctx.info = info.data();
3992
3993
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
3994
for (Block& block : program->blocks) {
3995
ctx.fp_mode = block.fp_mode;
3996
for (aco_ptr<Instruction>& instr : block.instructions)
3997
label_instruction(ctx, instr);
3998
}
3999
4000
ctx.uses = dead_code_analysis(program);
4001
4002
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
4003
for (Block& block : program->blocks) {
4004
ctx.fp_mode = block.fp_mode;
4005
for (aco_ptr<Instruction>& instr : block.instructions)
4006
combine_instruction(ctx, instr);
4007
}
4008
4009
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
4010
for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
4011
++block_rit) {
4012
Block* block = &(*block_rit);
4013
ctx.fp_mode = block->fp_mode;
4014
for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
4015
++instr_rit)
4016
select_instruction(ctx, *instr_rit);
4017
}
4018
4019
/* 4. Add literals to instructions */
4020
for (Block& block : program->blocks) {
4021
ctx.instructions.clear();
4022
ctx.fp_mode = block.fp_mode;
4023
for (aco_ptr<Instruction>& instr : block.instructions)
4024
apply_literals(ctx, instr);
4025
block.instructions.swap(ctx.instructions);
4026
}
4027
}
4028
4029
} // namespace aco
4030
4031