GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/bifrost/bi_schedule.c
/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <[email protected]>
 */

#include "compiler.h"
#include "bi_builder.h"

/* Arguments common to worklist, passed by value for convenience */
31
32
struct bi_worklist {
33
/* # of instructions in the block */
34
unsigned count;
35
36
/* Instructions in the block */
37
bi_instr **instructions;
38
39
/* Bitset of instructions in the block ready for scheduling */
40
BITSET_WORD *worklist;
41
42
/* The backwards dependency graph. dep_counts[i] is the number of
 * unscheduled instructions that must still be scheduled after (before)
 * instruction i. dependents[i] is the set of instructions that need to be
 * scheduled before (after) instruction i. */
unsigned *dep_counts;
BITSET_WORD **dependents;
};
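
/* How the worklist is consumed (a sketch based on the code below, not a
 * separate API): bi_create_dependency_graph() fills dep_counts/dependents,
 * bi_initialize_worklist() seeds the bitset with instructions whose count is
 * zero, and bi_update_worklist() decrements counts as instructions are taken,
 * e.g.
 *
 *    unsigned i;
 *    BITSET_FOREACH_SET(i, st.worklist, st.count) {
 *       // st.instructions[i] has no outstanding dependencies and is a
 *       // candidate for bi_choose_index()
 *    }
 */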
49
50
/* State of a single tuple and clause under construction */
51
52
struct bi_reg_state {
53
/* Number of register writes */
54
unsigned nr_writes;
55
56
/* Register reads, expressed as (equivalence classes of)
57
* sources. Only 3 reads are allowed, but up to 2 may spill as
58
* "forced" for the next scheduled tuple, provided such a tuple
59
* can be constructed */
60
bi_index reads[5];
61
unsigned nr_reads;
62
63
/* The previous tuple scheduled (= the next tuple executed in the
64
* program) may require certain writes, in order to bypass the register
65
* file and use a temporary passthrough for the value. Up to 2 such
66
* constraints are architecturally satisfiable */
67
unsigned forced_count;
68
bi_index forceds[2];
69
};
70
71
struct bi_tuple_state {
72
/* Is this the last tuple in the clause */
73
bool last;
74
75
/* Scheduled ADD instruction, or null if none */
76
bi_instr *add;
77
78
/* Reads for previous (succeeding) tuple */
79
bi_index prev_reads[5];
80
unsigned nr_prev_reads;
81
bi_tuple *prev;
82
83
/* Register slot state for current tuple */
84
struct bi_reg_state reg;
85
86
/* Constants are shared within the tuple. If constant_count is nonzero, it
 * is the number of embedded constants in use. Otherwise, fau is the slot
 * read from FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads
 * zero, but within a tuple that should be encoded as constant_count != 0
 * and constants[0] = constants[1] = 0 */
unsigned constant_count;

union {
94
uint32_t constants[2];
95
enum bir_fau fau;
96
};
97
98
unsigned pcrel_idx;
99
};
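
/* Illustrative encodings of the constant/FAU union above (example values,
 * not from a real shader):
 *
 *   constant_count = 1, constants[0] = 0x3f800000      -> one embedded constant
 *   constant_count = 2, constants[0] = constants[1] = 0 -> explicit zero pair
 *   constant_count = 0, fau = some enum bir_fau slot    -> uniform/special read
 */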
100
101
struct bi_const_state {
102
unsigned constant_count;
103
bool pcrel; /* applies to first const */
104
uint32_t constants[2];
105
106
/* Index of the constant into the clause */
107
unsigned word_idx;
108
};
109
110
struct bi_clause_state {
111
/* Has a message-passing instruction already been assigned? */
112
bool message;
113
114
/* Indices already accessed, this needs to be tracked to avoid hazards
115
* around message-passing instructions */
116
unsigned access_count;
117
bi_index accesses[(BI_MAX_SRCS + 2) * 16];
118
119
unsigned tuple_count;
120
struct bi_const_state consts[8];
121
};
122
123
/* Determines the message type by checking the table and a few special cases.
 * The only case missing is tilebuffer instructions that access depth/stencil,
 * which require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */
127
128
static enum bifrost_message_type
129
bi_message_type_for_instr(bi_instr *ins)
130
{
131
enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
132
bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
133
134
if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
135
return BIFROST_MESSAGE_Z_STENCIL;
136
137
if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
138
return BIFROST_MESSAGE_ATTRIBUTE;
139
140
return msg;
141
}
142
143
/* Attribute, texture, and UBO load (attribute message) instructions support
144
* bindless, so just check the message type */
145
146
ASSERTED static bool
147
bi_supports_dtsel(bi_instr *ins)
148
{
149
switch (bi_message_type_for_instr(ins)) {
150
case BIFROST_MESSAGE_ATTRIBUTE:
151
return ins->op != BI_OPCODE_LD_GCLK_U64;
152
case BIFROST_MESSAGE_TEX:
153
return true;
154
default:
155
return false;
156
}
157
}
158
159
/* Adds an edge to the dependency graph */
160
161
static void
162
bi_push_dependency(unsigned parent, unsigned child,
163
BITSET_WORD **dependents, unsigned *dep_counts)
164
{
165
if (!BITSET_TEST(dependents[parent], child)) {
166
BITSET_SET(dependents[parent], child);
167
dep_counts[child]++;
168
}
169
}
170
171
static void
172
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
173
BITSET_WORD **dependents, unsigned *dep_counts)
174
{
175
assert(index < 64);
176
util_dynarray_foreach(table + index, unsigned, parent)
177
bi_push_dependency(*parent, child, dependents, dep_counts);
178
}
179
180
static void
181
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
182
{
183
assert(index < 64);
184
util_dynarray_append(&table[index], unsigned, parent);
185
}
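
/* Together these helpers encode the register hazards for the backwards
 * dependency graph built below: each source is ordered against later writers
 * of that register (write-after-read), and each destination against later
 * readers (read-after-write) and later writers (write-after-write). The
 * tables are indexed by hardware register, hence the index < 64 asserts. */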
186
187
static bool
188
bi_is_sched_barrier(bi_instr *I)
189
{
190
switch (I->op) {
191
case BI_OPCODE_BARRIER:
192
case BI_OPCODE_DISCARD_F32:
193
return true;
194
default:
195
return false;
196
}
197
}
198
199
static void
200
bi_create_dependency_graph(struct bi_worklist st, bool inorder)
201
{
202
struct util_dynarray last_read[64], last_write[64];
203
204
for (unsigned i = 0; i < 64; ++i) {
205
util_dynarray_init(&last_read[i], NULL);
206
util_dynarray_init(&last_write[i], NULL);
207
}
208
209
/* Initialize dependency graph */
210
for (unsigned i = 0; i < st.count; ++i) {
211
st.dependents[i] =
212
calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
213
214
st.dep_counts[i] = 0;
215
}
216
217
unsigned prev_msg = ~0;
218
219
/* Populate dependency graph */
220
for (signed i = st.count - 1; i >= 0; --i) {
221
bi_instr *ins = st.instructions[i];
222
223
bi_foreach_src(ins, s) {
224
if (ins->src[s].type != BI_INDEX_REGISTER) continue;
225
unsigned count = bi_count_read_registers(ins, s);
226
227
for (unsigned c = 0; c < count; ++c)
228
add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
229
}
230
231
/* Keep message-passing ops in order. (This pass only cares
232
* about bundling; reordering of message-passing instructions
233
* happens during earlier scheduling.) */
234
235
if (bi_message_type_for_instr(ins)) {
236
if (prev_msg != ~0)
237
bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
238
239
prev_msg = i;
240
}
241
242
/* Handle schedule barriers, adding all the deps */
243
if (inorder || bi_is_sched_barrier(ins)) {
244
for (unsigned j = 0; j < st.count; ++j) {
245
if (i == j) continue;
246
247
bi_push_dependency(MAX2(i, j), MIN2(i, j),
248
st.dependents, st.dep_counts);
249
}
250
}
251
252
bi_foreach_dest(ins, d) {
253
if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
254
unsigned dest = ins->dest[d].value;
255
256
unsigned count = bi_count_write_registers(ins, d);
257
258
for (unsigned c = 0; c < count; ++c) {
259
add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
260
add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
261
mark_access(last_write, dest + c, i);
262
}
263
}
264
265
bi_foreach_src(ins, s) {
266
if (ins->src[s].type != BI_INDEX_REGISTER) continue;
267
268
unsigned count = bi_count_read_registers(ins, s);
269
270
for (unsigned c = 0; c < count; ++c)
271
mark_access(last_read, ins->src[s].value + c, i);
272
}
273
}
274
275
/* If there is a branch, all instructions depend on it, as interblock
276
* execution must be purely in-order */
277
278
bi_instr *last = st.instructions[st.count - 1];
279
if (last->branch_target || last->op == BI_OPCODE_JUMP) {
280
for (signed i = st.count - 2; i >= 0; --i)
281
bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
282
}
283
284
/* Free the intermediate structures */
285
for (unsigned i = 0; i < 64; ++i) {
286
util_dynarray_fini(&last_read[i]);
287
util_dynarray_fini(&last_write[i]);
288
}
289
}
290
291
/* Scheduler pseudoinstruction lowerings to enable instruction pairings,
 * e.g. CUBEFACE -> *CUBEFACE1/+CUBEFACE2
 */
294
295
static bi_instr *
296
bi_lower_cubeface(bi_context *ctx,
297
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
298
{
299
bi_instr *pinstr = tuple->add;
300
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
301
bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
302
pinstr->src[0], pinstr->src[1], pinstr->src[2]);
303
304
pinstr->op = BI_OPCODE_CUBEFACE2;
305
pinstr->dest[0] = pinstr->dest[1];
306
pinstr->dest[1] = bi_null();
307
pinstr->src[0] = cubeface1->dest[0];
308
pinstr->src[1] = bi_null();
309
pinstr->src[2] = bi_null();
310
311
return cubeface1;
312
}
313
314
/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */
317
318
static bi_instr *
319
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
320
bi_tuple_state *tuple)
321
{
322
bi_instr *pinstr = tuple->add;
323
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
324
bi_instr *atom_c = bi_atom_c_return_i32(&b,
325
pinstr->src[1], pinstr->src[2], pinstr->src[0],
326
pinstr->atom_opc);
327
328
if (bi_is_null(pinstr->dest[0]))
329
atom_c->op = BI_OPCODE_ATOM_C_I32;
330
331
pinstr->op = BI_OPCODE_ATOM_CX;
332
pinstr->src[3] = atom_c->src[2];
333
334
return atom_c;
335
}
336
337
static bi_instr *
338
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
339
bi_tuple_state *tuple)
340
{
341
bi_instr *pinstr = tuple->add;
342
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
343
bi_instr *atom_c = bi_atom_c1_return_i32(&b,
344
pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
345
346
if (bi_is_null(pinstr->dest[0]))
347
atom_c->op = BI_OPCODE_ATOM_C1_I32;
348
349
pinstr->op = BI_OPCODE_ATOM_CX;
350
pinstr->src[2] = pinstr->src[1];
351
pinstr->src[1] = pinstr->src[0];
352
pinstr->src[3] = bi_dontcare();
353
pinstr->src[0] = bi_null();
354
355
return atom_c;
356
}
357
358
static bi_instr *
359
bi_lower_seg_add(bi_context *ctx,
360
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
361
{
362
bi_instr *pinstr = tuple->add;
363
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
364
365
bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
366
pinstr->preserve_null, pinstr->seg);
367
368
pinstr->op = BI_OPCODE_SEG_ADD;
369
pinstr->src[0] = pinstr->src[1];
370
pinstr->src[1] = bi_null();
371
372
assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
373
pinstr->dest[0].value += 1;
374
375
return fma;
376
}
377
378
static bi_instr *
379
bi_lower_dtsel(bi_context *ctx,
380
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
381
{
382
bi_instr *add = tuple->add;
383
bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
384
385
bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
386
add->src[0], add->table);
387
add->src[0] = dtsel->dest[0];
388
389
assert(bi_supports_dtsel(add));
390
return dtsel;
391
}
392
393
/* Flatten linked list to array for O(1) indexing */
394
395
static bi_instr **
396
bi_flatten_block(bi_block *block, unsigned *len)
397
{
398
if (list_is_empty(&block->base.instructions))
399
return NULL;
400
401
*len = list_length(&block->base.instructions);
402
bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
403
404
unsigned i = 0;
405
406
bi_foreach_instr_in_block(block, ins)
407
instructions[i++] = ins;
408
409
return instructions;
410
}
411
412
/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling may be forced, in which case the dependency
 * graph encodes a total order.
 */
415
416
static struct bi_worklist
417
bi_initialize_worklist(bi_block *block, bool inorder)
418
{
419
struct bi_worklist st = { };
420
st.instructions = bi_flatten_block(block, &st.count);
421
422
if (!st.count)
423
return st;
424
425
st.dependents = calloc(st.count, sizeof(st.dependents[0]));
426
st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
427
428
bi_create_dependency_graph(st, inorder);
429
st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
430
431
for (unsigned i = 0; i < st.count; ++i) {
432
if (st.dep_counts[i] == 0)
433
BITSET_SET(st.worklist, i);
434
}
435
436
return st;
437
}
438
439
static void
440
bi_free_worklist(struct bi_worklist st)
441
{
442
free(st.dep_counts);
443
free(st.dependents);
444
free(st.instructions);
445
free(st.worklist);
446
}
447
448
static void
449
bi_update_worklist(struct bi_worklist st, unsigned idx)
450
{
451
assert(st.dep_counts[idx] == 0);
452
453
if (!st.dependents[idx])
454
return;
455
456
/* Iterate each dependent to remove one dependency (`done`),
457
* adding dependents to the worklist where possible. */
458
459
unsigned i;
460
BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
461
assert(st.dep_counts[i] != 0);
462
unsigned new_deps = --st.dep_counts[i];
463
464
if (new_deps == 0)
465
BITSET_SET(st.worklist, i);
466
}
467
468
free(st.dependents[idx]);
469
}
470
471
/* To work out the back-to-back flag, we need to detect branches and
472
* "fallthrough" branches, implied in the last clause of a block that falls
473
* through to another block with *multiple predecessors*. */
474
475
static bool
476
bi_back_to_back(bi_block *block)
477
{
478
/* Last block of a program */
479
if (!block->base.successors[0]) {
480
assert(!block->base.successors[1]);
481
return false;
482
}
483
484
/* Multiple successors? We're branching */
485
if (block->base.successors[1])
486
return false;
487
488
struct pan_block *succ = block->base.successors[0];
489
assert(succ->predecessors);
490
unsigned count = succ->predecessors->entries;
491
492
/* Back to back only if the successor has only a single predecessor */
493
return (count == 1);
494
}
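
/* For example, a block whose last clause falls through into a block with two
 * predecessors (say, a loop header also reached by a back edge) is not
 * back-to-back; bi_schedule_block() then marks that clause
 * BIFROST_FLOW_NBTB_UNCONDITIONAL rather than the default BIFROST_FLOW_NBTB. */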
495
496
/* Scheduler predicates */
497
498
/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
499
static bool
500
bi_can_iaddc(bi_instr *ins)
501
{
502
return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
503
ins->src[0].swizzle == BI_SWIZZLE_H01 &&
504
ins->src[1].swizzle == BI_SWIZZLE_H01);
505
}
506
507
ASSERTED static bool
508
bi_can_fma(bi_instr *ins)
509
{
510
/* +IADD.i32 -> *IADDC.i32 */
511
if (bi_can_iaddc(ins))
512
return true;
513
514
/* TODO: some additional fp16 constraints */
515
return bi_opcode_props[ins->op].fma;
516
}
517
518
static bool
519
bi_impacted_fadd_widens(bi_instr *I)
520
{
521
enum bi_swizzle swz0 = I->src[0].swizzle;
522
enum bi_swizzle swz1 = I->src[1].swizzle;
523
524
return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
525
(swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
526
(swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
527
}
528
529
ASSERTED static bool
530
bi_can_add(bi_instr *ins)
531
{
532
/* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
533
if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
534
return false;
535
536
/* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
537
if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
538
return false;
539
540
/* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
541
if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
542
return false;
543
544
/* TODO: some additional fp16 constraints */
545
return bi_opcode_props[ins->op].add;
546
}
547
548
ASSERTED static bool
549
bi_must_last(bi_instr *ins)
550
{
551
return bi_opcode_props[ins->op].last;
552
}
553
554
/* Architecturally, no single instruction has a "not last" constraint. However,
555
* pseudoinstructions writing multiple destinations (expanding to multiple
556
* paired instructions) can run afoul of the "no two writes on the last clause"
557
* constraint, so we check for that here.
558
*/
559
560
static bool
561
bi_must_not_last(bi_instr *ins)
562
{
563
return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
564
}
565
566
/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise, invalid encoding faults may
 * be raised for unknown reasons (possibly an erratum).
 */
571
572
ASSERTED static bool
573
bi_must_message(bi_instr *ins)
574
{
575
return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
576
(ins->op == BI_OPCODE_DISCARD_F32);
577
}
578
579
static bool
580
bi_fma_atomic(enum bi_opcode op)
581
{
582
switch (op) {
583
case BI_OPCODE_ATOM_C_I32:
584
case BI_OPCODE_ATOM_C_I64:
585
case BI_OPCODE_ATOM_C1_I32:
586
case BI_OPCODE_ATOM_C1_I64:
587
case BI_OPCODE_ATOM_C1_RETURN_I32:
588
case BI_OPCODE_ATOM_C1_RETURN_I64:
589
case BI_OPCODE_ATOM_C_RETURN_I32:
590
case BI_OPCODE_ATOM_C_RETURN_I64:
591
case BI_OPCODE_ATOM_POST_I32:
592
case BI_OPCODE_ATOM_POST_I64:
593
case BI_OPCODE_ATOM_PRE_I64:
594
return true;
595
default:
596
return false;
597
}
598
}
599
600
ASSERTED static bool
601
bi_reads_zero(bi_instr *ins)
602
{
603
return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
604
}
605
606
static bool
607
bi_reads_temps(bi_instr *ins, unsigned src)
608
{
609
switch (ins->op) {
610
/* Cannot permute a temporary */
611
case BI_OPCODE_CLPER_V6_I32:
612
case BI_OPCODE_CLPER_V7_I32:
613
return src != 0;
614
case BI_OPCODE_IMULD:
615
return false;
616
default:
617
return true;
618
}
619
}
620
621
static bool
622
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
623
{
624
enum bi_swizzle swizzle = I->src[src].swizzle;
625
626
switch (I->op) {
627
case BI_OPCODE_F16_TO_F32:
628
case BI_OPCODE_F16_TO_S32:
629
case BI_OPCODE_F16_TO_U32:
630
case BI_OPCODE_MKVEC_V2I16:
631
case BI_OPCODE_S16_TO_F32:
632
case BI_OPCODE_S16_TO_S32:
633
case BI_OPCODE_U16_TO_F32:
634
case BI_OPCODE_U16_TO_U32:
635
return (swizzle != BI_SWIZZLE_H00);
636
637
case BI_OPCODE_BRANCH_F32:
638
case BI_OPCODE_LOGB_F32:
639
case BI_OPCODE_ILOGB_F32:
640
case BI_OPCODE_FADD_F32:
641
case BI_OPCODE_FCMP_F32:
642
case BI_OPCODE_FREXPE_F32:
643
case BI_OPCODE_FREXPM_F32:
644
case BI_OPCODE_FROUND_F32:
645
return (swizzle != BI_SWIZZLE_H01);
646
647
case BI_OPCODE_IADD_S32:
648
case BI_OPCODE_IADD_U32:
649
case BI_OPCODE_ISUB_S32:
650
case BI_OPCODE_ISUB_U32:
651
case BI_OPCODE_IADD_V4S8:
652
case BI_OPCODE_IADD_V4U8:
653
case BI_OPCODE_ISUB_V4S8:
654
case BI_OPCODE_ISUB_V4U8:
655
return (src == 1) && (swizzle != BI_SWIZZLE_H01);
656
657
case BI_OPCODE_S8_TO_F32:
658
case BI_OPCODE_S8_TO_S32:
659
case BI_OPCODE_U8_TO_F32:
660
case BI_OPCODE_U8_TO_U32:
661
return (swizzle != BI_SWIZZLE_B0000);
662
663
case BI_OPCODE_V2S8_TO_V2F16:
664
case BI_OPCODE_V2S8_TO_V2S16:
665
case BI_OPCODE_V2U8_TO_V2F16:
666
case BI_OPCODE_V2U8_TO_V2U16:
667
return (swizzle != BI_SWIZZLE_B0022);
668
669
case BI_OPCODE_IADD_V2S16:
670
case BI_OPCODE_IADD_V2U16:
671
case BI_OPCODE_ISUB_V2S16:
672
case BI_OPCODE_ISUB_V2U16:
673
return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
674
675
#if 0
676
/* Restriction on IADD in 64-bit clauses on G72 */
677
case BI_OPCODE_IADD_S64:
678
case BI_OPCODE_IADD_U64:
679
return (src == 1) && (swizzle != BI_SWIZZLE_D0);
680
#endif
681
682
default:
683
return false;
684
}
685
}
686
687
ASSERTED static bool
688
bi_reads_t(bi_instr *ins, unsigned src)
689
{
690
/* Branch offset cannot come from passthrough */
691
if (bi_opcode_props[ins->op].branch)
692
return src != 2;
693
694
/* Table can never read passthrough */
695
if (bi_opcode_props[ins->op].table)
696
return false;
697
698
/* Staging register reads may happen before the succeeding register
699
* block encodes a write, so effectively there is no passthrough */
700
if (src == 0 && bi_opcode_props[ins->op].sr_read)
701
return false;
702
703
/* Bifrost cores newer than Mali G71 have restrictions on swizzles on
704
* same-cycle temporaries. Check the list for these hazards. */
705
if (bi_impacted_t_modifiers(ins, src))
706
return false;
707
708
/* Descriptor must not come from a passthrough */
709
switch (ins->op) {
710
case BI_OPCODE_LD_CVT:
711
case BI_OPCODE_LD_TILE:
712
case BI_OPCODE_ST_CVT:
713
case BI_OPCODE_ST_TILE:
714
case BI_OPCODE_TEXC:
715
return src != 2;
716
case BI_OPCODE_BLEND:
717
return src != 2 && src != 3;
718
719
/* Else, just check if we can read any temps */
720
default:
721
return bi_reads_temps(ins, src);
722
}
723
}
724
725
/* Counts the number of 64-bit constants required by a clause. TODO: We
726
* might want to account for merging, right now we overestimate, but
727
* that's probably fine most of the time */
728
729
static unsigned
730
bi_nconstants(struct bi_clause_state *clause)
731
{
732
unsigned count_32 = 0;
733
734
for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
735
count_32 += clause->consts[i].constant_count;
736
737
return DIV_ROUND_UP(count_32, 2);
738
}
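
/* For example (illustrative counts), tuples contributing {2, 1, 1, 0, ...}
 * 32-bit constants give count_32 = 4 and hence DIV_ROUND_UP(4, 2) = 2
 * 64-bit constant words. */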
739
740
/* Would there be space for constants if we added one tuple? */
741
742
static bool
743
bi_space_for_more_constants(struct bi_clause_state *clause)
744
{
745
return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
746
}
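
/* In other words, the clause-wide budget assumed here is that the number of
 * 64-bit constant words plus the tuple count (including the tuple about to
 * be added) must stay below 13. For example, with 4 tuples already built,
 * at most 7 constant words may be in use for another tuple to fit. */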
747
748
/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */
752
753
static bool
754
bi_update_fau(struct bi_clause_state *clause,
755
struct bi_tuple_state *tuple,
756
bi_instr *instr, bool fma, bool destructive)
757
{
758
/* Maintain our own constants, for nondestructive mode */
759
uint32_t copied_constants[2], copied_count;
760
unsigned *constant_count = &tuple->constant_count;
761
uint32_t *constants = tuple->constants;
762
enum bir_fau fau = tuple->fau;
763
764
if (!destructive) {
765
memcpy(copied_constants, tuple->constants,
766
(*constant_count) * sizeof(constants[0]));
767
copied_count = tuple->constant_count;
768
769
constant_count = &copied_count;
770
constants = copied_constants;
771
}
772
773
bi_foreach_src(instr, s) {
774
bi_index src = instr->src[s];
775
776
if (src.type == BI_INDEX_FAU) {
777
bool no_constants = *constant_count == 0;
778
bool no_other_fau = (fau == src.value) || !fau;
779
bool mergable = no_constants && no_other_fau;
780
781
if (destructive) {
782
assert(mergable);
783
tuple->fau = src.value;
784
} else if (!mergable) {
785
return false;
786
}
787
788
fau = src.value;
789
} else if (src.type == BI_INDEX_CONSTANT) {
790
/* No need to reserve space if we have a fast 0 */
791
if (src.value == 0 && fma && bi_reads_zero(instr))
792
continue;
793
794
/* If there is a branch target, #0 by convention is the
795
* PC-relative offset to the target */
796
bool pcrel = instr->branch_target && src.value == 0;
797
bool found = false;
798
799
for (unsigned i = 0; i < *constant_count; ++i) {
800
found |= (constants[i] == src.value) &&
801
(i != tuple->pcrel_idx);
802
}
803
804
/* pcrel constants are unique, so don't match */
805
if (found && !pcrel)
806
continue;
807
808
bool no_fau = (*constant_count > 0) || !fau;
809
bool mergable = no_fau && ((*constant_count) < 2);
810
811
if (destructive) {
812
assert(mergable);
813
814
if (pcrel)
815
tuple->pcrel_idx = *constant_count;
816
} else if (!mergable)
817
return false;
818
819
constants[(*constant_count)++] = src.value;
820
}
821
}
822
823
/* Constants per clause may be limited by tuple count */
824
bool room_for_constants = (*constant_count == 0) ||
825
bi_space_for_more_constants(clause);
826
827
if (destructive)
828
assert(room_for_constants);
829
else if (!room_for_constants)
830
return false;
831
832
return true;
833
}
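
/* bi_update_fau() is used in two modes: bi_instr_schedulable() calls it with
 * destructive = false as a dry run to test whether an instruction's constants
 * and FAU slot fit the tuple, and bi_pop_instr() repeats the call with
 * destructive = true to commit the assignment once the instruction is taken. */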
834
835
/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
836
* and a source (index) from that candidate, determine whether this source is
837
* "new", in the sense of requiring an additional read slot. That is, checks
838
* whether the specified source reads from the register file via a read slot
839
* (determined by its type and placement) and whether the source was already
840
* specified by a prior read slot (to avoid double counting) */
841
842
static bool
843
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
844
{
845
bi_index src = instr->src[src_idx];
846
847
/* Only consider sources which come from the register file */
848
if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
849
return false;
850
851
/* Staging register reads bypass the usual register file mechanism */
852
if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
853
return false;
854
855
/* If a source is already read in the tuple, it is already counted */
856
for (unsigned t = 0; t < reg->nr_reads; ++t)
857
if (bi_is_word_equiv(src, reg->reads[t]))
858
return false;
859
860
/* If a source is read in _this instruction_, it is already counted */
861
for (unsigned t = 0; t < src_idx; ++t)
862
if (bi_is_word_equiv(src, instr->src[t]))
863
return false;
864
865
return true;
866
}
867
868
/* Given two tuples in source order, count the number of register reads of the
869
* successor, determined as the number of unique words accessed that aren't
870
* written by the predecessor (since those are tempable).
871
*/
872
873
static unsigned
874
bi_count_succ_reads(bi_index t0, bi_index t1,
875
bi_index *succ_reads, unsigned nr_succ_reads)
876
{
877
unsigned reads = 0;
878
879
for (unsigned i = 0; i < nr_succ_reads; ++i) {
880
bool unique = true;
881
882
for (unsigned j = 0; j < i; ++j)
883
if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
884
unique = false;
885
886
if (!unique)
887
continue;
888
889
if (bi_is_word_equiv(succ_reads[i], t0))
890
continue;
891
892
if (bi_is_word_equiv(succ_reads[i], t1))
893
continue;
894
895
reads++;
896
}
897
898
return reads;
899
}
900
901
/* Not all instructions can read from the staging passthrough (as determined by
902
* reads_t), check if a given pair of instructions has such a restriction. Note
903
* we also use this mechanism to prevent data races around staging register
904
* reads, so we allow the input source to potentially be vector-valued */
905
906
static bool
907
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
908
{
909
bi_foreach_src(add, s) {
910
bi_index src = add->src[s];
911
912
if (src.type != BI_INDEX_REGISTER)
913
continue;
914
915
unsigned count = bi_count_read_registers(add, s);
916
bool read = false;
917
918
for (unsigned d = 0; d < count; ++d)
919
read |= bi_is_equiv(fma, bi_register(src.value + d));
920
921
if (read && !bi_reads_t(add, s))
922
return true;
923
}
924
925
return false;
926
}
927
928
/* Likewise for cross-tuple passthrough (reads_temps) */
929
930
static bool
931
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
932
{
933
bi_foreach_instr_in_tuple(succ, pins) {
934
bi_foreach_src(pins, s) {
935
if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
936
!bi_reads_temps(pins, s))
937
return true;
938
}
939
}
940
941
return false;
942
}
943
944
/* Is a register written other than via the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple. */
949
950
static unsigned
951
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
952
{
953
if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
954
return 1;
955
956
unsigned count = 0;
957
958
bi_foreach_dest(instr, d) {
959
if (d == 0 && bi_opcode_props[instr->op].sr_write)
960
continue;
961
962
if (bi_is_null(instr->dest[d]))
963
continue;
964
965
assert(instr->dest[0].type == BI_INDEX_REGISTER);
966
if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
967
count++;
968
}
969
970
return count;
971
}
972
973
/* Instruction placement entails two questions: what subset of instructions in
974
* the block can legally be scheduled? and of those which is the best? That is,
975
* we seek to maximize a cost function on a subset of the worklist satisfying a
976
* particular predicate. The necessary predicate is determined entirely by
977
* Bifrost's architectural limitations and is described in the accompanying
978
* whitepaper. The cost function is a heuristic. */
979
980
static bool
981
bi_instr_schedulable(bi_instr *instr,
982
struct bi_clause_state *clause,
983
struct bi_tuple_state *tuple,
984
uint64_t live_after_temp,
985
bool fma)
986
{
987
/* The units must match */
988
if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
989
return false;
990
991
/* There can only be one message-passing instruction per clause */
992
if (bi_must_message(instr) && clause->message)
993
return false;
994
995
/* Some instructions have placement requirements */
996
if (bi_must_last(instr) && !tuple->last)
997
return false;
998
999
if (bi_must_not_last(instr) && tuple->last)
1000
return false;
1001
1002
/* Message-passing instructions are not guaranteed to write within the
 * same clause (most likely they will not), so if a later instruction
 * in the clause accesses the destination, the message-passing
 * instruction can't be scheduled */
1006
if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
1007
unsigned nr = bi_count_write_registers(instr, 0);
1008
assert(instr->dest[0].type == BI_INDEX_REGISTER);
1009
unsigned reg = instr->dest[0].value;
1010
1011
for (unsigned i = 0; i < clause->access_count; ++i) {
1012
bi_index idx = clause->accesses[i];
1013
for (unsigned d = 0; d < nr; ++d) {
1014
if (bi_is_equiv(bi_register(reg + d), idx))
1015
return false;
1016
}
1017
}
1018
}
1019
1020
if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1021
unsigned nr = bi_count_read_registers(instr, 0);
1022
assert(instr->src[0].type == BI_INDEX_REGISTER);
1023
unsigned reg = instr->src[0].value;
1024
1025
for (unsigned i = 0; i < clause->access_count; ++i) {
1026
bi_index idx = clause->accesses[i];
1027
for (unsigned d = 0; d < nr; ++d) {
1028
if (bi_is_equiv(bi_register(reg + d), idx))
1029
return false;
1030
}
1031
}
1032
}
1033
1034
/* If FAU is already assigned, we may not disrupt that. Do a
1035
* non-disruptive test update */
1036
if (!bi_update_fau(clause, tuple, instr, fma, false))
1037
return false;
1038
1039
/* If this choice of FMA would force a staging passthrough, the ADD
1040
* instruction must support such a passthrough */
1041
if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1042
return false;
1043
1044
/* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1045
if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1046
return false;
1047
1048
/* Register file writes are limited */
1049
unsigned total_writes = tuple->reg.nr_writes;
1050
total_writes += bi_write_count(instr, live_after_temp);
1051
1052
/* Last tuple in a clause can only write a single value */
1053
if (tuple->last && total_writes > 1)
1054
return false;
1055
1056
/* Register file reads are limited, so count unique */
1057
1058
unsigned unique_new_srcs = 0;
1059
1060
bi_foreach_src(instr, s) {
1061
if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1062
unique_new_srcs++;
1063
}
1064
1065
unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1066
1067
bool can_spill_to_moves = (!tuple->add);
1068
can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1069
can_spill_to_moves &= (clause->tuple_count < 7);
1070
1071
/* However, we can get an extra 1 or 2 sources by inserting moves */
1072
if (total_srcs > (can_spill_to_moves ? 4 : 3))
1073
return false;
1074
1075
/* Count effective reads for the successor */
1076
unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1077
tuple->add ? tuple->add->dest[0] : bi_null(),
1078
tuple->prev_reads, tuple->nr_prev_reads);
1079
1080
/* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1081
if ((signed) total_writes > (4 - (signed) succ_reads))
1082
return false;
1083
1084
return true;
1085
}
1086
1087
static signed
1088
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1089
{
1090
signed cost = 0;
1091
1092
/* Instructions that can schedule to either FMA or to ADD should be
1093
* deprioritized since they're easier to reschedule elsewhere */
1094
if (bi_can_fma(instr) && bi_can_add(instr))
1095
cost++;
1096
1097
/* Message-passing instructions impose constraints on the registers
1098
* later in the clause, so schedule them as late within a clause as
1099
* possible (<==> prioritize them since we're backwards <==> decrease
1100
* cost) */
1101
if (bi_must_message(instr))
1102
cost--;
1103
1104
/* Last instructions are big constraints (XXX: no effect on shader-db) */
1105
if (bi_must_last(instr))
1106
cost -= 2;
1107
1108
return cost;
1109
}
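
/* With the rules above the cost ranges from -3 to +1; lower is better, and
 * ties favour later instructions because bi_choose_index() keeps scanning
 * with a <= comparison. */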
1110
1111
static unsigned
1112
bi_choose_index(struct bi_worklist st,
1113
struct bi_clause_state *clause,
1114
struct bi_tuple_state *tuple,
1115
uint64_t live_after_temp,
1116
bool fma)
1117
{
1118
unsigned i, best_idx = ~0;
1119
signed best_cost = INT_MAX;
1120
1121
BITSET_FOREACH_SET(i, st.worklist, st.count) {
1122
bi_instr *instr = st.instructions[i];
1123
1124
if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1125
continue;
1126
1127
signed cost = bi_instr_cost(instr, tuple);
1128
1129
/* Tie break in favour of later instructions, under the
 * assumption this promotes temporary usage (reducing pressure
 * on the register file). This is a side effect of a prepass
 * that schedules for pressure. */
1133
1134
if (cost <= best_cost) {
1135
best_idx = i;
1136
best_cost = cost;
1137
}
1138
}
1139
1140
return best_idx;
1141
}
1142
1143
static void
1144
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1145
bi_instr *instr, uint64_t live_after_temp, bool fma)
1146
{
1147
bi_update_fau(clause, tuple, instr, fma, true);
1148
1149
/* TODO: maybe opt a bit? or maybe doesn't matter */
1150
assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1151
memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1152
clause->access_count += BI_MAX_SRCS;
1153
memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1154
clause->access_count += BI_MAX_DESTS;
1155
tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1156
1157
bi_foreach_src(instr, s) {
1158
if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1159
tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1160
}
1161
}
1162
1163
/* Choose the best instruction and pop it off the worklist. Returns NULL if no
1164
* instruction is available. This function is destructive. */
1165
1166
static bi_instr *
1167
bi_take_instr(bi_context *ctx, struct bi_worklist st,
1168
struct bi_clause_state *clause,
1169
struct bi_tuple_state *tuple,
1170
uint64_t live_after_temp,
1171
bool fma)
1172
{
1173
if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1174
return bi_lower_cubeface(ctx, clause, tuple);
1175
else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
1176
return bi_lower_atom_c(ctx, clause, tuple);
1177
else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
1178
return bi_lower_atom_c1(ctx, clause, tuple);
1179
else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1180
return bi_lower_seg_add(ctx, clause, tuple);
1181
else if (tuple->add && tuple->add->table)
1182
return bi_lower_dtsel(ctx, clause, tuple);
1183
1184
/* TODO: Optimize these moves */
1185
if (!fma && tuple->nr_prev_reads > 3) {
1186
/* Only spill by one source for now */
1187
assert(tuple->nr_prev_reads == 4);
1188
1189
/* Pick a source to spill */
1190
bi_index src = tuple->prev_reads[0];
1191
1192
/* Schedule the spill */
1193
bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1194
bi_instr *mov = bi_mov_i32_to(&b, src, src);
1195
bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1196
return mov;
1197
}
1198
1199
#ifndef NDEBUG
1200
/* Don't pair instructions if debugging */
1201
if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1202
return NULL;
1203
#endif
1204
1205
unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1206
1207
if (idx >= st.count)
1208
return NULL;
1209
1210
/* Update state to reflect taking the instruction */
1211
bi_instr *instr = st.instructions[idx];
1212
1213
BITSET_CLEAR(st.worklist, idx);
1214
bi_update_worklist(st, idx);
1215
bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1216
1217
/* Fixups */
1218
if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1219
assert(bi_can_iaddc(instr));
1220
instr->op = BI_OPCODE_IADDC_I32;
1221
instr->src[2] = bi_zero();
1222
}
1223
1224
return instr;
1225
}
1226
1227
/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1228
* to a passthrough register. If except_zero is true, the zeroth (first) source
1229
* is skipped, so staging register reads are not accidentally encoded as
1230
* passthrough (which is impossible) */
1231
1232
static void
1233
bi_use_passthrough(bi_instr *ins, bi_index old,
1234
enum bifrost_packed_src new,
1235
bool except_zero)
1236
{
1237
/* Optional for convenience */
1238
if (!ins || bi_is_null(old))
1239
return;
1240
1241
bi_foreach_src(ins, i) {
1242
if (i == 0 && except_zero)
1243
continue;
1244
1245
if (bi_is_word_equiv(ins->src[i], old)) {
1246
ins->src[i].type = BI_INDEX_PASS;
1247
ins->src[i].value = new;
1248
ins->src[i].reg = false;
1249
ins->src[i].offset = 0;
1250
}
1251
}
1252
}
1253
1254
/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1255
* intertuple passthroughs where necessary. Passthroughs are allowed as a
1256
* post-condition of scheduling. Note we rewrite ADD first, FMA second --
1257
* opposite the order of execution. This is deliberate -- if both FMA and ADD
1258
* write to the same logical register, the next executed tuple will get the
1259
* latter result. There's no interference issue under the assumption of correct
1260
* register allocation. */
1261
1262
static void
1263
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1264
{
1265
bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1266
1267
if (prec.add) {
1268
bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1269
bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1270
}
1271
1272
if (prec.fma) {
1273
bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1274
bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1275
}
1276
}
1277
1278
static void
1279
bi_rewrite_fau_to_pass(bi_tuple *tuple)
1280
{
1281
bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1282
if (ins->src[s].type != BI_INDEX_FAU) continue;
1283
1284
bi_index pass = bi_passthrough(ins->src[s].offset ?
1285
BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1286
1287
ins->src[s] = bi_replace_index(ins->src[s], pass);
1288
}
1289
}
1290
1291
static void
1292
bi_rewrite_zero(bi_instr *ins, bool fma)
1293
{
1294
bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1295
1296
bi_foreach_src(ins, s) {
1297
bi_index src = ins->src[s];
1298
1299
if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1300
ins->src[s] = bi_replace_index(src, zero);
1301
}
1302
}
1303
1304
/* Assumes #0 to {T, FAU} rewrite has already occurred */
1305
1306
static void
1307
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1308
{
1309
bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1310
if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1311
1312
uint32_t cons = ins->src[s].value;
1313
1314
ASSERTED bool lo = (cons == (constant & 0xffffffff));
1315
bool hi = (cons == (constant >> 32ull));
1316
1317
/* PC offsets always live in the upper half, set to zero by
1318
* convention before pack time. (This is safe, since if you
1319
* wanted to compare against zero, you would use a BRANCHZ
1320
* instruction instead.) */
1321
if (cons == 0 && ins->branch_target != NULL) {
1322
assert(pcrel);
1323
hi = true;
1324
lo = false;
1325
} else if (pcrel) {
1326
hi = false;
1327
}
1328
1329
assert(lo || hi);
1330
1331
ins->src[s] = bi_replace_index(ins->src[s],
1332
bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
1333
BIFROST_SRC_FAU_LO));
1334
}
1335
}
1336
1337
/* Constructs a constant state given a tuple state. This has the
1338
* postcondition that pcrel applies to the first constant by convention,
1339
* and PC-relative constants will be #0 by convention here, so swap to
1340
* match if needed */
1341
1342
static struct bi_const_state
1343
bi_get_const_state(struct bi_tuple_state *tuple)
1344
{
1345
struct bi_const_state consts = {
1346
.constant_count = tuple->constant_count,
1347
.constants[0] = tuple->constants[0],
1348
.constants[1] = tuple->constants[1],
1349
.pcrel = tuple->add && tuple->add->branch_target,
1350
};
1351
1352
/* pcrel applies to the first constant by convention, and
1353
* PC-relative constants will be #0 by convention here, so swap
1354
* to match if needed */
1355
if (consts.pcrel && consts.constants[0]) {
1356
assert(consts.constant_count == 2);
1357
assert(consts.constants[1] == 0);
1358
1359
consts.constants[1] = consts.constants[0];
1360
consts.constants[0] = 0;
1361
}
1362
1363
return consts;
1364
}
1365
1366
/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles modification
 * related swapping, to satisfy format 12 and the lack of modification for
 * tuple count 5/8 in EC0.
 */
1381
1382
static uint64_t
1383
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1384
{
1385
/* At this point in the constant merge algorithm, pcrel constants are
 * treated as zero, so pcrel implies at least one constant is zero */
1387
assert(!pcrel || (c0 == 0 || c1 == 0));
1388
1389
/* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1390
uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1391
uint32_t lo = (c0 == hi) ? c1 : c0;
1392
1393
/* Merge in the selected order */
1394
return lo | (((uint64_t) hi) << 32ull);
1395
}
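
/* Worked example (illustrative values): c0 = 0x11223344, c1 = 0xaabbccdd,
 * pcrel = false gives hi = 0xaabbccdd, lo = 0x11223344, i.e. the merged word
 * 0xaabbccdd11223344. With pcrel = true one input must be 0; that zero
 * (PC-relative) half is forced into the high word. */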
1396
1397
static unsigned
1398
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1399
uint64_t *merged, unsigned *pcrel_pair)
1400
{
1401
unsigned merge_count = 0;
1402
1403
for (unsigned t = 0; t < tuple_count; ++t) {
1404
if (consts[t].constant_count != 2) continue;
1405
1406
unsigned idx = ~0;
1407
uint64_t val = bi_merge_u32(consts[t].constants[0],
1408
consts[t].constants[1], consts[t].pcrel);
1409
1410
/* Skip the pcrel pair if assigned, because if one is assigned,
1411
* this one is not pcrel by uniqueness so it's a mismatch */
1412
for (unsigned s = 0; s < merge_count; ++s) {
1413
if (merged[s] == val && (*pcrel_pair) != s) {
1414
idx = s;
1415
break;
1416
}
1417
}
1418
1419
if (idx == ~0) {
1420
idx = merge_count++;
1421
merged[idx] = val;
1422
1423
if (consts[t].pcrel)
1424
(*pcrel_pair) = idx;
1425
}
1426
1427
consts[t].word_idx = idx;
1428
}
1429
1430
return merge_count;
1431
}
1432
1433
static unsigned
1434
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1435
uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1436
{
1437
bool pending = false, pending_pcrel = false;
1438
uint32_t pending_single = 0;
1439
1440
for (unsigned t = 0; t < tuple_count; ++t) {
1441
if (consts[t].constant_count != 1) continue;
1442
1443
uint32_t val = consts[t].constants[0];
1444
unsigned idx = ~0;
1445
1446
/* Try to match, but don't match pcrel with non-pcrel, even
1447
* though we can merge a pcrel with a non-pcrel single */
1448
for (unsigned i = 0; i < pair_count; ++i) {
1449
bool lo = ((pairs[i] & 0xffffffff) == val);
1450
bool hi = ((pairs[i] >> 32) == val);
1451
bool match = (lo || hi);
1452
match &= ((*pcrel_pair) != i);
1453
if (match && !consts[t].pcrel) {
1454
idx = i;
1455
break;
1456
}
1457
}
1458
1459
if (idx == ~0) {
1460
idx = pair_count;
1461
1462
if (pending && pending_single != val) {
1463
assert(!(pending_pcrel && consts[t].pcrel));
1464
bool pcrel = pending_pcrel || consts[t].pcrel;
1465
1466
if (pcrel)
1467
*pcrel_pair = idx;
1468
1469
pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1470
1471
pending = pending_pcrel = false;
1472
} else {
1473
pending = true;
1474
pending_pcrel = consts[t].pcrel;
1475
pending_single = val;
1476
}
1477
}
1478
1479
consts[t].word_idx = idx;
1480
}
1481
1482
/* Shift so it works whether pending_pcrel is set or not */
1483
if (pending) {
1484
if (pending_pcrel)
1485
*pcrel_pair = pair_count;
1486
1487
pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1488
}
1489
1490
return pair_count;
1491
}
1492
1493
static unsigned
1494
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1495
{
1496
unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1497
return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1498
}
1499
1500
/* Swap two constants at word i and i+1 by swapping their actual positions and
1501
* swapping all references so the meaning of the clause is preserved */
1502
1503
static void
1504
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1505
{
1506
uint64_t tmp_pair = pairs[i + 0];
1507
pairs[i + 0] = pairs[i + 1];
1508
pairs[i + 1] = tmp_pair;
1509
1510
for (unsigned t = 0; t < 8; ++t) {
1511
if (consts[t].word_idx == i)
1512
consts[t].word_idx = (i + 1);
1513
else if (consts[t].word_idx == (i + 1))
1514
consts[t].word_idx = i;
1515
}
1516
}
1517
1518
/* Given merged constants, one of which might be PC-relative, fix up the M
1519
* values so the PC-relative constant (if it exists) has the M1=4 modification
1520
* and other constants are used as-is (which might require swapping) */
1521
1522
static unsigned
1523
bi_apply_constant_modifiers(struct bi_const_state *consts,
1524
uint64_t *pairs, unsigned *pcrel_idx,
1525
unsigned tuple_count, unsigned constant_count)
1526
{
1527
unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1528
1529
/* Clauses with these tuple counts lack an M field for the packed EC0,
1530
* so EC0 cannot be PC-relative, which might require swapping (and
1531
* possibly adding an unused constant) to fit */
1532
1533
if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1534
constant_count = MAX2(constant_count, 2);
1535
*pcrel_idx = 1;
1536
bi_swap_constants(consts, pairs, 0);
1537
}
1538
1539
/* EC0 might be packed free, after that constants are packed in pairs
1540
* (with clause format 12), with M1 values computed from the pair */
1541
1542
for (unsigned i = start; i < constant_count; i += 2) {
1543
bool swap = false;
1544
bool last = (i + 1) == constant_count;
1545
1546
unsigned A1 = (pairs[i] >> 60);
1547
unsigned B1 = (pairs[i + 1] >> 60);
1548
1549
if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1550
/* PC-relative constant must be E0, not E1 */
1551
swap = (*pcrel_idx == (i + 1));
1552
1553
/* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1554
* equivalent to A = (B + 4) mod 16 and that we can
1555
* control A */
1556
unsigned B = swap ? A1 : B1;
1557
unsigned A = (B + 4) & 0xF;
1558
pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
1559
1560
/* Swapped if swap set, identity if swap not set */
1561
*pcrel_idx = i;
1562
} else {
1563
/* Compute M1 value if we don't swap */
1564
unsigned M1 = (16 + A1 - B1) & 0xF;
1565
1566
/* For M1 = 0 or M1 >= 8, the constants are unchanged,
1567
* we have 0 < (A1 - B1) % 16 < 8, which implies (B1 -
1568
* A1) % 16 >= 8, so swapping will let them be used
1569
* unchanged */
1570
swap = (M1 != 0) && (M1 < 8);
1571
1572
/* However, we can't swap the last constant, so we
1573
* force M1 = 0 instead for this case */
1574
if (last && swap) {
1575
pairs[i + 1] |= pairs[i] & (0xfull << 60);
1576
swap = false;
1577
}
1578
}
1579
1580
if (swap) {
1581
assert(!last);
1582
bi_swap_constants(consts, pairs, i);
1583
}
1584
}
1585
1586
return constant_count;
1587
}
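
/* Worked example for the M1 arithmetic above (illustrative nibbles): with
 * A1 = 0x3, B1 = 0x9, M1 = (16 + 3 - 9) & 0xF = 0xA >= 8, so the pair is used
 * unchanged; with A1 = 0x9, B1 = 0x3, M1 = 6, so the constants are swapped
 * (or, for the final pair, the top nibble of E0 is copied into E1 to force
 * M1 = 0). */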
1588
1589
/* Schedule a single clause. If no instructions remain, return NULL. */
1590
1591
static bi_clause *
1592
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1593
{
1594
struct bi_clause_state clause_state = { 0 };
1595
bi_clause *clause = rzalloc(ctx, bi_clause);
1596
bi_tuple *tuple = NULL;
1597
1598
const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1599
1600
/* TODO: Decide flow control better */
1601
clause->flow_control = BIFROST_FLOW_NBTB;
1602
1603
/* The last tuple in a clause can only write a single value, so initialize that */
1604
struct bi_reg_state reg_state = {};
1605
bi_index prev_reads[5] = { bi_null() };
1606
unsigned nr_prev_reads = 0;
1607
1608
/* We need to track future liveness. The main *live set tracks what is
 * live at the current point in the program we are scheduling, but to
 * determine temp eligibility, we instead want what will be live after
 * the next tuple in the program. If you scheduled forwards, you'd need
 * a crystal ball for this. Luckily we schedule backwards, so we just
 * delay updates to live_after_temp by an extra tuple. */
1614
uint64_t live_after_temp = *live;
1615
uint64_t live_next_tuple = live_after_temp;
1616
1617
do {
1618
struct bi_tuple_state tuple_state = {
1619
.last = (clause->tuple_count == 0),
1620
.reg = reg_state,
1621
.nr_prev_reads = nr_prev_reads,
1622
.prev = tuple,
1623
.pcrel_idx = ~0,
1624
};
1625
1626
assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1627
memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1628
1629
unsigned idx = max_tuples - clause->tuple_count - 1;
1630
1631
tuple = &clause->tuples[idx];
1632
1633
if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1634
unsigned nr = bi_count_read_registers(clause->message, 0);
1635
live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1636
}
1637
1638
/* Since we schedule backwards, we schedule ADD first */
1639
tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1640
tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1641
tuple->add = tuple_state.add;
1642
1643
/* Update liveness from the new instructions */
1644
if (tuple->add)
1645
*live = bi_postra_liveness_ins(*live, tuple->add);
1646
1647
if (tuple->fma)
1648
*live = bi_postra_liveness_ins(*live, tuple->fma);
1649
1650
/* Rotate in the new per-tuple liveness */
1651
live_after_temp = live_next_tuple;
1652
live_next_tuple = *live;
1653
1654
/* We may have a message, but only one per clause */
1655
if (tuple->add && bi_must_message(tuple->add)) {
1656
assert(!clause_state.message);
1657
clause_state.message = true;
1658
1659
clause->message_type =
1660
bi_message_type_for_instr(tuple->add);
1661
clause->message = tuple->add;
1662
1663
switch (tuple->add->op) {
1664
case BI_OPCODE_ATEST:
1665
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1666
break;
1667
case BI_OPCODE_LD_TILE:
1668
if (!ctx->inputs->is_blend)
1669
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1670
break;
1671
case BI_OPCODE_BLEND:
1672
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1673
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1674
break;
1675
default:
1676
break;
1677
}
1678
}
1679
1680
clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1681
1682
/* Before merging constants, eliminate zeroes, otherwise the
1683
* merging will fight over the #0 that never gets read (and is
1684
* never marked as read by update_fau) */
1685
if (tuple->fma && bi_reads_zero(tuple->fma))
1686
bi_rewrite_zero(tuple->fma, true);
1687
1688
/* Rewrite away FAU, constant write is deferred */
1689
if (!tuple_state.constant_count) {
1690
tuple->fau_idx = tuple_state.fau;
1691
bi_rewrite_fau_to_pass(tuple);
1692
}
1693
1694
/* Use passthrough register for cross-stage accesses. Since
1695
* there are just FMA and ADD stages, that means we rewrite to
1696
* passthrough the sources of the ADD that read from the
1697
* destination of the FMA */
1698
1699
if (tuple->fma) {
1700
bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1701
BIFROST_SRC_STAGE, false);
1702
}
1703
1704
/* Don't add an empty tuple, unless the worklist has nothing
1705
* but a (pseudo)instruction failing to schedule due to a "not
1706
* last instruction" constraint */
1707
1708
int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1709
bool not_last = (some_instruction > 0) &&
1710
bi_must_not_last(st.instructions[some_instruction - 1]);
1711
1712
bool insert_empty = tuple_state.last && not_last;
1713
1714
if (!(tuple->fma || tuple->add || insert_empty))
1715
break;
1716
1717
clause->tuple_count++;
1718
1719
/* Adding enough tuples might overflow constants */
1720
if (!bi_space_for_more_constants(&clause_state))
1721
break;
1722
1723
#ifndef NDEBUG
1724
/* Don't schedule more than 1 tuple if debugging */
1725
if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1726
break;
1727
#endif
1728
1729
/* Link through the register state */
1730
STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1731
memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1732
nr_prev_reads = tuple_state.reg.nr_reads;
1733
clause_state.tuple_count++;
1734
} while(clause->tuple_count < 8);
1735
1736
/* Don't schedule an empty clause */
1737
if (!clause->tuple_count)
1738
return NULL;
1739
1740
/* Before merging, rewrite away any tuples that read only zero */
1741
for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1742
bi_tuple *tuple = &clause->tuples[i];
1743
struct bi_const_state *st = &clause_state.consts[i];
1744
1745
if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1746
continue;
1747
1748
bi_foreach_instr_in_tuple(tuple, ins)
1749
bi_rewrite_zero(ins, false);
1750
1751
/* Constant has been demoted to FAU, so don't pack it separately */
1752
st->constant_count = 0;
1753
1754
/* Default */
1755
assert(tuple->fau_idx == BIR_FAU_ZERO);
1756
}
1757
1758
uint64_t constant_pairs[8] = { 0 };
1759
unsigned pcrel_idx = ~0;
1760
unsigned constant_words =
1761
bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1762
1763
constant_words = bi_apply_constant_modifiers(clause_state.consts,
1764
constant_pairs, &pcrel_idx, clause->tuple_count,
1765
constant_words);
1766
1767
clause->pcrel_idx = pcrel_idx;
1768
1769
        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];

                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
                if (clause_state.consts[i].constant_count == 0)
                        continue;

                /* FAU is already handled */
                assert(!tuple->fau_idx);

                unsigned word_idx = clause_state.consts[i].word_idx;
                assert(word_idx < 8);

                /* We could try to merge regardless of bottom bits as well, but
                 * that's probably diminishing returns */
                uint64_t pair = constant_pairs[word_idx];
                unsigned lo = pair & 0xF;

                tuple->fau_idx = bi_constant_field(word_idx) | lo;
                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
        }

        clause->constant_count = constant_words;
        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));

        /* Branches must be last, so this can be factored out */
        bi_instr *last = clause->tuples[max_tuples - 1].add;
        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
        clause->block = block;

        /* TODO: scoreboard assignment post-sched */
        clause->dependencies |= (1 << 0);

        /* We emitted in reverse, to the back of the tuples array, so move it
         * up front for easy indexing */
        memmove(clause->tuples,
                clause->tuples + (max_tuples - clause->tuple_count),
                clause->tuple_count * sizeof(clause->tuples[0]));

        /* Use passthrough register for cross-tuple accesses. Note this is
         * after the memmove, so this is forwards. Skip the first tuple since
         * there is nothing before it to passthrough */

        for (unsigned t = 1; t < clause->tuple_count; ++t)
                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);

        return clause;
}
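/* Schedule a single block. Clauses are built bottom-up from the worklist
 * (liveness is seeded from the block's live-out set) and prepended to the
 * clause list, so the list ends up in program order. Afterwards the
 * instruction list is rebuilt to match the clause/tuple order. */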
static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* Back-to-back bit affects only the last clause of a block,
         * the rest are implicitly true */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (!bi_back_to_back(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */

        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->base.instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}
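/* Check whether source s of ins can be encoded with the FAU/constant state
 * accumulated so far: at most two distinct 32-bit inline constants (tracked
 * in constants/cwords) or a single FAU slot (tracked in fau), never both.
 * Returns false if the source would instead need lowering to a move. */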
static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (s == 0 && bi_opcode_props[ins->op].sr_read)
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return true;
}
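/* Lower sources that the FAU/constant machinery above cannot encode: any
 * offending source is copied into a temporary with a MOV.i32 inserted right
 * before the instruction, so scheduling only ever sees encodable sources. */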
void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}
/* On v6, ATEST cannot be the first clause of a shader, add a NOP if needed */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        pan_block *block = list_first_entry(&ctx->blocks, pan_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || clause->message_type != BIFROST_MESSAGE_ATEST)
                return;

        /* Add a NOP so we can wait for the dependencies required for ATEST to
         * execute */

        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP_I32;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}
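/* Top-level scheduling entry point: compute post-RA liveness, schedule each
 * block into clauses, run post-RA dead code elimination on the result, and
 * finally apply the v6 ATEST workaround above. */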
void
bi_schedule(bi_context *ctx)
{
        /* Fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_block *bblock = (bi_block *) block;
                bi_schedule_block(ctx, bblock);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}
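/* Lightweight unit tests for the unit/slot predicates, only built when
 * asserts are enabled. bit_builder spins up a throwaway context with a
 * single empty block to emit test instructions into. */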
#ifndef NDEBUG

static bi_builder *
bit_builder(void *memctx)
{
        bi_context *ctx = rzalloc(memctx, bi_context);
        list_inithead(&ctx->blocks);

        bi_block *blk = rzalloc(ctx, bi_block);

        blk->base.predecessors = _mesa_set_create(blk,
                        _mesa_hash_pointer,
                        _mesa_key_pointer_equal);

        list_addtail(&blk->base.link, &ctx->blocks);
        list_inithead(&blk->base.instructions);

        bi_builder *b = rzalloc(memctx, bi_builder);
        b->shader = ctx;
        b->cursor = bi_after_block(blk);
        return b;
}

#define TMP() bi_temp(b->shader)

static void
bi_test_units(bi_builder *b)
{
        bi_instr *mov = bi_mov_i32_to(b, TMP(), TMP());
        assert(bi_can_fma(mov));
        assert(bi_can_add(mov));
        assert(!bi_must_last(mov));
        assert(!bi_must_message(mov));
        assert(bi_reads_zero(mov));
        assert(bi_reads_temps(mov, 0));
        assert(bi_reads_t(mov, 0));

        bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero(), BI_ROUND_NONE);
        assert(bi_can_fma(fma));
        assert(!bi_can_add(fma));
        assert(!bi_must_last(fma));
        assert(!bi_must_message(fma));
        assert(bi_reads_zero(fma));
        for (unsigned i = 0; i < 3; ++i) {
                assert(bi_reads_temps(fma, i));
                assert(bi_reads_t(fma, i));
        }

        bi_instr *load = bi_load_i128_to(b, TMP(), TMP(), TMP(), BI_SEG_UBO);
        assert(!bi_can_fma(load));
        assert(bi_can_add(load));
        assert(!bi_must_last(load));
        assert(bi_must_message(load));
        for (unsigned i = 0; i < 2; ++i) {
                assert(bi_reads_temps(load, i));
                assert(bi_reads_t(load, i));
        }

        bi_instr *blend = bi_blend_to(b, TMP(), TMP(), TMP(), TMP(), TMP(), 4);
        assert(!bi_can_fma(blend));
        assert(bi_can_add(blend));
        assert(bi_must_last(blend));
        assert(bi_must_message(blend));
        for (unsigned i = 0; i < 4; ++i)
                assert(bi_reads_temps(blend, i));
        assert(!bi_reads_t(blend, 0));
        assert(bi_reads_t(blend, 1));
        assert(!bi_reads_t(blend, 2));
        assert(!bi_reads_t(blend, 3));
}

int bi_test_scheduler(void)
{
        void *memctx = NULL;

        bi_test_units(bit_builder(memctx));

        return 0;
}
#endif