Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/freedreno/ir3/ir3_delay.c
4565 views
1
/*
2
* Copyright (C) 2019 Google, Inc.
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
* SOFTWARE.
22
*
23
* Authors:
24
* Rob Clark <[email protected]>
25
*/
26
27
#include "ir3.h"
28
29
/* The maximum number of nop's we may need to insert between two instructions.
30
*/
31
#define MAX_NOPS 6
32
33
/* The soft delay for approximating the cost of (ss). On a6xx, it takes the
34
* number of delay slots to get a SFU result back (ie. using nop's instead of
35
* (ss) is:
36
*
37
* 8 - single warp
38
* 9 - two warps
39
* 10 - four warps
40
*
41
* and so on. Not quite sure where it tapers out (ie. how many warps share an
42
* SFU unit). But 10 seems like a reasonable # to choose:
43
*/
44
#define SOFT_SS_NOPS 10
45
46
/*
47
* Helpers to figure out the necessary delay slots between instructions. Used
48
* both in scheduling pass(es) and the final pass to insert any required nop's
49
* so that the shader program is valid.
50
*
51
* Note that this needs to work both pre and post RA, so we can't assume ssa
52
* src iterators work.
53
*/
54
55
/* calculate required # of delay slots between the instruction that
56
* assigns a value and the one that consumes
57
*/
58
int
59
ir3_delayslots(struct ir3_instruction *assigner,
60
struct ir3_instruction *consumer, unsigned n, bool soft)
61
{
62
/* generally don't count false dependencies, since this can just be
63
* something like a barrier, or SSBO store.
64
*/
65
if (__is_false_dep(consumer, n))
66
return 0;
67
68
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
69
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
70
* handled with sync bits
71
*/
72
73
if (is_meta(assigner) || is_meta(consumer))
74
return 0;
75
76
if (writes_addr0(assigner) || writes_addr1(assigner))
77
return 6;
78
79
if (soft && is_sfu(assigner))
80
return SOFT_SS_NOPS;
81
82
/* handled via sync flags: */
83
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
84
return 0;
85
86
/* As far as we know, shader outputs don't need any delay. */
87
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
88
return 0;
89
90
/* assigner must be alu: */
91
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
92
is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
93
return 6;
94
} else {
95
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
96
* a full-reg is read as a half-reg or when a half-reg is read as a
97
* full-reg.
98
*/
99
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
100
(consumer->srcs[n]->flags & IR3_REG_HALF);
101
unsigned penalty = mismatched_half ? 2 : 0;
102
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
103
/* special case, 3rd src to cat3 not required on first cycle */
104
return 1 + penalty;
105
} else {
106
return 3 + penalty;
107
}
108
}
109
}
110
111
static bool
112
count_instruction(struct ir3_instruction *n)
113
{
114
/* NOTE: don't count branch/jump since we don't know yet if they will
115
* be eliminated later in resolve_jumps().. really should do that
116
* earlier so we don't have this constraint.
117
*/
118
return is_alu(n) ||
119
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
120
}
121
122
static unsigned
123
distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
124
{
125
unsigned d = 0;
126
127
/* Note that this relies on incrementally building up the block's
128
* instruction list.. but this is how scheduling and nopsched
129
* work.
130
*/
131
foreach_instr_rev (n, &block->instr_list) {
132
if ((n == instr) || (d >= maxd))
133
return MIN2(maxd, d + n->nop);
134
if (count_instruction(n))
135
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
136
}
137
138
return maxd;
139
}
140
141
static unsigned
142
delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
143
struct ir3_instruction *consumer, unsigned srcn)
144
{
145
unsigned delay = 0;
146
147
if (assigner->opc == OPC_META_PHI)
148
return 0;
149
150
if (is_meta(assigner)) {
151
foreach_src_n (src, n, assigner) {
152
unsigned d;
153
154
if (!src->def)
155
continue;
156
157
d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
158
delay = MAX2(delay, d);
159
}
160
} else {
161
delay = ir3_delayslots(assigner, consumer, srcn, false);
162
delay -= distance(block, assigner, delay);
163
}
164
165
return delay;
166
}
167
168
/**
169
* Calculate delay for instruction before register allocation, using SSA
170
* source pointers. This can't handle inter-block dependencies.
171
*/
172
unsigned
173
ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
174
{
175
unsigned delay = 0;
176
177
foreach_src_n (src, i, instr) {
178
unsigned d = 0;
179
180
if (src->def && src->def->instr->block == block) {
181
d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
182
}
183
184
delay = MAX2(delay, d);
185
}
186
187
return delay;
188
}
189
190
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
191
* and have to handle relative accesses specially.
192
*/
193
194
static unsigned
195
post_ra_reg_elems(struct ir3_register *reg)
196
{
197
if (reg->flags & IR3_REG_RELATIV)
198
return reg->size;
199
return reg_elems(reg);
200
}
201
202
static unsigned
203
post_ra_reg_num(struct ir3_register *reg)
204
{
205
if (reg->flags & IR3_REG_RELATIV)
206
return reg->array.base;
207
return reg->num;
208
}
209
210
/* Post-RA delay between assigner's dst slot assigner_n and consumer's src
 * slot consumer_n, based on physical register overlap.  Accounts for (rpt)
 * on either side, which makes an instruction behave as a sequence of
 * sub-instructions.  Returns 0 when the registers don't alias at all.
 */
static unsigned
delay_calc_srcn_postra(struct ir3_instruction *assigner,
                       struct ir3_instruction *consumer, unsigned assigner_n,
                       unsigned consumer_n, bool soft, bool mergedregs)
{
   struct ir3_register *src = consumer->srcs[consumer_n];
   struct ir3_register *dst = assigner->dsts[assigner_n];
   bool mismatched_half =
      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

   /* In the mergedregs case or when the register is a special register,
    * half-registers do not alias with full registers.
    */
   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
       mismatched_half)
      return 0;

   /* Compute the footprint of src and dst in element-size units and
    * check whether the two ranges overlap at all:
    */
   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);

   if (dst_start >= src_end || src_start >= dst_end)
      return 0;

   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);

   /* Without (rpt) on either side there is nothing further to refine: */
   if (assigner->repeat == 0 && consumer->repeat == 0)
      return delay;

   /* If either side is a relative access, we can't really apply most of the
    * reasoning below because we don't know which component aliases which.
    * Just bail in this case.
    */
   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
      return delay;

   /* MOVMSK seems to require that all users wait until the entire
    * instruction is finished, so just bail here.
    */
   if (assigner->opc == OPC_MOVMSK)
      return delay;

   /* TODO: Handle the combination of (rpt) and different component sizes
    * better like below. This complicates things significantly because the
    * components don't line up.
    */
   if (mismatched_half)
      return delay;

   /* If an instruction has a (rpt), then it acts as a sequence of
    * instructions, reading its non-(r) sources at each cycle. First, get the
    * register num for the first instruction where they interfere:
    */

   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);

   /* Now, for that first conflicting half/full register, figure out the
    * sub-instruction within assigner/consumer it corresponds to. For (r)
    * sources, this should already return the correct answer of 0. However we
    * have to special-case the multi-mov instructions, where the
    * sub-instructions sometimes come from the src/dst indices instead.
    */
   unsigned first_src_instr;
   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
      first_src_instr = consumer_n;
   else
      first_src_instr = first_num - src->num;

   unsigned first_dst_instr;
   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
      first_dst_instr = assigner_n;
   else
      first_dst_instr = first_num - dst->num;

   /* The delay we return is relative to the *end* of assigner and the
    * *beginning* of consumer, because it's the number of nops (or other
    * things) needed between them. Any instructions after first_dst_instr
    * subtract from the delay, and so do any instructions before
    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
    * delay to account for that.
    *
    * Now, a priori, we need to go through this process for every
    * conflicting regnum and take the minimum of the offsets to make sure
    * that the appropriate number of nop's is inserted for every conflicting
    * pair of sub-instructions. However, as we go to the next conflicting
    * regnum (if any), the number of instructions after first_dst_instr
    * decreases by 1 and the number of source instructions before
    * first_src_instr correspondingly increases by 1, so the offset stays the
    * same for all conflicting registers.
    */
   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
   return offset > delay ? 0 : delay - offset;
}
304
305
static unsigned
306
delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
307
struct ir3_instruction *consumer, unsigned distance,
308
bool soft, bool pred, bool mergedregs)
309
{
310
unsigned delay = 0;
311
/* Search backwards starting at the instruction before start, unless it's
312
* NULL then search backwards from the block end.
313
*/
314
struct list_head *start_list =
315
start ? start->node.prev : block->instr_list.prev;
316
list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
317
&block->instr_list, node) {
318
if (count_instruction(assigner))
319
distance += assigner->nop;
320
321
if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
322
return delay;
323
324
if (is_meta(assigner))
325
continue;
326
327
unsigned new_delay = 0;
328
329
foreach_dst_n (dst, dst_n, assigner) {
330
if (dst->wrmask == 0)
331
continue;
332
foreach_src_n (src, src_n, consumer) {
333
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
334
continue;
335
336
unsigned src_delay = delay_calc_srcn_postra(
337
assigner, consumer, dst_n, src_n, soft, mergedregs);
338
new_delay = MAX2(new_delay, src_delay);
339
}
340
}
341
342
new_delay = new_delay > distance ? new_delay - distance : 0;
343
delay = MAX2(delay, new_delay);
344
345
if (count_instruction(assigner))
346
distance += 1 + assigner->repeat;
347
}
348
349
/* Note: this allows recursion into "block" if it has already been
350
* visited, but *not* recursion into its predecessors. We may have to
351
* visit the original block twice, for the loop case where we have to
352
* consider definititons in an earlier iterations of the same loop:
353
*
354
* while (...) {
355
* mov.u32u32 ..., r0.x
356
* ...
357
* mov.u32u32 r0.x, ...
358
* }
359
*
360
* However any other recursion would be unnecessary.
361
*/
362
363
if (pred && block->data != block) {
364
block->data = block;
365
366
for (unsigned i = 0; i < block->predecessors_count; i++) {
367
struct ir3_block *pred = block->predecessors[i];
368
unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
369
soft, pred, mergedregs);
370
delay = MAX2(delay, pred_delay);
371
}
372
373
block->data = NULL;
374
}
375
376
return delay;
377
}
378
379
/**
 * Calculate delay for post-RA scheduling based on physical registers but not
 * exact (i.e. don't recurse into predecessors, and make it possible to
 * estimate impact of sync flags).
 *
 * @soft: If true, add additional delay for situations where they
 *    would not be strictly required because a sync flag would be
 *    used (but scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on sync flag)
 * @mergedregs: True if mergedregs is enabled.
 */
unsigned
ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
                      bool soft, bool mergedregs)
{
   /* pred=false: predecessor blocks are not visited for this estimate. */
   return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
}
396
397
/**
 * Calculate delay for nop insertion. This must exactly match hardware
 * requirements, including recursing into predecessor blocks.
 */
unsigned
ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
                     bool mergedregs)
{
   /* soft=false: exact hardware requirement only; pred=true: recurse
    * into predecessor blocks.
    */
   return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
}
407
408
/**
 * Remove nop instructions. The scheduler can insert placeholder nop's
 * so that the delay calculations can account for nop's that won't be
 * needed due to nop's triggered by a previous instruction. However,
 * before legalize, we want to remove these. The legalize pass can
 * insert some nop's if needed to hold (for example) sync flags. These
 * final remaining nops are inserted by legalize after this.
 */
416
void
417
ir3_remove_nops(struct ir3 *ir)
418
{
419
foreach_block (block, &ir->block_list) {
420
foreach_instr_safe (instr, &block->instr_list) {
421
if (instr->opc == OPC_NOP) {
422
list_del(&instr->node);
423
}
424
}
425
}
426
}
427
428