GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/freedreno/a2xx/ir2.c

/*
 * Copyright (C) 2018 Jonathan Marek <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "ir2_private.h"

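/* an instruction can use the scalar ALU slot only if it has a scalar opcode
 * and reads a single component
 */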
static bool
scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

static bool
is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

/* priority of vector instruction for scheduling (lower=higher prio) */
static unsigned
alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower=higher prio) */
static unsigned
alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED to end of block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions get the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV alongside
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* walk back to the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

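      /* each reg_state entry packs a 4-bit component mask for 8 registers;
       * mr = components of reg_idx that are still free at this slot
       */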
      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* result not used anymore, since we will overwrite it */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

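   /* build the scalar mov (implemented with MAXs) that writes src1 into the
    * free component found above
    */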
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

/* fill sched with next fetch or (vector and/or scalar) alu instruction */
static int
sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr (instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and roughly the same idea applies to MEM exports, which might not be
       * alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src (src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr (p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      /* don't reorder non-ssa write before read */
      if (!instr->is_ssa) {
         ir2_foreach_instr (p, ctx) {
            if (!p->need_emit || p->idx >= instr->idx)
               continue;

            ir2_foreach_src (src, p) {
               if (get_reg_src(ctx, src) == instr->reg)
                  is_ok = false;
            }
         }
      }
      /* don't reorder across predicates */
      if (avail_count && instr->pred != avail[0]->pred)
         is_ok = false;

      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail (instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

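   /* an a2xx ALU instruction can co-issue one vector and one scalar operation:
    * pick the best vector instruction first, then try to pair a compatible
    * scalar instruction with it
    */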
   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail (instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, we try more complex insertion of vector instruction as scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail (instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }

   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);

   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

/* scheduling: determine order of instructions */
static void
schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       * TODO clean this up
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH &&
          instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

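         /* shift the fetch down one slot so SET_TEX_LOD lands right before it */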
         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr (instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   };
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = {};
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ra_count_refs(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* instruction order.. and vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}