/* GitHub Repository: PojavLauncherTeam/mesa
 * Path: blob/21.2-virgl/src/freedreno/ir3/ir3.c
 */
/*
 * Copyright (c) 2012 Rob Clark <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

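/* Purely illustrative sketch of the intended lifetime (the caller's code is
 * hypothetical): everything carved out with ir3_alloc() is parented to the
 * ir3, so a single ir3_destroy() frees the whole IR in one shot:
 *
 *    struct ir3 *ir = ir3_create(compiler, v);
 *    ...build blocks and instructions via ir3_alloc()-backed helpers...
 *    ir3_destroy(ir);
 */
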
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

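/* Worked example for collect_reg_info() above (illustrative only, and
 * assuming the usual num = (regnum << 2) | component packing of
 * ir3_register::num): a full-precision source r2.z with wrmask 0x1 and no
 * (rpt) gives max = (2 << 2 | 2) + 0 + 1 - 1 = 10, and 10 >> 2 = 2, so
 * info->max_reg is raised to at least 2 (r2 is the highest full register
 * touched so far).
 */
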
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gpu_id < 600) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
   FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gen's anyway.
       */
      return false;
   }
}

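/* Illustration of the a6xx heuristic above: with threadsize_base = 64, a
 * fixed 8x8x1 workgroup (64 threads) gains nothing from the doubled
 * threadsize, so we return false; a 16x16x1 workgroup (256 threads) falls
 * through to the fragment-shader check and doubles the threadsize as long
 * as 2 * regs_count still fits in reg_size_vec4.
 */
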
/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* If this is a compute shader, compute the limit based on shared size */
   if (v->type == MESA_SHADER_COMPUTE) {
      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
         unsigned threads_per_wg =
            v->local_size[0] * v->local_size[1] * v->local_size[2];
         unsigned waves_per_wg =
            DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                            (double_threadsize ? 2 : 1) *
                                            compiler->wave_granularity);
         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }
   }

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   return max_waves;
}

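/* Worked example with made-up limits (the real values live in ir3_compiler):
 * local_mem_size = 32K and shared_size = 8K give wgs_per_core = 4; a 256
 * thread workgroup at threadsize_base = 64, wave_granularity = 1 and no
 * doubled threadsize needs waves_per_wg = DIV_ROUND_UP(256, 64) = 4, so the
 * shared-memory limit caps max_waves at 4 * 4 * 1 = 16.
 */
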
/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

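/* Illustrative only (reg_size_vec4 is a per-gen compiler limit): if
 * reg_size_vec4 were 96 and a shader used 8 full vec4 registers, the base
 * threadsize with wave_granularity = 1 would allow 96 / 8 = 12 waves, and
 * doubling the threadsize would halve that to 96 / 16 = 6.
 */
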
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            struct ir3_register *base =
               (instr->opc == OPC_STP) ? instr->srcs[2] : instr->srcs[1];
            if (base->iim_val * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }
         }

         if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         unsigned instrs_count = 1 + instr->repeat + instr->nop;
         unsigned nops_count = instr->nop;

         if (instr->opc == OPC_NOP) {
            nops_count = 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         } else {
            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         }

         if (instr->opc == OPC_MOV) {
            if (instr->cat1.src_type == instr->cat1.dst_type) {
               info->mov_count += 1 + instr->repeat;
            } else {
               info->cov_count += 1 + instr->repeat;
            }
         }

         info->instrs_count += instrs_count;
         info->nops_count += nops_count;

         if (instr->flags & IR3_INSTR_SS) {
            info->ss++;
            info->sstall += sfu_delay;
            sfu_delay = 0;
         }

         if (instr->flags & IR3_INSTR_SY)
            info->sy++;

         if (is_sfu(instr)) {
            sfu_delay = 10;
         } else {
            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
            sfu_delay -= n;
         }
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}

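/* Note on the sstall estimate above (derived from the loop, not a hw spec):
 * an SFU instruction arms sfu_delay with 10 cycles; every following
 * instruction (counting its (rptN)/(nopN) slots) drains it, and if an (ss)
 * flagged instruction shows up while, say, 6 cycles are still outstanding,
 * those 6 cycles are added to info->sstall as estimated sync stalls.
 */
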
static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

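/* Layout of the single ir3_alloc() made in instr_create() above:
 *
 *    [ struct ir3_instruction | dsts[0..ndst-1] | srcs[0..nsrc-1] ]
 *
 * i.e. the dst/src pointer arrays live directly behind the instruction and
 * are freed together with everything else when the ir3 is destroyed.
 */
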
struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

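/* Purely illustrative sketch of the low-level API above (opcode and register
 * numbers are arbitrary): an add with one dst and two srcs could be built as
 *
 *    struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F, 1, 2);
 *    ir3_dst_create(add, regid(0, 0), 0);
 *    ir3_src_create(add, regid(1, 0), 0);
 *    ir3_src_create(add, regid(2, 0), 0);
 *
 * though most callers go through the builder helpers in ir3.h instead.
 */
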
void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      debug_assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      debug_assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */
unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

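/* Illustration of the difference between the two counters: for two blocks
 * with two and one instructions, ir3_count_instructions() assigns ip 1,2,3
 * (block start/end 1..3 and 3..4), while ir3_count_instructions_ra() reserves
 * a slot at each block boundary, giving the instructions ip 2,3,6 and block
 * start/end 1..4 and 5..7.
 */
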
struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted. Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

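/* Typical usage (illustrative): a pass creates a throwaway context with
 * ralloc_context(NULL), calls ir3_find_ssa_uses(ir, mem_ctx, false), walks a
 * def's users with set_foreach(def->uses, entry), and frees the context when
 * done; the uses sets go stale as soon as the IR is modified again.
 */
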
/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types. Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value,
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
      { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
      { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
      { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
      { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
      { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
      { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
      { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
      { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
      { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
      { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
      { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
      { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}

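/* Example: a full-precision immediate of 1.0 (0x3f800000) maps to FLUT index
 * 2, while 3.0 is not in the table, so ir3_flut() returns -1 and the value
 * has to stay an ordinary immediate.
 */
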
static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gpu_id < 600)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      if (instr->opc == OPC_SHLG_B16) {
         valid_flags |= IR3_REG_IMMED;
         /* shlg.b16 can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
      } else {
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_atomic(instr->opc) && (n != 0))
            return false;

         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_LDC:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}
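/* Illustrative note: passes that fold sources (e.g. copy propagation in
 * ir3_cp) are expected to ask ir3_valid_flags(use_instr, src_n, new_flags)
 * first and abandon the fold when it returns false, rather than relying on
 * later legalization to clean up an illegal source.
 */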