GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
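
      /*
       * A minimal usage sketch (illustrative only; the register names are
       * hypothetical): construct a builder for a shader, position it at the
       * end of the instruction list, and emit instructions through it.
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, some_input);
       *    bld.ADD(some_output, tmp, brw_imm_f(1.0f));
       */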

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
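
      /*
       * Illustrative example: starting from a SIMD16 builder, group(8, 1)
       * yields a SIMD8 builder whose instructions act on channels 8..15 of
       * the parent's group, while group(8, 0) covers channels 0..7;
       * quarter(i) below is shorthand for group(8, i).
       */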

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
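
      /*
       * Illustrative sizing example: with a SIMD16 builder,
       * vgrf(BRW_REGISTER_TYPE_F) requests 1 * 4 bytes * 16 channels =
       * 64 bytes, which DIV_ROUND_UP rounds up to two 32-byte GRFs
       * (assuming REG_SIZE == 32).
       */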

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
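
      /*
       * Illustrative use: emit_minmax(dst, a, b, BRW_CONDITIONAL_L) selects
       * the smaller of the two sources (a minimum), while BRW_CONDITIONAL_GE
       * selects the larger (a maximum).
       */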

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
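
      /*
       * In other words, FIND_LIVE_CHANNEL picks the index of an enabled
       * channel and BROADCAST replicates that channel of \p src, so
       * component(dst, 0) can be read as a value that is uniform across the
       * group. An illustrative use (hypothetical names) is making a
       * divergent surface index uniform before emitting a send:
       *
       *    const fs_reg surface = bld.emit_uniformize(divergent_index);
       */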

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
              tmp.type == BRW_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle. Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
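
      /*
       * Illustrative trace: calling emit_scan on a SIMD8 builder with a
       * 32-bit tmp, cluster_size == 4 and BRW_OPCODE_ADD first combines
       * adjacent pairs (tmp[1] += tmp[0], tmp[3] += tmp[2], and so on),
       * then folds channel 1 into channels 2 and 3 and channel 5 into
       * channels 6 and 7, leaving an inclusive scan within each cluster of
       * four channels.
       */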

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *   CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *   CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons. Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
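
      /*
       * Either way the intended result is dst = x * (1 - a) + y * a, i.e.
       * LRP(dst, x, y, a) yields x when a is 0.0 and y when a is 1.0.
       */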

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif