GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs.cpp
1
/*
2
* Copyright © 2010 Intel Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
/** @file brw_fs.cpp
25
*
26
* This file drives the GLSL IR -> LIR translation, contains the
27
* optimizations on the LIR, and drives the generation of native code
28
* from the LIR.
29
*/
30
31
#include "main/macros.h"
32
#include "brw_eu.h"
33
#include "brw_fs.h"
34
#include "brw_fs_live_variables.h"
35
#include "brw_nir.h"
36
#include "brw_vec4_gs_visitor.h"
37
#include "brw_cfg.h"
38
#include "brw_dead_control_flow.h"
39
#include "dev/intel_debug.h"
40
#include "compiler/glsl_types.h"
41
#include "compiler/nir/nir_builder.h"
42
#include "program/prog_parameter.h"
43
#include "util/u_math.h"
44
45
using namespace brw;
46
47
static unsigned get_lowered_simd_width(const struct intel_device_info *devinfo,
48
const fs_inst *inst);
49
50
void
51
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
52
const fs_reg *src, unsigned sources)
53
{
54
memset((void*)this, 0, sizeof(*this));
55
56
this->src = new fs_reg[MAX2(sources, 3)];
57
for (unsigned i = 0; i < sources; i++)
58
this->src[i] = src[i];
59
60
this->opcode = opcode;
61
this->dst = dst;
62
this->sources = sources;
63
this->exec_size = exec_size;
64
this->base_mrf = -1;
65
66
assert(dst.file != IMM && dst.file != UNIFORM);
67
68
assert(this->exec_size != 0);
69
70
this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72
/* This will be the case for almost all instructions. */
73
switch (dst.file) {
74
case VGRF:
75
case ARF:
76
case FIXED_GRF:
77
case MRF:
78
case ATTR:
79
this->size_written = dst.component_size(exec_size);
80
break;
81
case BAD_FILE:
82
this->size_written = 0;
83
break;
84
case IMM:
85
case UNIFORM:
86
unreachable("Invalid destination register file");
87
}
88
89
this->writes_accumulator = false;
90
}
91
92
fs_inst::fs_inst()
93
{
94
init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95
}
96
97
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98
{
99
init(opcode, exec_size, reg_undef, NULL, 0);
100
}
101
102
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
103
{
104
init(opcode, exec_size, dst, NULL, 0);
105
}
106
107
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
108
const fs_reg &src0)
109
{
110
const fs_reg src[1] = { src0 };
111
init(opcode, exec_size, dst, src, 1);
112
}
113
114
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
115
const fs_reg &src0, const fs_reg &src1)
116
{
117
const fs_reg src[2] = { src0, src1 };
118
init(opcode, exec_size, dst, src, 2);
119
}
120
121
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
122
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
123
{
124
const fs_reg src[3] = { src0, src1, src2 };
125
init(opcode, exec_size, dst, src, 3);
126
}
127
128
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
129
const fs_reg src[], unsigned sources)
130
{
131
init(opcode, exec_width, dst, src, sources);
132
}
133
134
fs_inst::fs_inst(const fs_inst &that)
135
{
136
memcpy((void*)this, &that, sizeof(that));
137
138
this->src = new fs_reg[MAX2(that.sources, 3)];
139
140
for (unsigned i = 0; i < that.sources; i++)
141
this->src[i] = that.src[i];
142
}
143
144
fs_inst::~fs_inst()
145
{
146
delete[] this->src;
147
}
148
149
void
150
fs_inst::resize_sources(uint8_t num_sources)
151
{
152
if (this->sources != num_sources) {
153
fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
154
155
for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
156
src[i] = this->src[i];
157
158
delete[] this->src;
159
this->src = src;
160
this->sources = num_sources;
161
}
162
}
163
164
void
165
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
166
const fs_reg &dst,
167
const fs_reg &surf_index,
168
const fs_reg &varying_offset,
169
uint32_t const_offset,
170
uint8_t alignment)
171
{
172
/* We have our constant surface use a pitch of 4 bytes, so our index can
173
* be any component of a vector, and then we load 4 contiguous
174
* components starting from that.
175
*
176
* We break down the const_offset to a portion added to the variable offset
177
* and a portion done using fs_reg::offset, which means that if you have
178
* GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
179
* we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
180
* later notice that those loads are all the same and eliminate the
181
* redundant ones.
182
*/
183
fs_reg vec4_offset = vgrf(glsl_type::uint_type);
184
bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
185
186
/* The pull load message will load a vec4 (16 bytes). If we are loading
187
* a double this means we are only loading 2 elements worth of data.
188
* We also want to use a 32-bit data type for the dst of the load operation
189
* so other parts of the driver don't get confused about the size of the
190
* result.
191
*/
192
fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
193
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
194
vec4_result, surf_index, vec4_offset,
195
brw_imm_ud(alignment));
196
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
197
198
shuffle_from_32bit_read(bld, dst, vec4_result,
199
(const_offset & 0xf) / type_sz(dst.type), 1);
200
}
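/* As a concrete illustration of the split above (just the arithmetic, no
 * additional behaviour): with const_offset == 20 and a float destination,
 * vec4_offset becomes varying_offset + 16 and the shuffle_from_32bit_read()
 * call pulls component (20 & 0xf) / 4 == 1 out of the loaded vec4.
 */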
201
202
/**
203
* A helper for MOV generation for fixing up broken hardware SEND dependency
204
* handling.
205
*/
206
void
207
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
208
{
209
/* The caller always wants uncompressed to emit the minimal extra
210
* dependencies, and to avoid having to deal with aligning its regs to 2.
211
*/
212
const fs_builder ubld = bld.annotate("send dependency resolve")
213
.quarter(0);
214
215
ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
216
}
217
218
bool
219
fs_inst::is_send_from_grf() const
220
{
221
switch (opcode) {
222
case SHADER_OPCODE_SEND:
223
case SHADER_OPCODE_SHADER_TIME_ADD:
224
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
225
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
226
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
227
case SHADER_OPCODE_URB_WRITE_SIMD8:
228
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
229
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
230
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
231
case SHADER_OPCODE_URB_READ_SIMD8:
232
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
233
case SHADER_OPCODE_INTERLOCK:
234
case SHADER_OPCODE_MEMORY_FENCE:
235
case SHADER_OPCODE_BARRIER:
236
return true;
237
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238
return src[1].file == VGRF;
239
case FS_OPCODE_FB_WRITE:
240
case FS_OPCODE_FB_READ:
241
return src[0].file == VGRF;
242
default:
243
if (is_tex())
244
return src[0].file == VGRF;
245
246
return false;
247
}
248
}
249
250
bool
251
fs_inst::is_control_source(unsigned arg) const
252
{
253
switch (opcode) {
254
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
255
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
256
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
257
return arg == 0;
258
259
case SHADER_OPCODE_BROADCAST:
260
case SHADER_OPCODE_SHUFFLE:
261
case SHADER_OPCODE_QUAD_SWIZZLE:
262
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
263
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
264
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
265
case SHADER_OPCODE_GET_BUFFER_SIZE:
266
return arg == 1;
267
268
case SHADER_OPCODE_MOV_INDIRECT:
269
case SHADER_OPCODE_CLUSTER_BROADCAST:
270
case SHADER_OPCODE_TEX:
271
case FS_OPCODE_TXB:
272
case SHADER_OPCODE_TXD:
273
case SHADER_OPCODE_TXF:
274
case SHADER_OPCODE_TXF_LZ:
275
case SHADER_OPCODE_TXF_CMS:
276
case SHADER_OPCODE_TXF_CMS_W:
277
case SHADER_OPCODE_TXF_UMS:
278
case SHADER_OPCODE_TXF_MCS:
279
case SHADER_OPCODE_TXL:
280
case SHADER_OPCODE_TXL_LZ:
281
case SHADER_OPCODE_TXS:
282
case SHADER_OPCODE_LOD:
283
case SHADER_OPCODE_TG4:
284
case SHADER_OPCODE_TG4_OFFSET:
285
case SHADER_OPCODE_SAMPLEINFO:
286
return arg == 1 || arg == 2;
287
288
case SHADER_OPCODE_SEND:
289
return arg == 0 || arg == 1;
290
291
default:
292
return false;
293
}
294
}
295
296
bool
297
fs_inst::is_payload(unsigned arg) const
298
{
299
switch (opcode) {
300
case FS_OPCODE_FB_WRITE:
301
case FS_OPCODE_FB_READ:
302
case SHADER_OPCODE_URB_WRITE_SIMD8:
303
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
304
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
305
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
306
case SHADER_OPCODE_URB_READ_SIMD8:
307
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
308
case VEC4_OPCODE_UNTYPED_ATOMIC:
309
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
310
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
311
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
312
case SHADER_OPCODE_SHADER_TIME_ADD:
313
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
314
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
315
case SHADER_OPCODE_INTERLOCK:
316
case SHADER_OPCODE_MEMORY_FENCE:
317
case SHADER_OPCODE_BARRIER:
318
return arg == 0;
319
320
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
321
return arg == 1;
322
323
case SHADER_OPCODE_SEND:
324
return arg == 2 || arg == 3;
325
326
default:
327
if (is_tex())
328
return arg == 0;
329
else
330
return false;
331
}
332
}
333
334
/**
335
* Returns true if this instruction's sources and destinations cannot
336
* safely be the same register.
337
*
338
* In most cases, a register can be written over safely by the same
339
* instruction that is its last use. For a single instruction, the
340
* sources are dereferenced before writing of the destination starts
341
* (naturally).
342
*
343
* However, there are a few cases where this can be problematic:
344
*
345
* - Virtual opcodes that translate to multiple instructions in the
346
* code generator: if src == dst and one instruction writes the
347
* destination before a later instruction reads the source, then
348
* src will have been clobbered.
349
*
350
* - SIMD16 compressed instructions with certain regioning (see below).
351
*
352
* The register allocator uses this information to set up conflicts between
353
* GRF sources and the destination.
354
*/
355
bool
356
fs_inst::has_source_and_destination_hazard() const
357
{
358
switch (opcode) {
359
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
360
/* Multiple partial writes to the destination */
361
return true;
362
case SHADER_OPCODE_SHUFFLE:
363
/* This instruction returns an arbitrary channel from the source and
364
* gets split into smaller instructions in the generator. It's possible
365
* that one of the instructions will read from a channel corresponding
366
* to an earlier instruction.
367
*/
368
case SHADER_OPCODE_SEL_EXEC:
369
/* This is implemented as
370
*
371
* mov(16) g4<1>D 0D { align1 WE_all 1H };
372
* mov(16) g4<1>D g5<8,8,1>D { align1 1H }
373
*
374
* Because the source is only read in the second instruction, the first
375
* may stomp all over it.
376
*/
377
return true;
378
case SHADER_OPCODE_QUAD_SWIZZLE:
379
switch (src[1].ud) {
380
case BRW_SWIZZLE_XXXX:
381
case BRW_SWIZZLE_YYYY:
382
case BRW_SWIZZLE_ZZZZ:
383
case BRW_SWIZZLE_WWWW:
384
case BRW_SWIZZLE_XXZZ:
385
case BRW_SWIZZLE_YYWW:
386
case BRW_SWIZZLE_XYXY:
387
case BRW_SWIZZLE_ZWZW:
388
/* These can be implemented as a single Align1 region on all
389
* platforms, so there's never a hazard between source and
390
* destination. C.f. fs_generator::generate_quad_swizzle().
391
*/
392
return false;
393
default:
394
return !is_uniform(src[0]);
395
}
396
default:
397
/* The SIMD16 compressed instruction
398
*
399
* add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
400
*
401
* is actually decoded in hardware as:
402
*
403
* add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
404
* add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
405
*
406
* Which is safe. However, if we have uniform accesses
407
* happening, we get into trouble:
408
*
409
* add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
410
* add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
411
*
412
* Now our destination for the first instruction overwrote the
413
* second instruction's src0, and we get garbage for those 8
414
* pixels. There's a similar issue for the pre-gfx6
415
* pixel_x/pixel_y, which are registers of 16-bit values and thus
416
* would get stomped by the first decode as well.
417
*/
418
if (exec_size == 16) {
419
for (int i = 0; i < sources; i++) {
420
if (src[i].file == VGRF && (src[i].stride == 0 ||
421
src[i].type == BRW_REGISTER_TYPE_UW ||
422
src[i].type == BRW_REGISTER_TYPE_W ||
423
src[i].type == BRW_REGISTER_TYPE_UB ||
424
src[i].type == BRW_REGISTER_TYPE_B)) {
425
return true;
426
}
427
}
428
}
429
return false;
430
}
431
}
432
433
bool
434
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
435
{
436
if (devinfo->ver == 6 && is_math())
437
return false;
438
439
if (is_send_from_grf())
440
return false;
441
442
/* From Wa_1604601757:
443
*
444
* "When multiplying a DW and any lower precision integer, source modifier
445
* is not supported."
446
*/
447
if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
448
opcode == BRW_OPCODE_MAD)) {
449
const brw_reg_type exec_type = get_exec_type(this);
450
const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
451
MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
452
MIN2(type_sz(src[0].type), type_sz(src[1].type));
453
454
if (brw_reg_type_is_integer(exec_type) &&
455
type_sz(exec_type) >= 4 &&
456
type_sz(exec_type) != min_type_sz)
457
return false;
458
}
459
460
if (!backend_instruction::can_do_source_mods())
461
return false;
462
463
return true;
464
}
465
466
bool
467
fs_inst::can_do_cmod()
468
{
469
if (!backend_instruction::can_do_cmod())
470
return false;
471
472
/* The accumulator result appears to get used for the conditional modifier
473
* generation. When negating a UD value, there is a 33rd bit generated for
474
* the sign in the accumulator value, so now you can't check, for example,
475
* equality with a 32-bit value. See piglit fs-op-neg-uvec4.
476
*/
477
for (unsigned i = 0; i < sources; i++) {
478
if (type_is_unsigned_int(src[i].type) && src[i].negate)
479
return false;
480
}
481
482
return true;
483
}
484
485
bool
486
fs_inst::can_change_types() const
487
{
488
return dst.type == src[0].type &&
489
!src[0].abs && !src[0].negate && !saturate &&
490
(opcode == BRW_OPCODE_MOV ||
491
(opcode == BRW_OPCODE_SEL &&
492
dst.type == src[1].type &&
493
predicate != BRW_PREDICATE_NONE &&
494
!src[1].abs && !src[1].negate));
495
}
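/* Rationale for the checks above: saturation and source modifiers are
 * evaluated in the instruction's type, so a saturating MOV or one with
 * abs/negate on its source cannot simply be reinterpreted in another type
 * without changing its result.
 */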
496
497
void
498
fs_reg::init()
499
{
500
memset((void*)this, 0, sizeof(*this));
501
type = BRW_REGISTER_TYPE_UD;
502
stride = 1;
503
}
504
505
/** Generic unset register constructor. */
506
fs_reg::fs_reg()
507
{
508
init();
509
this->file = BAD_FILE;
510
}
511
512
fs_reg::fs_reg(struct ::brw_reg reg) :
513
backend_reg(reg)
514
{
515
this->offset = 0;
516
this->stride = 1;
517
if (this->file == IMM &&
518
(this->type != BRW_REGISTER_TYPE_V &&
519
this->type != BRW_REGISTER_TYPE_UV &&
520
this->type != BRW_REGISTER_TYPE_VF)) {
521
this->stride = 0;
522
}
523
}
524
525
bool
526
fs_reg::equals(const fs_reg &r) const
527
{
528
return (this->backend_reg::equals(r) &&
529
stride == r.stride);
530
}
531
532
bool
533
fs_reg::negative_equals(const fs_reg &r) const
534
{
535
return (this->backend_reg::negative_equals(r) &&
536
stride == r.stride);
537
}
538
539
bool
540
fs_reg::is_contiguous() const
541
{
542
switch (file) {
543
case ARF:
544
case FIXED_GRF:
545
return hstride == BRW_HORIZONTAL_STRIDE_1 &&
546
vstride == width + hstride;
547
case MRF:
548
case VGRF:
549
case ATTR:
550
return stride == 1;
551
case UNIFORM:
552
case IMM:
553
case BAD_FILE:
554
return true;
555
}
556
557
unreachable("Invalid register file");
558
}
559
560
unsigned
561
fs_reg::component_size(unsigned width) const
562
{
563
const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
564
hstride == 0 ? 0 :
565
1 << (hstride - 1));
566
return MAX2(width * stride, 1) * type_sz(type);
567
}
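/* For example, a VGRF of BRW_REGISTER_TYPE_F with stride 1 spans
 * 8 * 1 * 4 = 32 bytes per component at SIMD8, i.e. one full GRF, while a
 * stride-0 scalar of the same type is clamped by the MAX2() to 4 bytes.
 */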
568
569
/**
570
* Create a MOV to read the timestamp register.
571
*/
572
fs_reg
573
fs_visitor::get_timestamp(const fs_builder &bld)
574
{
575
assert(devinfo->ver >= 7);
576
577
fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
578
BRW_ARF_TIMESTAMP,
579
0),
580
BRW_REGISTER_TYPE_UD));
581
582
fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
583
584
/* We want to read the 3 fields we care about even if it's not enabled in
585
* the dispatch.
586
*/
587
bld.group(4, 0).exec_all().MOV(dst, ts);
588
589
return dst;
590
}
591
592
void
593
fs_visitor::emit_shader_time_begin()
594
{
595
/* We want only the low 32 bits of the timestamp. Since it's running
596
* at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
597
* which is plenty of time for our purposes. It is identical across the
598
* EUs, but since it's tracking GPU core speed it will increment at a
599
* varying rate as render P-states change.
600
*/
601
shader_start_time = component(
602
get_timestamp(bld.annotate("shader time start")), 0);
603
}
604
605
void
606
fs_visitor::emit_shader_time_end()
607
{
608
/* Insert our code just before the final SEND with EOT. */
609
exec_node *end = this->instructions.get_tail();
610
assert(end && ((fs_inst *) end)->eot);
611
const fs_builder ibld = bld.annotate("shader time end")
612
.exec_all().at(NULL, end);
613
const fs_reg timestamp = get_timestamp(ibld);
614
615
/* We only use the low 32 bits of the timestamp - see
616
* emit_shader_time_begin().
617
*
618
* We could also check if render P-states have changed (or anything
619
* else that might disrupt timing) by setting smear to 2 and checking if
620
* that field is != 0.
621
*/
622
const fs_reg shader_end_time = component(timestamp, 0);
623
624
/* Check that there weren't any timestamp reset events (assuming these
625
* were the only two timestamp reads that happened).
626
*/
627
const fs_reg reset = component(timestamp, 2);
628
set_condmod(BRW_CONDITIONAL_Z,
629
ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
630
ibld.IF(BRW_PREDICATE_NORMAL);
631
632
fs_reg start = shader_start_time;
633
start.negate = true;
634
const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
635
BRW_REGISTER_TYPE_UD),
636
0);
637
const fs_builder cbld = ibld.group(1, 0);
638
cbld.group(1, 0).ADD(diff, start, shader_end_time);
639
640
/* If there were no instructions between the two timestamp gets, the diff
641
* is 2 cycles. Remove that overhead, so I can forget about that when
642
* trying to determine the time taken for single instructions.
643
*/
644
cbld.ADD(diff, diff, brw_imm_ud(-2u));
645
SHADER_TIME_ADD(cbld, 0, diff);
646
SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
647
ibld.emit(BRW_OPCODE_ELSE);
648
SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
649
ibld.emit(BRW_OPCODE_ENDIF);
650
}
651
652
void
653
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
654
int shader_time_subindex,
655
fs_reg value)
656
{
657
int index = shader_time_index * 3 + shader_time_subindex;
658
struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
659
660
fs_reg payload;
661
if (dispatch_width == 8)
662
payload = vgrf(glsl_type::uvec2_type);
663
else
664
payload = vgrf(glsl_type::uint_type);
665
666
bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
667
}
668
669
void
670
fs_visitor::vfail(const char *format, va_list va)
671
{
672
char *msg;
673
674
if (failed)
675
return;
676
677
failed = true;
678
679
msg = ralloc_vasprintf(mem_ctx, format, va);
680
msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
681
dispatch_width, stage_abbrev, msg);
682
683
this->fail_msg = msg;
684
685
if (unlikely(debug_enabled)) {
686
fprintf(stderr, "%s", msg);
687
}
688
}
689
690
void
691
fs_visitor::fail(const char *format, ...)
692
{
693
va_list va;
694
695
va_start(va, format);
696
vfail(format, va);
697
va_end(va);
698
}
699
700
/**
701
* Mark this program as impossible to compile with dispatch width greater
702
* than n.
703
*
704
* During the SIMD8 compile (which happens first), we can detect and flag
705
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
706
* SIMD16+ compile altogether.
707
*
708
* During a compile of dispatch width greater than n (if one happens anyway),
709
* this just calls fail().
710
*/
711
void
712
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
713
{
714
if (dispatch_width > n) {
715
fail("%s", msg);
716
} else {
717
max_dispatch_width = MIN2(max_dispatch_width, n);
718
compiler->shader_perf_log(log_data,
719
"Shader dispatch width limited to SIMD%d: %s",
720
n, msg);
721
}
722
}
723
724
/**
725
* Returns true if the instruction has a flag that means it won't
726
* update an entire destination register.
727
*
728
* For example, dead code elimination and live variable analysis want to know
729
* when a write to a variable screens off any preceding values that were in
730
* it.
731
*/
732
bool
733
fs_inst::is_partial_write() const
734
{
735
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
736
(this->exec_size * type_sz(this->dst.type)) < 32 ||
737
!this->dst.is_contiguous() ||
738
this->dst.offset % REG_SIZE != 0);
739
}
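/* For instance, an unpredicated SIMD8 MOV to a packed, register-aligned
 * VGRF of a 16-bit type writes only 8 * 2 = 16 of the 32 bytes checked
 * above, so it counts as a partial write, while the same MOV with a
 * 32-bit destination type covers the whole register.
 */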
740
741
unsigned
742
fs_inst::components_read(unsigned i) const
743
{
744
/* Return zero if the source is not present. */
745
if (src[i].file == BAD_FILE)
746
return 0;
747
748
switch (opcode) {
749
case FS_OPCODE_LINTERP:
750
if (i == 0)
751
return 2;
752
else
753
return 1;
754
755
case FS_OPCODE_PIXEL_X:
756
case FS_OPCODE_PIXEL_Y:
757
assert(i < 2);
758
if (i == 0)
759
return 2;
760
else
761
return 1;
762
763
case FS_OPCODE_FB_WRITE_LOGICAL:
764
assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
765
/* First/second FB write color. */
766
if (i < 2)
767
return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
768
else
769
return 1;
770
771
case SHADER_OPCODE_TEX_LOGICAL:
772
case SHADER_OPCODE_TXD_LOGICAL:
773
case SHADER_OPCODE_TXF_LOGICAL:
774
case SHADER_OPCODE_TXL_LOGICAL:
775
case SHADER_OPCODE_TXS_LOGICAL:
776
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
777
case FS_OPCODE_TXB_LOGICAL:
778
case SHADER_OPCODE_TXF_CMS_LOGICAL:
779
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
780
case SHADER_OPCODE_TXF_UMS_LOGICAL:
781
case SHADER_OPCODE_TXF_MCS_LOGICAL:
782
case SHADER_OPCODE_LOD_LOGICAL:
783
case SHADER_OPCODE_TG4_LOGICAL:
784
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
785
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
786
assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
787
src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
788
/* Texture coordinates. */
789
if (i == TEX_LOGICAL_SRC_COORDINATE)
790
return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
791
/* Texture derivatives. */
792
else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
793
opcode == SHADER_OPCODE_TXD_LOGICAL)
794
return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
795
/* Texture offset. */
796
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
797
return 2;
798
/* MCS */
799
else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
800
return 2;
801
else
802
return 1;
803
804
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
805
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
806
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
807
/* Surface coordinates. */
808
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
809
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
810
/* Surface operation source (ignored for reads). */
811
else if (i == SURFACE_LOGICAL_SRC_DATA)
812
return 0;
813
else
814
return 1;
815
816
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
817
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
818
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
819
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
820
/* Surface coordinates. */
821
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
822
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
823
/* Surface operation source. */
824
else if (i == SURFACE_LOGICAL_SRC_DATA)
825
return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
826
else
827
return 1;
828
829
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
830
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
831
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
832
assert(src[2].file == IMM);
833
return 1;
834
835
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
836
assert(src[2].file == IMM);
837
if (i == 1) { /* data to write */
838
const unsigned comps = src[2].ud / exec_size;
839
assert(comps > 0);
840
return comps;
841
} else {
842
return 1;
843
}
844
845
case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
846
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
847
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
848
return 1;
849
850
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
851
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
852
if (i == SURFACE_LOGICAL_SRC_DATA) {
853
const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
854
assert(comps > 0);
855
return comps;
856
} else {
857
return 1;
858
}
859
860
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
861
assert(src[2].file == IMM);
862
return i == 1 ? src[2].ud : 1;
863
864
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
865
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
866
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
867
assert(src[2].file == IMM);
868
if (i == 1) {
869
/* Data source */
870
const unsigned op = src[2].ud;
871
switch (op) {
872
case BRW_AOP_INC:
873
case BRW_AOP_DEC:
874
case BRW_AOP_PREDEC:
875
return 0;
876
case BRW_AOP_CMPWR:
877
return 2;
878
default:
879
return 1;
880
}
881
} else {
882
return 1;
883
}
884
885
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
886
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
887
assert(src[2].file == IMM);
888
if (i == 1) {
889
/* Data source */
890
const unsigned op = src[2].ud;
891
return op == BRW_AOP_FCMPWR ? 2 : 1;
892
} else {
893
return 1;
894
}
895
896
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
897
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
898
/* Scattered logical opcodes use the following params:
899
* src[0] Surface coordinates
900
* src[1] Surface operation source (ignored for reads)
901
* src[2] Surface
902
* src[3] IMM with always 1 dimension.
903
* src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
904
*/
905
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
906
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
907
return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
908
909
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
910
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
911
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
912
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
913
return 1;
914
915
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
916
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
917
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
918
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
919
const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
920
/* Surface coordinates. */
921
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
922
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
923
/* Surface operation source. */
924
else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
925
return 2;
926
else if (i == SURFACE_LOGICAL_SRC_DATA &&
927
(op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
928
return 0;
929
else
930
return 1;
931
}
932
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
933
return (i == 0 ? 2 : 1);
934
935
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
936
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
937
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
938
const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
939
/* Surface coordinates. */
940
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
941
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
942
/* Surface operation source. */
943
else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
944
return 2;
945
else
946
return 1;
947
}
948
949
default:
950
return 1;
951
}
952
}
953
954
unsigned
955
fs_inst::size_read(int arg) const
956
{
957
switch (opcode) {
958
case SHADER_OPCODE_SEND:
959
if (arg == 2) {
960
return mlen * REG_SIZE;
961
} else if (arg == 3) {
962
return ex_mlen * REG_SIZE;
963
}
964
break;
965
966
case FS_OPCODE_FB_WRITE:
967
case FS_OPCODE_REP_FB_WRITE:
968
if (arg == 0) {
969
if (base_mrf >= 0)
970
return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
971
else
972
return mlen * REG_SIZE;
973
}
974
break;
975
976
case FS_OPCODE_FB_READ:
977
case SHADER_OPCODE_URB_WRITE_SIMD8:
978
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
979
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
980
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
981
case SHADER_OPCODE_URB_READ_SIMD8:
982
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
983
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
984
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
985
if (arg == 0)
986
return mlen * REG_SIZE;
987
break;
988
989
case FS_OPCODE_SET_SAMPLE_ID:
990
if (arg == 1)
991
return 1;
992
break;
993
994
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
995
/* The payload is actually stored in src1 */
996
if (arg == 1)
997
return mlen * REG_SIZE;
998
break;
999
1000
case FS_OPCODE_LINTERP:
1001
if (arg == 1)
1002
return 16;
1003
break;
1004
1005
case SHADER_OPCODE_LOAD_PAYLOAD:
1006
if (arg < this->header_size)
1007
return REG_SIZE;
1008
break;
1009
1010
case CS_OPCODE_CS_TERMINATE:
1011
case SHADER_OPCODE_BARRIER:
1012
return REG_SIZE;
1013
1014
case SHADER_OPCODE_MOV_INDIRECT:
1015
if (arg == 0) {
1016
assert(src[2].file == IMM);
1017
return src[2].ud;
1018
}
1019
break;
1020
1021
default:
1022
if (is_tex() && arg == 0 && src[0].file == VGRF)
1023
return mlen * REG_SIZE;
1024
break;
1025
}
1026
1027
switch (src[arg].file) {
1028
case UNIFORM:
1029
case IMM:
1030
return components_read(arg) * type_sz(src[arg].type);
1031
case BAD_FILE:
1032
case ARF:
1033
case FIXED_GRF:
1034
case VGRF:
1035
case ATTR:
1036
return components_read(arg) * src[arg].component_size(exec_size);
1037
case MRF:
1038
unreachable("MRF registers are not allowed as sources");
1039
}
1040
return 0;
1041
}
1042
1043
namespace {
1044
unsigned
1045
predicate_width(brw_predicate predicate)
1046
{
1047
switch (predicate) {
1048
case BRW_PREDICATE_NONE: return 1;
1049
case BRW_PREDICATE_NORMAL: return 1;
1050
case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
1051
case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
1052
case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
1053
case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
1054
case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
1055
case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
1056
case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
1057
case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
1058
case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
1059
case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
1060
default: unreachable("Unsupported predicate");
1061
}
1062
}
1063
1064
/* Return the subset of flag registers that an instruction could
1065
* potentially read or write based on the execution controls and flag
1066
* subregister number of the instruction.
1067
*/
1068
unsigned
1069
flag_mask(const fs_inst *inst, unsigned width)
1070
{
1071
assert(util_is_power_of_two_nonzero(width));
1072
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
1073
~(width - 1);
1074
const unsigned end = start + ALIGN(inst->exec_size, width);
1075
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1076
}
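/* For example, a SIMD16 instruction with flag_subreg == 1, group == 0 and
 * width 1 yields start == 16 and end == 32, i.e. bytes 2 and 3 of the
 * flag ARF (f0.1), so the returned mask is 0xc.
 */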
1077
1078
unsigned
1079
bit_mask(unsigned n)
1080
{
1081
return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1082
}
1083
1084
unsigned
1085
flag_mask(const fs_reg &r, unsigned sz)
1086
{
1087
if (r.file == ARF) {
1088
const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
1089
const unsigned end = start + sz;
1090
return bit_mask(end) & ~bit_mask(start);
1091
} else {
1092
return 0;
1093
}
1094
}
1095
}
1096
1097
unsigned
1098
fs_inst::flags_read(const intel_device_info *devinfo) const
1099
{
1100
if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
1101
predicate == BRW_PREDICATE_ALIGN1_ALLV) {
1102
/* The vertical predication modes combine corresponding bits from
1103
* f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
1104
*/
1105
const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
1106
return flag_mask(this, 1) << shift | flag_mask(this, 1);
1107
} else if (predicate) {
1108
return flag_mask(this, predicate_width(predicate));
1109
} else {
1110
unsigned mask = 0;
1111
for (int i = 0; i < sources; i++) {
1112
mask |= flag_mask(src[i], size_read(i));
1113
}
1114
return mask;
1115
}
1116
}
1117
1118
unsigned
1119
fs_inst::flags_written(const intel_device_info *devinfo) const
1120
{
1121
/* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
1122
* using a separate cmpn and sel instruction. This lowering occurs in
1123
* fs_visitor::lower_minmax which is called very, very late.
1124
*/
1125
if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
1126
opcode != BRW_OPCODE_CSEL &&
1127
opcode != BRW_OPCODE_IF &&
1128
opcode != BRW_OPCODE_WHILE)) ||
1129
opcode == FS_OPCODE_FB_WRITE) {
1130
return flag_mask(this, 1);
1131
} else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1132
opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
1133
return flag_mask(this, 32);
1134
} else {
1135
return flag_mask(dst, size_written);
1136
}
1137
}
1138
1139
/**
1140
* Returns how many MRFs an FS opcode will write over.
1141
*
1142
* Note that this is not the 0 or 1 implied writes in an actual gen
1143
* instruction -- the FS opcodes often generate MOVs in addition.
1144
*/
1145
unsigned
1146
fs_inst::implied_mrf_writes() const
1147
{
1148
if (mlen == 0)
1149
return 0;
1150
1151
if (base_mrf == -1)
1152
return 0;
1153
1154
switch (opcode) {
1155
case SHADER_OPCODE_RCP:
1156
case SHADER_OPCODE_RSQ:
1157
case SHADER_OPCODE_SQRT:
1158
case SHADER_OPCODE_EXP2:
1159
case SHADER_OPCODE_LOG2:
1160
case SHADER_OPCODE_SIN:
1161
case SHADER_OPCODE_COS:
1162
return 1 * exec_size / 8;
1163
case SHADER_OPCODE_POW:
1164
case SHADER_OPCODE_INT_QUOTIENT:
1165
case SHADER_OPCODE_INT_REMAINDER:
1166
return 2 * exec_size / 8;
1167
case SHADER_OPCODE_TEX:
1168
case FS_OPCODE_TXB:
1169
case SHADER_OPCODE_TXD:
1170
case SHADER_OPCODE_TXF:
1171
case SHADER_OPCODE_TXF_CMS:
1172
case SHADER_OPCODE_TXF_MCS:
1173
case SHADER_OPCODE_TG4:
1174
case SHADER_OPCODE_TG4_OFFSET:
1175
case SHADER_OPCODE_TXL:
1176
case SHADER_OPCODE_TXS:
1177
case SHADER_OPCODE_LOD:
1178
case SHADER_OPCODE_SAMPLEINFO:
1179
return 1;
1180
case FS_OPCODE_FB_WRITE:
1181
case FS_OPCODE_REP_FB_WRITE:
1182
return src[0].file == BAD_FILE ? 0 : 2;
1183
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1184
case SHADER_OPCODE_GFX4_SCRATCH_READ:
1185
return 1;
1186
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1187
return mlen;
1188
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1189
return mlen;
1190
default:
1191
unreachable("not reached");
1192
}
1193
}
1194
1195
fs_reg
1196
fs_visitor::vgrf(const glsl_type *const type)
1197
{
1198
int reg_width = dispatch_width / 8;
1199
return fs_reg(VGRF,
1200
alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1201
brw_type_for_base_type(type));
1202
}
1203
1204
fs_reg::fs_reg(enum brw_reg_file file, int nr)
1205
{
1206
init();
1207
this->file = file;
1208
this->nr = nr;
1209
this->type = BRW_REGISTER_TYPE_F;
1210
this->stride = (file == UNIFORM ? 0 : 1);
1211
}
1212
1213
fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
1214
{
1215
init();
1216
this->file = file;
1217
this->nr = nr;
1218
this->type = type;
1219
this->stride = (file == UNIFORM ? 0 : 1);
1220
}
1221
1222
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1223
* This brings in those uniform definitions
1224
*/
1225
void
1226
fs_visitor::import_uniforms(fs_visitor *v)
1227
{
1228
this->push_constant_loc = v->push_constant_loc;
1229
this->pull_constant_loc = v->pull_constant_loc;
1230
this->uniforms = v->uniforms;
1231
this->subgroup_id = v->subgroup_id;
1232
for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
1233
this->group_size[i] = v->group_size[i];
1234
}
1235
1236
void
1237
fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
1238
{
1239
assert(stage == MESA_SHADER_FRAGMENT);
1240
1241
/* gl_FragCoord.x */
1242
bld.MOV(wpos, this->pixel_x);
1243
wpos = offset(wpos, bld, 1);
1244
1245
/* gl_FragCoord.y */
1246
bld.MOV(wpos, this->pixel_y);
1247
wpos = offset(wpos, bld, 1);
1248
1249
/* gl_FragCoord.z */
1250
if (devinfo->ver >= 6) {
1251
bld.MOV(wpos, this->pixel_z);
1252
} else {
1253
bld.emit(FS_OPCODE_LINTERP, wpos,
1254
this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
1255
component(interp_reg(VARYING_SLOT_POS, 2), 0));
1256
}
1257
wpos = offset(wpos, bld, 1);
1258
1259
/* gl_FragCoord.w: Already set up in emit_interpolation */
1260
bld.MOV(wpos, this->wpos_w);
1261
}
1262
1263
enum brw_barycentric_mode
1264
brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
1265
{
1266
/* Barycentric modes don't make sense for flat inputs. */
1267
assert(mode != INTERP_MODE_FLAT);
1268
1269
unsigned bary;
1270
switch (op) {
1271
case nir_intrinsic_load_barycentric_pixel:
1272
case nir_intrinsic_load_barycentric_at_offset:
1273
bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1274
break;
1275
case nir_intrinsic_load_barycentric_centroid:
1276
bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1277
break;
1278
case nir_intrinsic_load_barycentric_sample:
1279
case nir_intrinsic_load_barycentric_at_sample:
1280
bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1281
break;
1282
default:
1283
unreachable("invalid intrinsic");
1284
}
1285
1286
if (mode == INTERP_MODE_NOPERSPECTIVE)
1287
bary += 3;
1288
1289
return (enum brw_barycentric_mode) bary;
1290
}
1291
1292
/**
1293
* Turn one of the two CENTROID barycentric modes into PIXEL mode.
1294
*/
1295
static enum brw_barycentric_mode
1296
centroid_to_pixel(enum brw_barycentric_mode bary)
1297
{
1298
assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1299
bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1300
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1301
}
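/* Both helpers above rely on the layout of enum brw_barycentric_mode: the
 * NONPERSPECTIVE modes sit three entries after their PERSPECTIVE
 * counterparts (hence "bary += 3"), and each CENTROID mode immediately
 * follows its PIXEL mode (hence the "- 1" here).
 */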
1302
1303
fs_reg *
1304
fs_visitor::emit_frontfacing_interpolation()
1305
{
1306
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1307
1308
if (devinfo->ver >= 12) {
1309
fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
1310
1311
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
1312
bld.ASR(tmp, g1, brw_imm_d(15));
1313
bld.NOT(*reg, tmp);
1314
} else if (devinfo->ver >= 6) {
1315
/* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1316
* a boolean result from this (~0/true or 0/false).
1317
*
1318
* We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1319
* this task in only one instruction:
1320
* - a negation source modifier will flip the bit; and
1321
* - a W -> D type conversion will sign extend the bit into the high
1322
* word of the destination.
1323
*
1324
* An ASR 15 fills the low word of the destination.
1325
*/
1326
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1327
g0.negate = true;
1328
1329
bld.ASR(*reg, g0, brw_imm_d(15));
1330
} else {
1331
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1332
* a boolean result from this (1/true or 0/false).
1333
*
1334
* Like in the above case, since the bit is the MSB of g1.6:UD we can use
1335
* the negation source modifier to flip it. Unfortunately the SHR
1336
* instruction only operates on UD (or D with an abs source modifier)
1337
* sources without negation.
1338
*
1339
* Instead, use ASR (which will give ~0/true or 0/false).
1340
*/
1341
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1342
g1_6.negate = true;
1343
1344
bld.ASR(*reg, g1_6, brw_imm_d(31));
1345
}
1346
1347
return reg;
1348
}
1349
1350
void
1351
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1352
{
1353
assert(stage == MESA_SHADER_FRAGMENT);
1354
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1355
assert(dst.type == BRW_REGISTER_TYPE_F);
1356
1357
if (wm_prog_data->persample_dispatch) {
1358
/* Convert int_sample_pos to floating point */
1359
bld.MOV(dst, int_sample_pos);
1360
/* Scale to the range [0, 1] */
1361
bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1362
}
1363
else {
1364
/* From ARB_sample_shading specification:
1365
* "When rendering to a non-multisample buffer, or if multisample
1366
* rasterization is disabled, gl_SamplePosition will always be
1367
* (0.5, 0.5).
1368
*/
1369
bld.MOV(dst, brw_imm_f(0.5f));
1370
}
1371
}
1372
1373
fs_reg *
1374
fs_visitor::emit_samplepos_setup()
1375
{
1376
assert(devinfo->ver >= 6);
1377
1378
const fs_builder abld = bld.annotate("compute sample position");
1379
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1380
fs_reg pos = *reg;
1381
fs_reg int_sample_x = vgrf(glsl_type::int_type);
1382
fs_reg int_sample_y = vgrf(glsl_type::int_type);
1383
1384
/* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1385
* mode will be enabled.
1386
*
1387
* From the Ivy Bridge PRM, volume 2 part 1, page 344:
1388
* R31.1:0 Position Offset X/Y for Slot[3:0]
1389
* R31.3:2 Position Offset X/Y for Slot[7:4]
1390
* .....
1391
*
1392
* The X, Y sample positions come in as bytes in thread payload. So, read
1393
* the positions using vstride=16, width=8, hstride=2.
1394
*/
1395
const fs_reg sample_pos_reg =
1396
fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
1397
1398
/* Compute gl_SamplePosition.x */
1399
abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
1400
compute_sample_position(offset(pos, abld, 0), int_sample_x);
1401
1402
/* Compute gl_SamplePosition.y */
1403
abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
1404
compute_sample_position(offset(pos, abld, 1), int_sample_y);
1405
return reg;
1406
}
1407
1408
fs_reg *
1409
fs_visitor::emit_sampleid_setup()
1410
{
1411
assert(stage == MESA_SHADER_FRAGMENT);
1412
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1413
assert(devinfo->ver >= 6);
1414
1415
const fs_builder abld = bld.annotate("compute sample id");
1416
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
1417
1418
if (!key->multisample_fbo) {
1419
/* As per GL_ARB_sample_shading specification:
1420
* "When rendering to a non-multisample buffer, or if multisample
1421
* rasterization is disabled, gl_SampleID will always be zero."
1422
*/
1423
abld.MOV(*reg, brw_imm_d(0));
1424
} else if (devinfo->ver >= 8) {
1425
/* Sample ID comes in as 4-bit numbers in g1.0:
1426
*
1427
* 15:12 Slot 3 SampleID (only used in SIMD16)
1428
* 11:8 Slot 2 SampleID (only used in SIMD16)
1429
* 7:4 Slot 1 SampleID
1430
* 3:0 Slot 0 SampleID
1431
*
1432
* Each slot corresponds to four channels, so we want to replicate each
1433
* half-byte value to 4 channels in a row:
1434
*
1435
* dst+0: .7 .6 .5 .4 .3 .2 .1 .0
1436
* 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
1437
*
1438
* dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
1439
* 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
1440
*
1441
* First, we read g1.0 with a <1,8,0>UB region, causing the first 8
1442
* channels to read the first byte (7:0), and the second group of 8
1443
* channels to read the second byte (15:8). Then, we shift right by
1444
* a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
1445
* values into place. Finally, we AND with 0xf to keep the low nibble.
1446
*
1447
* shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
1448
* and(16) dst<1>D tmp<8,8,1>W 0xf:W
1449
*
1450
* TODO: These payload bits exist on Gfx7 too, but they appear to always
1451
* be zero, so this code fails to work. We should find out why.
1452
*/
1453
const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
1454
1455
for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
1456
const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
1457
hbld.SHR(offset(tmp, hbld, i),
1458
stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
1459
1, 8, 0),
1460
brw_imm_v(0x44440000));
1461
}
1462
1463
abld.AND(*reg, tmp, brw_imm_w(0xf));
1464
} else {
1465
const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
1466
const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
1467
1468
/* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1469
* 8x multisampling, subspan 0 will represent sample N (where N
1470
* is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1471
* 7. We can find the value of N by looking at R0.0 bits 7:6
1472
* ("Starting Sample Pair Index (SSPI)") and multiplying by two
1473
* (since samples are always delivered in pairs). That is, we
1474
* compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1475
* we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1476
* case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1477
* 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1478
* populating a temporary variable with the sequence (0, 1, 2, 3),
1479
* and then reading from it using vstride=1, width=4, hstride=0.
1480
* These computations hold good for 4x multisampling as well.
1481
*
1482
* For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1483
* the first four slots are sample 0 of subspan 0; the next four
1484
* are sample 1 of subspan 0; the third group is sample 0 of
1485
* subspan 1, and finally sample 1 of subspan 1.
1486
*/
1487
1488
/* SKL+ has an extra bit for the Starting Sample Pair Index to
1489
* accommodate 16x MSAA.
1490
*/
1491
abld.exec_all().group(1, 0)
1492
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1493
brw_imm_ud(0xc0));
1494
abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1495
1496
/* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we
1497
* can assume 4x MSAA. Disallow it on IVB+
1498
*
1499
* FINISHME: One day, we could come up with a way to do this that
1500
* actually works on gfx7.
1501
*/
1502
if (devinfo->ver >= 7)
1503
limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
1504
abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));
1505
1506
/* This special instruction takes care of setting vstride=1,
1507
* width=4, hstride=0 of t2 during an ADD instruction.
1508
*/
1509
abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1510
}
1511
1512
return reg;
1513
}
1514
1515
fs_reg *
1516
fs_visitor::emit_samplemaskin_setup()
1517
{
1518
assert(stage == MESA_SHADER_FRAGMENT);
1519
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1520
assert(devinfo->ver >= 6);
1521
1522
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1523
1524
/* The HW doesn't provide us with expected values. */
1525
assert(!wm_prog_data->per_coarse_pixel_dispatch);
1526
1527
fs_reg coverage_mask =
1528
fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
1529
1530
if (wm_prog_data->persample_dispatch) {
1531
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
1532
* and a mask representing which sample is being processed by the
1533
* current shader invocation.
1534
*
1535
* From the OES_sample_variables specification:
1536
* "When per-sample shading is active due to the use of a fragment input
1537
* qualified by "sample" or due to the use of the gl_SampleID or
1538
* gl_SamplePosition variables, only the bit for the current sample is
1539
* set in gl_SampleMaskIn."
1540
*/
1541
const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
1542
1543
if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
1544
nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
1545
1546
fs_reg one = vgrf(glsl_type::int_type);
1547
fs_reg enabled_mask = vgrf(glsl_type::int_type);
1548
abld.MOV(one, brw_imm_d(1));
1549
abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
1550
abld.AND(*reg, enabled_mask, coverage_mask);
1551
} else {
1552
/* In per-pixel mode, the coverage mask is sufficient. */
1553
*reg = coverage_mask;
1554
}
1555
return reg;
1556
}
1557
1558
fs_reg *
1559
fs_visitor::emit_shading_rate_setup()
1560
{
1561
assert(devinfo->ver >= 11);
1562
1563
const fs_builder abld = bld.annotate("compute fragment shading rate");
1564
1565
fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD));
1566
1567
struct brw_wm_prog_data *wm_prog_data =
1568
brw_wm_prog_data(bld.shader->stage_prog_data);
1569
1570
/* Coarse pixel shading size fields overlap with other fields of not in
1571
* coarse pixel dispatch mode, so report 0 when that's not the case.
1572
*/
1573
if (wm_prog_data->per_coarse_pixel_dispatch) {
1574
/* The shading rates provided in the shader are the actual 2D shading
1575
* rate while the SPIR-V built-in is the enum value that has the shading
1576
* rate encoded as a bitfield. Fortunately, the bitfield value is just
1577
* the shading rate divided by two and shifted.
1578
*/
1579
1580
/* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
1581
fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
1582
/* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
1583
fs_reg actual_y = byte_offset(actual_x, 1);
1584
1585
fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
1586
fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
1587
1588
abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
1589
abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
1590
abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
1591
abld.OR(*reg, int_rate_x, int_rate_y);
1592
} else {
1593
abld.MOV(*reg, brw_imm_ud(0));
1594
}
1595
1596
return reg;
1597
}
1598
1599
fs_reg
1600
fs_visitor::resolve_source_modifiers(const fs_reg &src)
1601
{
1602
if (!src.abs && !src.negate)
1603
return src;
1604
1605
fs_reg temp = bld.vgrf(src.type);
1606
bld.MOV(temp, src);
1607
1608
return temp;
1609
}
1610
1611
void
1612
fs_visitor::emit_gs_thread_end()
1613
{
1614
assert(stage == MESA_SHADER_GEOMETRY);
1615
1616
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1617
1618
if (gs_compile->control_data_header_size_bits > 0) {
1619
emit_gs_control_data_bits(this->final_gs_vertex_count);
1620
}
1621
1622
const fs_builder abld = bld.annotate("thread end");
1623
fs_inst *inst;
1624
1625
if (gs_prog_data->static_vertex_count != -1) {
1626
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1627
if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1628
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1629
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1630
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1631
prev->eot = true;
1632
1633
/* Delete now dead instructions. */
1634
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1635
if (dead == prev)
1636
break;
1637
dead->remove();
1638
}
1639
return;
1640
} else if (prev->is_control_flow() || prev->has_side_effects()) {
1641
break;
1642
}
1643
}
1644
fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1645
abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1646
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1647
inst->mlen = 1;
1648
} else {
1649
fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1650
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1651
sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1652
sources[1] = this->final_gs_vertex_count;
1653
abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1654
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1655
inst->mlen = 2;
1656
}
1657
inst->eot = true;
1658
inst->offset = 0;
1659
}
1660
1661
void
1662
fs_visitor::assign_curb_setup()
1663
{
1664
unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1665
1666
unsigned ubo_push_length = 0;
1667
unsigned ubo_push_start[4];
1668
for (int i = 0; i < 4; i++) {
1669
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1670
ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1671
}
1672
1673
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1674
1675
uint64_t used = 0;
1676
1677
if (stage == MESA_SHADER_COMPUTE &&
1678
brw_cs_prog_data(prog_data)->uses_inline_data) {
1679
/* With COMPUTE_WALKER, we can push up to one register worth of data via
1680
* the inline data parameter in the COMPUTE_WALKER command itself.
1681
*
1682
* TODO: Support inline data and push at the same time.
1683
*/
1684
assert(devinfo->verx10 >= 125);
1685
assert(uniform_push_length <= 1);
1686
} else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) {
1687
fs_builder ubld = bld.exec_all().group(8, 0).at(
1688
cfg->first_block(), cfg->first_block()->start());
1689
1690
/* The base address for our push data is passed in as R0.0[31:6]. We
1691
* have to mask off the bottom 6 bits.
1692
*/
1693
fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1694
ubld.group(1, 0).AND(base_addr,
1695
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
1696
brw_imm_ud(INTEL_MASK(31, 6)));
1697
1698
fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1699
ubld.MOV(header0, brw_imm_ud(0));
1700
ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));
1701
1702
/* On Gfx12-HP we load constants at the start of the program using A32
1703
* stateless messages.
1704
*/
1705
for (unsigned i = 0; i < uniform_push_length;) {
1706
/* Limit ourselves to the HW limit of 8 Owords (8 * 16 bytes = 128 bytes
1707
* or 4 registers).
1708
*/
1709
unsigned num_regs = MIN2(uniform_push_length - i, 4);
1710
assert(num_regs > 0);
1711
num_regs = 1 << util_logbase2(num_regs);
1712
1713
fs_reg header;
1714
if (i == 0) {
1715
header = header0;
1716
} else {
1717
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1718
ubld.MOV(header, brw_imm_ud(0));
1719
ubld.group(1, 0).ADD(component(header, 2),
1720
component(header0, 2),
1721
brw_imm_ud(i * 2));
1722
}
1723
1724
fs_reg srcs[4] = {
1725
brw_imm_ud(0), /* desc */
1726
brw_imm_ud(0), /* ex_desc */
1727
header, /* payload */
1728
fs_reg(), /* payload2 */
1729
};
1730
1731
fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
1732
BRW_REGISTER_TYPE_UD);
1733
1734
/* This instruction has to be run SIMD16 if we're filling more than a
1735
* single register.
1736
*/
1737
unsigned send_width = MIN2(16, num_regs * 8);
1738
1739
fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
1740
dest, srcs, 4);
1741
send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1742
send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
1743
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1744
BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
1745
send->header_size = 1;
1746
send->mlen = 1;
1747
send->size_written = num_regs * REG_SIZE;
1748
send->send_is_volatile = true;
1749
1750
i += num_regs;
1751
}
1752
1753
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1754
}
1755
1756
/* Map the offsets in the UNIFORM file to fixed HW regs. */
1757
foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758
for (unsigned int i = 0; i < inst->sources; i++) {
1759
if (inst->src[i].file == UNIFORM) {
1760
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1761
int constant_nr;
1762
if (inst->src[i].nr >= UBO_START) {
1763
/* constant_nr is in 32-bit units, the rest are in bytes */
1764
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1765
inst->src[i].offset / 4;
1766
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1767
constant_nr = push_constant_loc[uniform_nr];
1768
} else {
1769
/* Section 5.11 of the OpenGL 4.1 spec says:
1770
* "Out-of-bounds reads return undefined values, which include
1771
* values from other variables of the active program or zero."
1772
* Just return the first push constant.
1773
*/
1774
constant_nr = 0;
1775
}
1776
1777
assert(constant_nr / 8 < 64);
1778
used |= BITFIELD64_BIT(constant_nr / 8);
1779
1780
struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1781
constant_nr / 8,
1782
constant_nr % 8);
1783
brw_reg.abs = inst->src[i].abs;
1784
brw_reg.negate = inst->src[i].negate;
1785
1786
assert(inst->src[i].stride == 0);
1787
inst->src[i] = byte_offset(
1788
retype(brw_reg, inst->src[i].type),
1789
inst->src[i].offset % 4);
1790
}
1791
}
1792
}
1793
1794
uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1795
if (want_zero) {
1796
assert(!compiler->compact_params);
1797
fs_builder ubld = bld.exec_all().group(8, 0).at(
1798
cfg->first_block(), cfg->first_block()->start());
1799
1800
/* push_reg_mask_param is in 32-bit units */
1801
unsigned mask_param = stage_prog_data->push_reg_mask_param;
1802
struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
1803
mask_param % 8);
1804
1805
fs_reg b32;
1806
for (unsigned i = 0; i < 64; i++) {
1807
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1808
fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
1809
ubld.SHL(horiz_offset(shifted, 8),
1810
byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
1811
brw_imm_v(0x01234567));
1812
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1813
1814
fs_builder ubld16 = ubld.group(16, 0);
1815
b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
1816
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1817
}
1818
1819
if (want_zero & BITFIELD64_BIT(i)) {
1820
assert(i < prog_data->curb_read_length);
1821
struct brw_reg push_reg =
1822
retype(brw_vec8_grf(payload.num_regs + i, 0),
1823
BRW_REGISTER_TYPE_D);
1824
1825
ubld.AND(push_reg, push_reg, component(b32, i % 16));
1826
}
1827
}
1828
1829
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1830
}
1831
1832
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1833
this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1834
}

/*
 * Build up an array of indices into the urb_setup array that
 * references the active entries of the urb_setup array.
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}
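
/* As a quick illustration with made-up values: if only
 * urb_setup[VARYING_SLOT_POS] == 0 and urb_setup[VARYING_SLOT_TEX0] == 1 are
 * active (every other entry is -1), the loop above yields
 *
 *    urb_setup_attribs       == { VARYING_SLOT_POS, VARYING_SLOT_TEX0 }
 *    urb_setup_attribs_count == 2
 *
 * so later uploads only walk those two entries instead of all
 * VARYING_SLOT_MAX slots.
 */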

static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir)
{
   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->ver >= 6) {
      if (util_bitcount64(nir->info.inputs_read &
                          BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */

         /* Re-compute the VUE map here in the case that the one coming from
          * geometry has more than one position slot (used for Primitive
          * Replication).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader, 1);

         int first_slot =
            brw_compute_first_urb_slot_required(nir->info.inputs_read,
                                                &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
   prog_data->inputs = nir->info.inputs_read;

   brw_compute_urb_setup_index(prog_data);
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR regs in the FS are in units of logical scalar inputs each
             * of which consumes half of a GRF register.
             */
            assert(inst->src[i].offset < REG_SIZE / 2);
            const unsigned grf = urb_start + inst->src[i].nr / 2;
            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
                                    inst->src[i].offset;
            const unsigned width = inst->src[i].stride == 0 ?
                                   1 : MIN2(inst->exec_size, 8);
            struct brw_reg reg = stride(
               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                           offset),
               width * inst->src[i].stride,
               width, inst->src[i].stride);
            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}

void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].nr +
                   inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               type_sz(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}

void
fs_visitor::assign_vs_urb_setup()
{
   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

   assert(stage == MESA_SHADER_VERTEX);

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tcs_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tes_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_gs_urb_setup()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf +=
      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* Rewrite all ATTR file references to GRFs. */
      convert_attr_sources_to_hw_regs(inst);
   }
}
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   /* Compact the register file so we eliminate dead vgrfs.  This
    * only defines split points for live registers, so if we have
    * too large dead registers they will hit assertions later.
    */
   compact_virtual_grfs();

   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         /* UNDEF instructions are currently only used to undef entire
          * registers.  We need this invariant later when we split them.
          */
         assert(inst->dst.file == VGRF);
         assert(inst->dst.offset == 0);
         assert(inst->size_written == alloc.sizes[inst->dst.nr] * REG_SIZE);
         continue;
      }

      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int *new_virtual_grf = new int[reg_count];
   int *new_reg_offset = new int[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         const fs_builder ibld(this, block, inst);
         assert(inst->size_written % REG_SIZE == 0);
         unsigned reg_offset = 0;
         while (reg_offset < inst->size_written / REG_SIZE) {
            reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
            ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
            reg_offset += alloc.sizes[new_virtual_grf[reg]];
         }
         inst->remove(block);
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         inst->dst.nr = new_virtual_grf[reg];
         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                            inst->dst.offset % REG_SIZE;
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);

   delete[] split_points;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int *remap_table = new int[this->alloc.count];
   memset(remap_table, -1, this->alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

static int
get_subgroup_id_param_index(const intel_device_info *devinfo,
                            const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Struct for handling complex alignments.
 *
 * A complex alignment is stored as multiplier and an offset.  A value is
 * considered to be aligned if it is {offset} larger than a multiple of {mul}.
 * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
 * following:
 *
 *  N  | cplx_align_apply({8, 2}, N)
 * ----+-----------------------------
 *  4  | 6
 *  6  | 6
 *  8  | 14
 *  10 | 14
 *  12 | 14
 *  14 | 14
 *  16 | 22
 */
struct cplx_align {
   unsigned mul:4;
   unsigned offset:4;
};

#define CPLX_ALIGN_MAX_MUL 8

static void
cplx_align_assert_sane(struct cplx_align a)
{
   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
   assert(a.offset < a.mul);
}

/**
 * Combines two alignments to produce a least multiple of sorts.
 *
 * The returned alignment is the smallest (in terms of multiplier) such that
 * anything aligned to both a and b will be aligned to the new alignment.
 * This function will assert-fail if a and b are not compatible, i.e. if the
 * offset parameters are such that no common alignment is possible.
 */
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
   cplx_align_assert_sane(a);
   cplx_align_assert_sane(b);

   /* Assert that the alignments agree. */
   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));

   return a.mul > b.mul ? a : b;
}
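
/* A short worked example for cplx_align_combine() above, using made-up
 * alignments: combining {4, 0} with {8, 4} passes the compatibility assert
 * because (0 & 7) == (4 & 3) == 0, and returns {8, 4}; anything that is both
 * a multiple of 4 and 4 more than a multiple of 8 is exactly the set of
 * values that are 4 more than a multiple of 8.
 */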

/**
 * Apply a complex alignment
 *
 * This function will return the smallest number greater than or equal to
 * offset that is aligned to align.
 */
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
   return ALIGN(offset - align.offset, align.mul) + align.offset;
}

#define UNIFORM_SLOT_SIZE 4

struct uniform_slot_info {
   /** True if the given uniform slot is live */
   unsigned is_live:1;

   /** True if this slot and the next slot must remain contiguous */
   unsigned contiguous:1;

   struct cplx_align align;
};

static void
mark_uniform_slots_read(struct uniform_slot_info *slots,
                        unsigned num_slots, unsigned alignment)
{
   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
   assert(alignment <= CPLX_ALIGN_MAX_MUL);

   /* We can't align a slot to anything less than the slot size */
   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);

   struct cplx_align align = {alignment, 0};
   cplx_align_assert_sane(align);

   for (unsigned i = 0; i < num_slots; i++) {
      slots[i].is_live = true;
      if (i < num_slots - 1)
         slots[i].contiguous = true;

      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
      if (slots[i].align.mul == 0) {
         slots[i].align = align;
      } else {
         slots[i].align = cplx_align_combine(slots[i].align, align);
      }
   }
}
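
/* A concrete (hypothetical) call of the helper above, assuming the slots
 * array starts out zeroed: mark_uniform_slots_read(&slots[u], 2, 8) marks
 * slots u and u + 1 live, sets slots[u].contiguous, and stores an alignment
 * of {8, 0} for slot u and {8, 4} for slot u + 1, since the second slot
 * begins UNIFORM_SLOT_SIZE bytes into the 8-byte-aligned pair.
 */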

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile gets to decide on locations. */
   if (push_constant_loc) {
      assert(pull_constant_loc);
      return;
   }

   if (compiler->compact_params) {
      struct uniform_slot_info slots[uniforms + 1];
      memset(slots, 0, sizeof(slots));

      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
         for (int i = 0 ; i < inst->sources; i++) {
            if (inst->src[i].file != UNIFORM)
               continue;

            /* NIR tightly packs things so the uniform number might not be
             * aligned (if we have a double right after a float, for
             * instance).  This is fine because the process of re-arranging
             * them will ensure that things are properly aligned.  The offset
             * into that uniform, however, must be aligned.
             *
             * In Vulkan, we have explicit offsets but everything is crammed
             * into a single "variable" so inst->src[i].nr will always be 0.
             * Everything will be properly aligned relative to that one base.
             */
            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);

            unsigned u = inst->src[i].nr +
                         inst->src[i].offset / UNIFORM_SLOT_SIZE;

            if (u >= uniforms)
               continue;

            unsigned slots_read;
            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
            } else {
               unsigned bytes_read = inst->components_read(i) *
                                     type_sz(inst->src[i].type);
               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
            }

            assert(u + slots_read <= uniforms);
            mark_uniform_slots_read(&slots[u], slots_read,
                                    type_sz(inst->src[i].type));
         }
      }

      int subgroup_id_index = get_subgroup_id_param_index(devinfo,
                                                          stage_prog_data);

      /* Only allow 16 registers (128 uniform components) as push constants.
       *
       * Just demote the end of the list.  We could probably do better
       * here, demoting things that are rarely used in the program first.
       *
       * If changing this value, note the limitation about total_regs in
       * brw_curbe.c.
       */
      unsigned int max_push_components = 16 * 8;
      if (subgroup_id_index >= 0)
         max_push_components--; /* Save a slot for the thread ID */

      /* We push small arrays, but no bigger than 16 floats.  This is big
       * enough for a vec4 but hopefully not large enough to push out other
       * stuff.  We should probably use a better heuristic at some point.
       */
      const unsigned int max_chunk_size = 16;

      unsigned int num_push_constants = 0;
      unsigned int num_pull_constants = 0;

      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

      /* Default to -1 meaning no location */
      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));

      int chunk_start = -1;
      struct cplx_align align;
      for (unsigned u = 0; u < uniforms; u++) {
         if (!slots[u].is_live) {
            assert(chunk_start == -1);
            continue;
         }

         /* Skip subgroup_id_index to put it in the last push register. */
         if (subgroup_id_index == (int)u)
            continue;

         if (chunk_start == -1) {
            chunk_start = u;
            align = slots[u].align;
         } else {
            /* Offset into the chunk */
            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;

            /* Shift the slot alignment down by the chunk offset so it is
             * comparable with the base chunk alignment.
             */
            struct cplx_align slot_align = slots[u].align;
            slot_align.offset =
               (slot_align.offset - chunk_offset) & (align.mul - 1);

            align = cplx_align_combine(align, slot_align);
         }

         /* Sanity check the alignment */
         cplx_align_assert_sane(align);

         if (slots[u].contiguous)
            continue;

         /* Adjust the alignment to be in terms of slots, not bytes */
         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
         align.mul /= UNIFORM_SLOT_SIZE;
         align.offset /= UNIFORM_SLOT_SIZE;

         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
         unsigned chunk_size = u - chunk_start + 1;
         if ((!compiler->supports_pull_constants && u < UBO_START) ||
             (chunk_size < max_chunk_size &&
              push_start_align + chunk_size <= max_push_components)) {
            /* Align up the number of push constants */
            num_push_constants = push_start_align;
            for (unsigned i = 0; i < chunk_size; i++)
               push_constant_loc[chunk_start + i] = num_push_constants++;
         } else {
            /* We need to pull this one */
            num_pull_constants = cplx_align_apply(align, num_pull_constants);
            for (unsigned i = 0; i < chunk_size; i++)
               pull_constant_loc[chunk_start + i] = num_pull_constants++;
         }

         /* Reset the chunk and start again */
         chunk_start = -1;
      }

      /* Add the CS local thread ID uniform at the end of the push constants */
      if (subgroup_id_index >= 0)
         push_constant_loc[subgroup_id_index] = num_push_constants++;

      /* As the uniforms are going to be reordered, stash the old array and
       * create two new arrays for push/pull params.
       */
      uint32_t *param = stage_prog_data->param;
      stage_prog_data->nr_params = num_push_constants;
      if (num_push_constants) {
         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
                                                num_push_constants);
      } else {
         stage_prog_data->param = NULL;
      }
      assert(stage_prog_data->nr_pull_params == 0);
      assert(stage_prog_data->pull_param == NULL);
      if (num_pull_constants > 0) {
         stage_prog_data->nr_pull_params = num_pull_constants;
         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
                                                     num_pull_constants);
      }

      /* Up until now, the param[] array has been indexed by reg + offset
       * of UNIFORM registers.  Move pull constants into pull_param[] and
       * condense param[] to only contain the uniforms we chose to push.
       *
       * NOTE: Because we are condensing the params[] array, we know that
       * push_constant_loc[i] <= i and we can do it in one smooth loop without
       * having to make a copy.
       */
      for (unsigned int i = 0; i < uniforms; i++) {
         uint32_t value = param[i];
         if (pull_constant_loc[i] != -1) {
            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
         } else if (push_constant_loc[i] != -1) {
            stage_prog_data->param[push_constant_loc[i]] = value;
         }
      }
      ralloc_free(param);
   } else {
      /* If we don't want to compact anything, just set up dummy push/pull
       * arrays.  All the rest of the compiler cares about are these arrays.
       */
      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

      for (unsigned u = 0; u < uniforms; u++)
         push_constant_loc[u] = u;

      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
   }

   /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
    */
   /* For gen4/5:
    * Only allow 16 registers (128 uniform components) as push constants.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c/crocus_state.c
    */
   const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
   for (int i = 0; i < 4; i++) {
      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      if (push_length + range->length > max_push_length)
         range->length = max_push_length - push_length;

      push_length += range->length;
   }
   assert(push_length <= max_push_length);
}

bool
fs_visitor::get_pull_locs(const fs_reg &src,
                          unsigned *out_surf_index,
                          unsigned *out_pull_index)
{
   assert(src.file == UNIFORM);

   if (src.nr >= UBO_START) {
      const struct brw_ubo_range *range =
         &prog_data->ubo_ranges[src.nr - UBO_START];

      /* If this access is in our (reduced) range, use the push data. */
      if (src.offset / 32 < range->length)
         return false;

      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
      *out_pull_index = (32 * range->start + src.offset) / 4;

      prog_data->has_ubo_pull = true;
      return true;
   }

   const unsigned location = src.nr + src.offset / 4;

   if (location < uniforms && pull_constant_loc[location] != -1) {
      /* A regular uniform push constant */
      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
      *out_pull_index = pull_constant_loc[location];

      prog_data->has_ubo_pull = true;
      return true;
   }

   return false;
}

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::lower_constant_loads()
{
   unsigned index, pull_index;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(this, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         const unsigned base = pull_index * 4;

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {

         if (!get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                    brw_imm_ud(index),
                                    inst->src[1],
                                    pull_index * 4, 4);
         inst->remove(block);
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
2774
2775
bool
2776
fs_visitor::opt_algebraic()
2777
{
2778
bool progress = false;
2779
2780
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2781
switch (inst->opcode) {
2782
case BRW_OPCODE_MOV:
2783
if (!devinfo->has_64bit_float &&
2784
!devinfo->has_64bit_int &&
2785
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
2786
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2787
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2788
assert(inst->dst.type == inst->src[0].type);
2789
assert(!inst->saturate);
2790
assert(!inst->src[0].abs);
2791
assert(!inst->src[0].negate);
2792
const brw::fs_builder ibld(this, block, inst);
2793
2794
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2795
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
2796
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2797
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));
2798
2799
inst->remove(block);
2800
progress = true;
2801
}
2802
2803
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2804
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2805
inst->dst.is_null() &&
2806
(inst->src[0].abs || inst->src[0].negate)) {
2807
inst->src[0].abs = false;
2808
inst->src[0].negate = false;
2809
progress = true;
2810
break;
2811
}
2812
2813
if (inst->src[0].file != IMM)
2814
break;
2815
2816
if (inst->saturate) {
2817
/* Full mixed-type saturates don't happen. However, we can end up
2818
* with things like:
2819
*
2820
* mov.sat(8) g21<1>DF -1F
2821
*
2822
* Other mixed-size-but-same-base-type cases may also be possible.
2823
*/
2824
if (inst->dst.type != inst->src[0].type &&
2825
inst->dst.type != BRW_REGISTER_TYPE_DF &&
2826
inst->src[0].type != BRW_REGISTER_TYPE_F)
2827
assert(!"unimplemented: saturate mixed types");
2828
2829
if (brw_saturate_immediate(inst->src[0].type,
2830
&inst->src[0].as_brw_reg())) {
2831
inst->saturate = false;
2832
progress = true;
2833
}
2834
}
2835
break;
2836
2837
case BRW_OPCODE_MUL:
2838
if (inst->src[1].file != IMM)
2839
continue;
2840
2841
/* a * 1.0 = a */
2842
if (inst->src[1].is_one()) {
2843
inst->opcode = BRW_OPCODE_MOV;
2844
inst->src[1] = reg_undef;
2845
progress = true;
2846
break;
2847
}
2848
2849
/* a * -1.0 = -a */
2850
if (inst->src[1].is_negative_one()) {
2851
inst->opcode = BRW_OPCODE_MOV;
2852
inst->src[0].negate = !inst->src[0].negate;
2853
inst->src[1] = reg_undef;
2854
progress = true;
2855
break;
2856
}
2857
2858
break;
2859
case BRW_OPCODE_ADD:
2860
if (inst->src[1].file != IMM)
2861
continue;
2862
2863
if (brw_reg_type_is_integer(inst->src[1].type) &&
2864
inst->src[1].is_zero()) {
2865
inst->opcode = BRW_OPCODE_MOV;
2866
inst->src[1] = reg_undef;
2867
progress = true;
2868
break;
2869
}
2870
2871
if (inst->src[0].file == IMM) {
2872
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2873
inst->opcode = BRW_OPCODE_MOV;
2874
inst->src[0].f += inst->src[1].f;
2875
inst->src[1] = reg_undef;
2876
progress = true;
2877
break;
2878
}
2879
break;
2880
case BRW_OPCODE_OR:
2881
if (inst->src[0].equals(inst->src[1]) ||
2882
inst->src[1].is_zero()) {
2883
/* On Gfx8+, the OR instruction can have a source modifier that
2884
* performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
2885
* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2886
*/
2887
if (inst->src[0].negate) {
2888
inst->opcode = BRW_OPCODE_NOT;
2889
inst->src[0].negate = false;
2890
} else {
2891
inst->opcode = BRW_OPCODE_MOV;
2892
}
2893
inst->src[1] = reg_undef;
2894
progress = true;
2895
break;
2896
}
2897
break;
2898
case BRW_OPCODE_CMP:
2899
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2900
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2901
inst->src[1].is_zero() &&
2902
(inst->src[0].abs || inst->src[0].negate)) {
2903
inst->src[0].abs = false;
2904
inst->src[0].negate = false;
2905
progress = true;
2906
break;
2907
}
2908
break;
2909
case BRW_OPCODE_SEL:
2910
if (!devinfo->has_64bit_float &&
2911
!devinfo->has_64bit_int &&
2912
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
2913
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2914
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2915
assert(inst->dst.type == inst->src[0].type);
2916
assert(!inst->saturate);
2917
assert(!inst->src[0].abs && !inst->src[0].negate);
2918
assert(!inst->src[1].abs && !inst->src[1].negate);
2919
const brw::fs_builder ibld(this, block, inst);
2920
2921
set_predicate(inst->predicate,
2922
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2923
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
2924
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
2925
set_predicate(inst->predicate,
2926
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2927
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
2928
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));
2929
2930
inst->remove(block);
2931
progress = true;
2932
}
2933
if (inst->src[0].equals(inst->src[1])) {
2934
inst->opcode = BRW_OPCODE_MOV;
2935
inst->src[1] = reg_undef;
2936
inst->predicate = BRW_PREDICATE_NONE;
2937
inst->predicate_inverse = false;
2938
progress = true;
2939
} else if (inst->saturate && inst->src[1].file == IMM) {
2940
switch (inst->conditional_mod) {
2941
case BRW_CONDITIONAL_LE:
2942
case BRW_CONDITIONAL_L:
2943
switch (inst->src[1].type) {
2944
case BRW_REGISTER_TYPE_F:
2945
if (inst->src[1].f >= 1.0f) {
2946
inst->opcode = BRW_OPCODE_MOV;
2947
inst->src[1] = reg_undef;
2948
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2949
progress = true;
2950
}
2951
break;
2952
default:
2953
break;
2954
}
2955
break;
2956
case BRW_CONDITIONAL_GE:
2957
case BRW_CONDITIONAL_G:
2958
switch (inst->src[1].type) {
2959
case BRW_REGISTER_TYPE_F:
2960
if (inst->src[1].f <= 0.0f) {
2961
inst->opcode = BRW_OPCODE_MOV;
2962
inst->src[1] = reg_undef;
2963
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2964
progress = true;
2965
}
2966
break;
2967
default:
2968
break;
2969
}
2970
default:
2971
break;
2972
}
2973
}
2974
break;
2975
case BRW_OPCODE_MAD:
2976
if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
2977
inst->src[1].type != BRW_REGISTER_TYPE_F ||
2978
inst->src[2].type != BRW_REGISTER_TYPE_F)
2979
break;
2980
if (inst->src[1].is_one()) {
2981
inst->opcode = BRW_OPCODE_ADD;
2982
inst->src[1] = inst->src[2];
2983
inst->src[2] = reg_undef;
2984
progress = true;
2985
} else if (inst->src[2].is_one()) {
2986
inst->opcode = BRW_OPCODE_ADD;
2987
inst->src[2] = reg_undef;
2988
progress = true;
2989
}
2990
break;
2991
case SHADER_OPCODE_BROADCAST:
2992
if (is_uniform(inst->src[0])) {
2993
inst->opcode = BRW_OPCODE_MOV;
2994
inst->sources = 1;
2995
inst->force_writemask_all = true;
2996
progress = true;
2997
} else if (inst->src[1].file == IMM) {
2998
inst->opcode = BRW_OPCODE_MOV;
2999
/* It's possible that the selected component will be too large and
3000
* overflow the register. This can happen if someone does a
3001
* readInvocation() from GLSL or SPIR-V and provides an OOB
3002
* invocationIndex. If this happens and we some how manage
3003
* to constant fold it in and get here, then component() may cause
3004
* us to start reading outside of the VGRF which will lead to an
3005
* assert later. Instead, just let it wrap around if it goes over
3006
* exec_size.
3007
*/
3008
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
3009
inst->src[0] = component(inst->src[0], comp);
3010
inst->sources = 1;
3011
inst->force_writemask_all = true;
3012
progress = true;
3013
}
3014
break;
3015
3016
case SHADER_OPCODE_SHUFFLE:
3017
if (is_uniform(inst->src[0])) {
3018
inst->opcode = BRW_OPCODE_MOV;
3019
inst->sources = 1;
3020
progress = true;
3021
} else if (inst->src[1].file == IMM) {
3022
inst->opcode = BRW_OPCODE_MOV;
3023
inst->src[0] = component(inst->src[0],
3024
inst->src[1].ud);
3025
inst->sources = 1;
3026
progress = true;
3027
}
3028
break;
3029
3030
default:
3031
break;
3032
}
3033
3034
/* Swap if src[0] is immediate. */
3035
if (progress && inst->is_commutative()) {
3036
if (inst->src[0].file == IMM) {
3037
fs_reg tmp = inst->src[1];
3038
inst->src[1] = inst->src[0];
3039
inst->src[0] = tmp;
3040
}
3041
}
3042
}
3043
3044
if (progress)
3045
invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
3046
DEPENDENCY_INSTRUCTION_DETAIL);
3047
3048
return progress;
3049
}
3050
3051
/**
3052
* Optimize sample messages that have constant zero values for the trailing
3053
* texture coordinates. We can just reduce the message length for these
3054
* instructions instead of reserving a register for it. Trailing parameters
3055
* that aren't sent default to zero anyway. This will cause the dead code
3056
* eliminator to remove the MOV instruction that would otherwise be emitted to
3057
* set up the zero value.
3058
*/
3059
bool
3060
fs_visitor::opt_zero_samples()
3061
{
3062
/* Gfx4 infers the texturing opcode based on the message length so we can't
3063
* change it. Gfx12.5 has restrictions on the number of coordinate
3064
* parameters that have to be provided for some texture types
3065
* (Wa_14013363432).
3066
*/
3067
if (devinfo->ver < 5 || devinfo->verx10 == 125)
3068
return false;
3069
3070
bool progress = false;
3071
3072
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3073
if (!inst->is_tex())
3074
continue;
3075
3076
fs_inst *load_payload = (fs_inst *) inst->prev;
3077
3078
if (load_payload->is_head_sentinel() ||
3079
load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3080
continue;
3081
3082
/* We don't want to remove the message header or the first parameter.
3083
* Removing the first parameter is not allowed, see the Haswell PRM
3084
* volume 7, page 149:
3085
*
3086
* "Parameter 0 is required except for the sampleinfo message, which
3087
* has no parameter 0"
3088
*/
3089
while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
3090
load_payload->src[(inst->mlen - inst->header_size) /
3091
(inst->exec_size / 8) +
3092
inst->header_size - 1].is_zero()) {
3093
inst->mlen -= inst->exec_size / 8;
3094
progress = true;
3095
}
3096
}
3097
3098
if (progress)
3099
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3100
3101
return progress;
3102
}
3103
3104
bool
3105
fs_visitor::opt_register_renaming()
3106
{
3107
bool progress = false;
3108
int depth = 0;
3109
3110
unsigned remap[alloc.count];
3111
memset(remap, ~0u, sizeof(unsigned) * alloc.count);
3112
3113
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3114
if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
3115
depth++;
3116
} else if (inst->opcode == BRW_OPCODE_ENDIF ||
3117
inst->opcode == BRW_OPCODE_WHILE) {
3118
depth--;
3119
}
3120
3121
/* Rewrite instruction sources. */
3122
for (int i = 0; i < inst->sources; i++) {
3123
if (inst->src[i].file == VGRF &&
3124
remap[inst->src[i].nr] != ~0u &&
3125
remap[inst->src[i].nr] != inst->src[i].nr) {
3126
inst->src[i].nr = remap[inst->src[i].nr];
3127
progress = true;
3128
}
3129
}
3130
3131
const unsigned dst = inst->dst.nr;
3132
3133
if (depth == 0 &&
3134
inst->dst.file == VGRF &&
3135
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
3136
!inst->is_partial_write()) {
3137
if (remap[dst] == ~0u) {
3138
remap[dst] = dst;
3139
} else {
3140
remap[dst] = alloc.allocate(regs_written(inst));
3141
inst->dst.nr = remap[dst];
3142
progress = true;
3143
}
3144
} else if (inst->dst.file == VGRF &&
3145
remap[dst] != ~0u &&
3146
remap[dst] != dst) {
3147
inst->dst.nr = remap[dst];
3148
progress = true;
3149
}
3150
}
3151
3152
if (progress) {
3153
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
3154
DEPENDENCY_VARIABLES);
3155
3156
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
3157
if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
3158
delta_xy[i].nr = remap[delta_xy[i].nr];
3159
}
3160
}
3161
}
3162
3163
return progress;
3164
}
3165
3166
/**
3167
* Remove redundant or useless halts.
3168
*
3169
* For example, we can eliminate halts in the following sequence:
3170
*
3171
* halt (redundant with the next halt)
3172
* halt (useless; jumps to the next instruction)
3173
* halt-target
3174
*/
3175
bool
3176
fs_visitor::opt_redundant_halt()
3177
{
3178
bool progress = false;
3179
3180
unsigned halt_count = 0;
3181
fs_inst *halt_target = NULL;
3182
bblock_t *halt_target_block = NULL;
3183
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3184
if (inst->opcode == BRW_OPCODE_HALT)
3185
halt_count++;
3186
3187
if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
3188
halt_target = inst;
3189
halt_target_block = block;
3190
break;
3191
}
3192
}
3193
3194
if (!halt_target) {
3195
assert(halt_count == 0);
3196
return false;
3197
}
3198
3199
/* Delete any HALTs immediately before the halt target. */
3200
for (fs_inst *prev = (fs_inst *) halt_target->prev;
3201
!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
3202
prev = (fs_inst *) halt_target->prev) {
3203
prev->remove(halt_target_block);
3204
halt_count--;
3205
progress = true;
3206
}
3207
3208
if (halt_count == 0) {
3209
halt_target->remove(halt_target_block);
3210
progress = true;
3211
}
3212
3213
if (progress)
3214
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3215
3216
return progress;
3217
}
3218
3219
/**
3220
* Compute a bitmask with GRF granularity with a bit set for each GRF starting
3221
* from \p r.offset which overlaps the region starting at \p s.offset and
3222
* spanning \p ds bytes.
3223
*/
3224
static inline unsigned
3225
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
3226
{
3227
const int rel_offset = reg_offset(s) - reg_offset(r);
3228
const int shift = rel_offset / REG_SIZE;
3229
const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3230
assert(reg_space(r) == reg_space(s) &&
3231
shift >= 0 && shift < int(8 * sizeof(unsigned)));
3232
return ((1 << n) - 1) << shift;
3233
}
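
/* A concrete (hypothetical) case for mask_relative_to() above, with
 * REG_SIZE == 32: if s starts exactly one GRF after r and the region spans
 * ds == 40 bytes, then rel_offset == 32, shift == 1 and
 * n == DIV_ROUND_UP(0 + 40, 32) == 2, so the returned mask is 0b110, i.e.
 * the second and third GRFs counted from r.offset.
 */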
3234
3235
bool
3236
fs_visitor::compute_to_mrf()
3237
{
3238
bool progress = false;
3239
int next_ip = 0;
3240
3241
/* No MRFs on Gen >= 7. */
3242
if (devinfo->ver >= 7)
3243
return false;
3244
3245
const fs_live_variables &live = live_analysis.require();
3246
3247
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3248
int ip = next_ip;
3249
next_ip++;
3250
3251
if (inst->opcode != BRW_OPCODE_MOV ||
3252
inst->is_partial_write() ||
3253
inst->dst.file != MRF || inst->src[0].file != VGRF ||
3254
inst->dst.type != inst->src[0].type ||
3255
inst->src[0].abs || inst->src[0].negate ||
3256
!inst->src[0].is_contiguous() ||
3257
inst->src[0].offset % REG_SIZE != 0)
3258
continue;
3259
3260
/* Can't compute-to-MRF this GRF if someone else was going to
3261
* read it later.
3262
*/
3263
if (live.vgrf_end[inst->src[0].nr] > ip)
3264
continue;
3265
3266
/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
3267
* things that computed the value of all GRFs of the source region. The
3268
* regs_left bitset keeps track of the registers we haven't yet found a
3269
* generating instruction for.
3270
*/
3271
unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
3272
3273
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3274
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3275
inst->src[0], inst->size_read(0))) {
3276
/* Found the last thing to write our reg we want to turn
3277
* into a compute-to-MRF.
3278
*/
3279
3280
/* If this one instruction didn't populate all the
3281
* channels, bail. We might be able to rewrite everything
3282
* that writes that reg, but it would require smarter
3283
* tracking.
3284
*/
3285
if (scan_inst->is_partial_write())
3286
break;
3287
3288
/* Handling things not fully contained in the source of the copy
3289
* would need us to understand coalescing out more than one MOV at
3290
* a time.
3291
*/
3292
if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
3293
inst->src[0], inst->size_read(0)))
3294
break;
3295
3296
/* SEND instructions can't have MRF as a destination. */
3297
if (scan_inst->mlen)
3298
break;
3299
3300
if (devinfo->ver == 6) {
3301
/* gfx6 math instructions must have the destination be
3302
* GRF, so no compute-to-MRF for them.
3303
*/
3304
if (scan_inst->is_math()) {
3305
break;
3306
}
3307
}
3308
3309
/* Clear the bits for any registers this instruction overwrites. */
3310
regs_left &= ~mask_relative_to(
3311
inst->src[0], scan_inst->dst, scan_inst->size_written);
3312
if (!regs_left)
3313
break;
3314
}
3315
3316
/* We don't handle control flow here. Most computation of
3317
* values that end up in MRFs are shortly before the MRF
3318
* write anyway.
3319
*/
3320
if (block->start() == scan_inst)
3321
break;
3322
3323
/* You can't read from an MRF, so if someone else reads our
3324
* MRF's source GRF that we wanted to rewrite, that stops us.
3325
*/
3326
bool interfered = false;
3327
for (int i = 0; i < scan_inst->sources; i++) {
3328
if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
3329
inst->src[0], inst->size_read(0))) {
3330
interfered = true;
3331
}
3332
}
3333
if (interfered)
3334
break;
3335
3336
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3337
inst->dst, inst->size_written)) {
3338
/* If somebody else writes our MRF here, we can't
3339
* compute-to-MRF before that.
3340
*/
3341
break;
3342
}
3343
3344
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
3345
regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
3346
inst->dst, inst->size_written)) {
3347
/* Found a SEND instruction, which means that there are
3348
* live values in MRFs from base_mrf to base_mrf +
3349
* scan_inst->mlen - 1. Don't go pushing our MRF write up
3350
* above it.
3351
*/
3352
break;
3353
}
3354
}
3355
3356
if (regs_left)
3357
continue;
3358
3359
/* Found all generating instructions of our MRF's source value, so it
3360
* should be safe to rewrite them to point to the MRF directly.
3361
*/
3362
regs_left = (1 << regs_read(inst, 0)) - 1;
3363
3364
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3365
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3366
inst->src[0], inst->size_read(0))) {
3367
/* Clear the bits for any registers this instruction overwrites. */
3368
regs_left &= ~mask_relative_to(
3369
inst->src[0], scan_inst->dst, scan_inst->size_written);
3370
3371
const unsigned rel_offset = reg_offset(scan_inst->dst) -
3372
reg_offset(inst->src[0]);
3373
3374
if (inst->dst.nr & BRW_MRF_COMPR4) {
3375
/* Apply the same address transformation done by the hardware
3376
* for COMPR4 MRF writes.
3377
*/
3378
assert(rel_offset < 2 * REG_SIZE);
3379
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
3380
3381
/* Clear the COMPR4 bit if the generating instruction is not
3382
* compressed.
3383
*/
3384
if (scan_inst->size_written < 2 * REG_SIZE)
3385
scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
3386
3387
} else {
3388
/* Calculate the MRF number the result of this instruction is
3389
* ultimately written to.
3390
*/
3391
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
3392
}
3393
3394
scan_inst->dst.file = MRF;
3395
scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
3396
scan_inst->saturate |= inst->saturate;
3397
if (!regs_left)
3398
break;
3399
}
3400
}
3401
3402
assert(!regs_left);
3403
inst->remove(block);
3404
progress = true;
3405
}
3406
3407
if (progress)
3408
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3409
3410
return progress;
3411
}
3412
3413
/**
3414
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3415
* flow. We could probably do better here with some form of divergence
3416
* analysis.
3417
*/
3418
bool
3419
fs_visitor::eliminate_find_live_channel()
3420
{
3421
bool progress = false;
3422
unsigned depth = 0;
3423
3424
if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
3425
/* The optimization below assumes that channel zero is live on thread
3426
* dispatch, which may not be the case if the fixed function dispatches
3427
* threads sparsely.
3428
*/
3429
return false;
3430
}
3431
3432
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3433
switch (inst->opcode) {
3434
case BRW_OPCODE_IF:
3435
case BRW_OPCODE_DO:
3436
depth++;
3437
break;
3438
3439
case BRW_OPCODE_ENDIF:
3440
case BRW_OPCODE_WHILE:
3441
depth--;
3442
break;
3443
3444
case BRW_OPCODE_HALT:
3445
/* This can potentially make control flow non-uniform until the end
3446
* of the program.
3447
*/
3448
return progress;
3449
3450
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3451
if (depth == 0) {
3452
inst->opcode = BRW_OPCODE_MOV;
3453
inst->src[0] = brw_imm_ud(0u);
3454
inst->sources = 1;
3455
inst->force_writemask_all = true;
3456
progress = true;
3457
}
3458
break;
3459
3460
default:
3461
break;
3462
}
3463
}
3464
3465
if (progress)
3466
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3467
3468
return progress;
3469
}
3470
3471
/**
3472
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3473
* instructions to FS_OPCODE_REP_FB_WRITE.
3474
*/
3475
void
3476
fs_visitor::emit_repclear_shader()
3477
{
3478
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3479
int base_mrf = 0;
3480
int color_mrf = base_mrf + 2;
3481
fs_inst *mov;
3482
3483
if (uniforms > 0) {
3484
mov = bld.exec_all().group(4, 0)
3485
.MOV(brw_message_reg(color_mrf),
3486
fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
3487
} else {
3488
struct brw_reg reg =
3489
brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
3490
BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
3491
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
3492
3493
mov = bld.exec_all().group(4, 0)
3494
.MOV(brw_uvec_mrf(4, color_mrf, 0), fs_reg(reg));
3495
}
3496
3497
fs_inst *write = NULL;
3498
if (key->nr_color_regions == 1) {
3499
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3500
write->saturate = key->clamp_fragment_color;
3501
write->base_mrf = color_mrf;
3502
write->target = 0;
3503
write->header_size = 0;
3504
write->mlen = 1;
3505
} else {
3506
assume(key->nr_color_regions > 0);
3507
3508
struct brw_reg header =
3509
retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
3510
bld.exec_all().group(16, 0)
3511
.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3512
3513
for (int i = 0; i < key->nr_color_regions; ++i) {
3514
if (i > 0) {
3515
bld.exec_all().group(1, 0)
3516
.MOV(component(header, 2), brw_imm_ud(i));
3517
}
3518
3519
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3520
write->saturate = key->clamp_fragment_color;
3521
write->base_mrf = base_mrf;
3522
write->target = i;
3523
write->header_size = 2;
3524
write->mlen = 3;
3525
}
3526
}
3527
write->eot = true;
3528
write->last_rt = true;
3529
3530
calculate_cfg();
3531
3532
assign_constant_locations();
3533
assign_curb_setup();
3534
3535
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3536
if (uniforms > 0) {
3537
assert(mov->src[0].file == FIXED_GRF);
3538
mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
3539
}
3540
3541
lower_scoreboard();
3542
}
3543
3544
/**
3545
* Walks through basic blocks, looking for repeated MRF writes and
3546
* removing the later ones.
3547
*/
3548
bool
3549
fs_visitor::remove_duplicate_mrf_writes()
3550
{
3551
fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];
3552
bool progress = false;
3553
3554
/* Need to update the MRF tracking for compressed instructions. */
3555
if (dispatch_width >= 16)
3556
return false;
3557
3558
memset(last_mrf_move, 0, sizeof(last_mrf_move));
3559
3560
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3561
if (inst->is_control_flow()) {
3562
memset(last_mrf_move, 0, sizeof(last_mrf_move));
3563
}
3564
3565
if (inst->opcode == BRW_OPCODE_MOV &&
3566
inst->dst.file == MRF) {
3567
fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3568
if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
3569
inst->dst.equals(prev_inst->dst) &&
3570
inst->src[0].equals(prev_inst->src[0]) &&
3571
inst->saturate == prev_inst->saturate &&
3572
inst->predicate == prev_inst->predicate &&
3573
inst->conditional_mod == prev_inst->conditional_mod &&
3574
inst->exec_size == prev_inst->exec_size) {
3575
inst->remove(block);
3576
progress = true;
3577
continue;
3578
}
3579
}
3580
3581
/* Clear out the last-write records for MRFs that were overwritten. */
3582
if (inst->dst.file == MRF) {
3583
last_mrf_move[inst->dst.nr] = NULL;
3584
}
3585
3586
if (inst->mlen > 0 && inst->base_mrf != -1) {
3587
/* Found a SEND instruction, which will include two or fewer
3588
* implied MRF writes. We could do better here.
3589
*/
3590
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3591
last_mrf_move[inst->base_mrf + i] = NULL;
3592
}
3593
}
3594
3595
/* Clear out any MRF move records whose sources got overwritten. */
3596
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3597
if (last_mrf_move[i] &&
3598
regions_overlap(inst->dst, inst->size_written,
3599
last_mrf_move[i]->src[0],
3600
last_mrf_move[i]->size_read(0))) {
3601
last_mrf_move[i] = NULL;
3602
}
3603
}
3604
3605
if (inst->opcode == BRW_OPCODE_MOV &&
3606
inst->dst.file == MRF &&
3607
inst->src[0].file != ARF &&
3608
!inst->is_partial_write()) {
3609
last_mrf_move[inst->dst.nr] = inst;
3610
}
3611
}
3612
3613
if (progress)
3614
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3615
3616
return progress;
3617
}
3618
3619
/**
* In the IR a rounding mode can be specified for every conversion, but in
* the hardware it is a piece of global state, so once it has been set we
* don't need to set it again for subsequent conversions that use the same
* mode.
*
* This pass removes the redundant mode changes, which is particularly
* useful for vector/matrix conversions, where setting the mode once is
* enough for the whole vector/matrix.
*/
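/* For example, if the base mode derived from float_controls is RTZ, a
* block containing
*
*    SHADER_OPCODE_RND_MODE RTNE   <- kept, the mode actually changes
*    ... conversions ...
*    SHADER_OPCODE_RND_MODE RTNE   <- removed, RTNE is already set
*
* only keeps the first mode change.
*/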
3627
bool
3628
fs_visitor::remove_extra_rounding_modes()
3629
{
3630
bool progress = false;
3631
unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3632
3633
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
3634
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3635
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3636
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3637
execution_mode)
3638
base_mode = BRW_RND_MODE_RTNE;
3639
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3640
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3641
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3642
execution_mode)
3643
base_mode = BRW_RND_MODE_RTZ;
3644
3645
foreach_block (block, cfg) {
3646
brw_rnd_mode prev_mode = base_mode;
3647
3648
foreach_inst_in_block_safe (fs_inst, inst, block) {
3649
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
3650
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
3651
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
3652
if (mode == prev_mode) {
3653
inst->remove(block);
3654
progress = true;
3655
} else {
3656
prev_mode = mode;
3657
}
3658
}
3659
}
3660
}
3661
3662
if (progress)
3663
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3664
3665
return progress;
3666
}
3667
3668
static void
3669
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3670
{
3671
/* Clear the flag for registers that actually got read (as expected). */
3672
for (int i = 0; i < inst->sources; i++) {
3673
int grf;
3674
if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3675
grf = inst->src[i].nr;
3676
} else {
3677
continue;
3678
}
3679
3680
if (grf >= first_grf &&
3681
grf < first_grf + grf_len) {
3682
deps[grf - first_grf] = false;
3683
if (inst->exec_size == 16)
3684
deps[grf - first_grf + 1] = false;
3685
}
3686
}
3687
}
3688
3689
/**
3690
* Implements this workaround for the original 965:
3691
*
3692
* "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3693
* check for post destination dependencies on this instruction, software
3694
* must ensure that there is no destination hazard for the case of ‘write
3695
* followed by a posted write’ shown in the following example.
3696
*
3697
* 1. mov r3 0
3698
* 2. send r3.xy <rest of send instruction>
3699
* 3. mov r2 r3
3700
*
3701
* Due to no post-destination dependency check on the ‘send’, the above
3702
* code sequence could have two instructions (1 and 2) in flight at the
3703
* same time that both consider ‘r3’ as the target of their final writes.
3704
*/
3705
void
3706
fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
3707
fs_inst *inst)
3708
{
3709
int write_len = regs_written(inst);
3710
int first_write_grf = inst->dst.nr;
3711
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3712
assert(write_len < (int)sizeof(needs_dep) - 1);
3713
3714
memset(needs_dep, false, sizeof(needs_dep));
3715
memset(needs_dep, true, write_len);
3716
3717
clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3718
3719
/* Walk backwards looking for writes to registers we're writing which
3720
* aren't read since being written. If we hit the start of the program,
3721
* we assume that there are no outstanding dependencies on entry to the
3722
* program.
3723
*/
3724
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3725
/* If we hit control flow, assume that there *are* outstanding
3726
* dependencies, and force their cleanup before our instruction.
3727
*/
3728
if (block->start() == scan_inst && block->num != 0) {
3729
for (int i = 0; i < write_len; i++) {
3730
if (needs_dep[i])
3731
DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3732
first_write_grf + i);
3733
}
3734
return;
3735
}
3736
3737
/* We insert our reads as late as possible on the assumption that any
* instruction that might have left us an outstanding dependency (i.e.
* anything other than a MOV) has more latency than a MOV.
3740
*/
3741
if (scan_inst->dst.file == VGRF) {
3742
for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3743
int reg = scan_inst->dst.nr + i;
3744
3745
if (reg >= first_write_grf &&
3746
reg < first_write_grf + write_len &&
3747
needs_dep[reg - first_write_grf]) {
3748
DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3749
needs_dep[reg - first_write_grf] = false;
3750
if (scan_inst->exec_size == 16)
3751
needs_dep[reg - first_write_grf + 1] = false;
3752
}
3753
}
3754
}
3755
3756
/* Clear the flag for registers that actually got read (as expected). */
3757
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3758
3759
/* Continue the loop only if we haven't resolved all the dependencies */
3760
int i;
3761
for (i = 0; i < write_len; i++) {
3762
if (needs_dep[i])
3763
break;
3764
}
3765
if (i == write_len)
3766
return;
3767
}
3768
}
3769
3770
/**
3771
* Implements this workaround for the original 965:
3772
*
3773
* "[DevBW, DevCL] Errata: A destination register from a send can not be
3774
* used as a destination register until after it has been sourced by an
3775
* instruction with a different destination register.
3776
*/
3777
void
3778
fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3779
{
3780
int write_len = regs_written(inst);
3781
unsigned first_write_grf = inst->dst.nr;
3782
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3783
assert(write_len < (int)sizeof(needs_dep) - 1);
3784
3785
memset(needs_dep, false, sizeof(needs_dep));
3786
memset(needs_dep, true, write_len);
3787
/* Walk forwards looking for writes to registers we're writing which aren't
3788
* read before being written.
3789
*/
3790
foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
3791
/* If we hit control flow, force resolve all remaining dependencies. */
3792
if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3793
for (int i = 0; i < write_len; i++) {
3794
if (needs_dep[i])
3795
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3796
first_write_grf + i);
3797
}
3798
return;
3799
}
3800
3801
/* Clear the flag for registers that actually got read (as expected). */
3802
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3803
3804
/* We insert our reads as late as possible since they're reading the
3805
* result of a SEND, which has massive latency.
3806
*/
3807
if (scan_inst->dst.file == VGRF &&
3808
scan_inst->dst.nr >= first_write_grf &&
3809
scan_inst->dst.nr < first_write_grf + write_len &&
3810
needs_dep[scan_inst->dst.nr - first_write_grf]) {
3811
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3812
scan_inst->dst.nr);
3813
needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3814
}
3815
3816
/* Continue the loop only if we haven't resolved all the dependencies */
3817
int i;
3818
for (i = 0; i < write_len; i++) {
3819
if (needs_dep[i])
3820
break;
3821
}
3822
if (i == write_len)
3823
return;
3824
}
3825
}
3826
3827
void
3828
fs_visitor::insert_gfx4_send_dependency_workarounds()
3829
{
3830
if (devinfo->ver != 4 || devinfo->is_g4x)
3831
return;
3832
3833
bool progress = false;
3834
3835
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3836
if (inst->mlen != 0 && inst->dst.file == VGRF) {
3837
insert_gfx4_pre_send_dependency_workarounds(block, inst);
3838
insert_gfx4_post_send_dependency_workarounds(block, inst);
3839
progress = true;
3840
}
3841
}
3842
3843
if (progress)
3844
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3845
}
3846
3847
/**
3848
* Turns the generic expression-style uniform pull constant load instruction
3849
* into a hardware-specific series of instructions for loading a pull
3850
* constant.
3851
*
3852
* The expression style allows the CSE pass before this to optimize out
3853
* repeated loads from the same offset, and gives the pre-register-allocation
3854
* scheduling full flexibility, while the conversion to native instructions
3855
* allows the post-register-allocation scheduler the best information
3856
* possible.
3857
*
3858
* Note that execution masking for setting up pull constant loads is special:
3859
* the channels that need to be written are unrelated to the current execution
3860
* mask, since a later instruction will use one of the result channels as a
3861
* source operand for all 8 or 16 of its channels.
3862
*/
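/* Accordingly, the payload setup emitted below uses exec_all() builders so
* that it runs regardless of the current execution mask.
*/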
3863
void
3864
fs_visitor::lower_uniform_pull_constant_loads()
3865
{
3866
foreach_block_and_inst (block, fs_inst, inst, cfg) {
3867
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3868
continue;
3869
3870
const fs_reg& surface = inst->src[0];
3871
const fs_reg& offset_B = inst->src[1];
3872
assert(offset_B.file == IMM);
3873
3874
if (devinfo->has_lsc) {
3875
const fs_builder ubld =
3876
fs_builder(this, block, inst).group(8, 0).exec_all();
3877
3878
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3879
ubld.MOV(payload, offset_B);
3880
3881
inst->sfid = GFX12_SFID_UGM;
3882
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
3883
1 /* simd_size */,
3884
LSC_ADDR_SURFTYPE_BTI,
3885
LSC_ADDR_SIZE_A32,
3886
1 /* num_coordinates */,
3887
LSC_DATA_SIZE_D32,
3888
inst->size_written / 4,
3889
true /* transpose */,
3890
LSC_CACHE_LOAD_L1STATE_L3MOCS,
3891
true /* has_dest */);
3892
3893
fs_reg ex_desc;
3894
if (surface.file == IMM) {
3895
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
3896
} else {
3897
/* We only need the first component for the payload so we can use
3898
* one of the other components for the extended descriptor
3899
*/
3900
ex_desc = component(payload, 1);
3901
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
3902
}
3903
3904
/* Update the original instruction. */
3905
inst->opcode = SHADER_OPCODE_SEND;
3906
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
3907
inst->ex_mlen = 0;
3908
inst->header_size = 0;
3909
inst->send_has_side_effects = false;
3910
inst->send_is_volatile = true;
3911
inst->exec_size = 1;
3912
3913
/* Finally, the payload */
3914
inst->resize_sources(3);
3915
inst->src[0] = brw_imm_ud(0); /* desc */
3916
inst->src[1] = ex_desc;
3917
inst->src[2] = payload;
3918
3919
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3920
} else if (devinfo->ver >= 7) {
3921
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
3922
const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
3923
3924
ubld.group(8, 0).MOV(payload,
3925
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3926
ubld.group(1, 0).MOV(component(payload, 2),
3927
brw_imm_ud(offset_B.ud / 16));
3928
3929
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
3930
inst->src[1] = payload;
3931
inst->header_size = 1;
3932
inst->mlen = 1;
3933
3934
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3935
} else {
3936
/* Before register allocation, we didn't tell the scheduler about the
3937
* MRF we use. We know it's safe to use this MRF because nothing
3938
* else does except for register spill/unspill, which generates and
3939
* uses its MRF within a single IR instruction.
3940
*/
3941
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
3942
inst->mlen = 1;
3943
}
3944
}
3945
}
3946
3947
bool
3948
fs_visitor::lower_load_payload()
3949
{
3950
bool progress = false;
3951
3952
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3953
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3954
continue;
3955
3956
assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3957
assert(inst->saturate == false);
3958
fs_reg dst = inst->dst;
3959
3960
/* Get rid of COMPR4. We'll add it back in if we need it */
3961
if (dst.file == MRF)
3962
dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3963
3964
const fs_builder ibld(this, block, inst);
3965
const fs_builder ubld = ibld.exec_all();
3966
3967
for (uint8_t i = 0; i < inst->header_size;) {
3968
/* Number of header GRFs to initialize at once with a single MOV
3969
* instruction.
3970
*/
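/* n is 2 only when the following source register is exactly one GRF past
* the current one (and the stride is 1), so that both header registers
* can be copied with a single SIMD16 MOV.
*/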
3971
const unsigned n =
3972
(i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3973
inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3974
2 : 1;
3975
3976
if (inst->src[i].file != BAD_FILE)
3977
ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
3978
retype(inst->src[i], BRW_REGISTER_TYPE_UD));
3979
3980
dst = byte_offset(dst, n * REG_SIZE);
3981
i += n;
3982
}
3983
3984
if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3985
inst->exec_size > 8) {
3986
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
3987
* a straightforward copy. Instead, the result of the
3988
* LOAD_PAYLOAD is treated as interleaved and the first four
3989
* non-header sources are unpacked as:
3990
*
3991
* m + 0: r0
3992
* m + 1: g0
3993
* m + 2: b0
3994
* m + 3: a0
3995
* m + 4: r1
3996
* m + 5: g1
3997
* m + 6: b1
3998
* m + 7: a1
3999
*
4000
* This is used for gen <= 5 fb writes.
4001
*/
4002
assert(inst->exec_size == 16);
4003
assert(inst->header_size + 4 <= inst->sources);
4004
for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
4005
if (inst->src[i].file != BAD_FILE) {
4006
if (devinfo->has_compr4) {
4007
fs_reg compr4_dst = retype(dst, inst->src[i].type);
4008
compr4_dst.nr |= BRW_MRF_COMPR4;
4009
ibld.MOV(compr4_dst, inst->src[i]);
4010
} else {
4011
/* Platform doesn't have COMPR4. We have to fake it */
4012
fs_reg mov_dst = retype(dst, inst->src[i].type);
4013
ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
4014
mov_dst.nr += 4;
4015
ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
4016
}
4017
}
4018
4019
dst.nr++;
4020
}
4021
4022
/* The loop above only ever incremented us through the first set
4023
* of 4 registers. However, thanks to the magic of COMPR4, we
4024
* actually wrote to the first 8 registers, so we need to take
4025
* that into account now.
4026
*/
4027
dst.nr += 4;
4028
4029
/* The COMPR4 code took care of the first 4 sources. We'll let
4030
* the regular path handle any remaining sources. Yes, we are
4031
* modifying the instruction but we're about to delete it so
4032
* this really doesn't hurt anything.
4033
*/
4034
inst->header_size += 4;
4035
}
4036
4037
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
4038
if (inst->src[i].file != BAD_FILE) {
4039
dst.type = inst->src[i].type;
4040
ibld.MOV(dst, inst->src[i]);
4041
} else {
4042
dst.type = BRW_REGISTER_TYPE_UD;
4043
}
4044
dst = offset(dst, ibld, 1);
4045
}
4046
4047
inst->remove(block);
4048
progress = true;
4049
}
4050
4051
if (progress)
4052
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4053
4054
return progress;
4055
}
4056
4057
void
4058
fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
4059
{
4060
const fs_builder ibld(this, block, inst);
4061
4062
const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
4063
if (inst->src[1].file == IMM &&
4064
(( ud && inst->src[1].ud <= UINT16_MAX) ||
4065
(!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) {
4066
/* The MUL instruction doesn't treat its operands symmetrically: on
* Gen <= 6 only the low 16 bits of src0 are read, and on Gen >= 7 only
* the low 16 bits of src1 are used.
4069
*
4070
* If multiplying by an immediate value that fits in 16-bits, do a
4071
* single MUL instruction with that value in the proper location.
4072
*/
4073
if (devinfo->ver < 7) {
4074
fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
4075
ibld.MOV(imm, inst->src[1]);
4076
ibld.MUL(inst->dst, imm, inst->src[0]);
4077
} else {
4078
ibld.MUL(inst->dst, inst->src[0],
4079
ud ? brw_imm_uw(inst->src[1].ud)
4080
: brw_imm_w(inst->src[1].d));
4081
}
4082
} else {
4083
/* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
4084
* do 32-bit integer multiplication in one instruction, but instead
4085
* must do a sequence (which actually calculates a 64-bit result):
4086
*
4087
* mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
4088
* mach(8) null g3<8,8,1>D g4<8,8,1>D
4089
* mov(8) g2<1>D acc0<8,8,1>D
4090
*
4091
* But on Gen > 6, the ability to use the second accumulator register
4092
* (acc1) for non-float data types was removed, preventing a simple
4093
* implementation in SIMD16. A 16-channel result can be calculated by
4094
* executing the three instructions twice in SIMD8, once with quarter
4095
* control of 1Q for the first eight channels and again with 2Q for
4096
* the second eight channels.
4097
*
4098
* Which accumulator register is implicitly accessed (by AccWrEnable
4099
* for instance) is determined by the quarter control. Unfortunately
4100
* Ivybridge (and presumably Baytrail) has a hardware bug in which an
4101
* implicit accumulator access by an instruction with 2Q will access
4102
* acc1 regardless of whether the data type is usable in acc1.
4103
*
4104
* Specifically, the 2Q mach(8) writes acc1 which does not exist for
4105
* integer data types.
4106
*
4107
* Since we only want the low 32-bits of the result, we can do two
4108
* 32-bit x 16-bit multiplies (like the mul and mach are doing), and
4109
* adjust the high result and add them (like the mach is doing):
4110
*
4111
* mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
4112
* mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
4113
* shl(8) g9<1>D g8<8,8,1>D 16D
4114
* add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
4115
*
4116
* We avoid the shl instruction by realizing that we only want to add
4117
* the low 16-bits of the "high" result to the high 16-bits of the
4118
* "low" result and using proper regioning on the add:
4119
*
4120
* mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
4121
* mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
4122
* add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
4123
*
4124
* Since it does not use the (single) accumulator register, we can
4125
* schedule multi-component multiplications much better.
4126
*/
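/* Worked example of the scheme above, reduced mod 2^32: for
* src0 = 0x00012345 and src1 = 0x00010002,
*
*    low  = 0x12345 * 0x0002 = 0x0002468a
*    high = 0x12345 * 0x0001 = 0x00012345
*
* and adding the low 16 bits of "high" into the high 16 bits of "low"
* gives 0x2347468a, which is the low dword of the full product
* 0x12345 * 0x10002 = 0x12347468a.
*/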
4127
4128
bool needs_mov = false;
4129
fs_reg orig_dst = inst->dst;
4130
4131
/* Get a new VGRF for the "low" 32x16-bit multiplication result if
4132
* reusing the original destination is impossible due to hardware
4133
* restrictions, source/destination overlap, or it being the null
4134
* register.
4135
*/
4136
fs_reg low = inst->dst;
4137
if (orig_dst.is_null() || orig_dst.file == MRF ||
4138
regions_overlap(inst->dst, inst->size_written,
4139
inst->src[0], inst->size_read(0)) ||
4140
regions_overlap(inst->dst, inst->size_written,
4141
inst->src[1], inst->size_read(1)) ||
4142
inst->dst.stride >= 4) {
4143
needs_mov = true;
4144
low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
4145
inst->dst.type);
4146
}
4147
4148
/* Get a new VGRF but keep the same stride as inst->dst */
4149
fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
4150
high.stride = inst->dst.stride;
4151
high.offset = inst->dst.offset % REG_SIZE;
4152
4153
if (devinfo->ver >= 7) {
4154
/* From Wa_1604601757:
4155
*
4156
* "When multiplying a DW and any lower precision integer, source modifier
4157
* is not supported."
4158
*
4159
* An unsupported negate modifier on src[1] would ordinarily be
4160
* lowered by the subsequent lower_regioning pass. In this case that
4161
* pass would spawn another dword multiply. Instead, lower the
4162
* modifier first.
4163
*/
4164
const bool source_mods_unsupported = (devinfo->ver >= 12);
4165
4166
if (inst->src[1].abs || (inst->src[1].negate &&
4167
source_mods_unsupported))
4168
lower_src_modifiers(this, block, inst, 1);
4169
4170
if (inst->src[1].file == IMM) {
4171
ibld.MUL(low, inst->src[0],
4172
brw_imm_uw(inst->src[1].ud & 0xffff));
4173
ibld.MUL(high, inst->src[0],
4174
brw_imm_uw(inst->src[1].ud >> 16));
4175
} else {
4176
ibld.MUL(low, inst->src[0],
4177
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4178
ibld.MUL(high, inst->src[0],
4179
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
4180
}
4181
} else {
4182
if (inst->src[0].abs)
4183
lower_src_modifiers(this, block, inst, 0);
4184
4185
ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
4186
inst->src[1]);
4187
ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
4188
inst->src[1]);
4189
}
4190
4191
ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
4192
subscript(low, BRW_REGISTER_TYPE_UW, 1),
4193
subscript(high, BRW_REGISTER_TYPE_UW, 0));
4194
4195
if (needs_mov || inst->conditional_mod)
4196
set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
4197
}
4198
}
4199
4200
void
4201
fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
4202
{
4203
const fs_builder ibld(this, block, inst);
4204
4205
/* Considering two 64-bit integers ab and cd where each letter
* corresponds to 32 bits, we get a 128-bit result WXYZ. We only
* need to provide the YZ part of the result:
*
*           ab
*         x cd
*      -------
*           BD
*      +   AD
*      +   BC
*      +  AC
*      -------
*         WXYZ
*
* Only BD needs to be 64 bits. For AD and BC we only care about the
* lower 32 bits (since they are part of the upper 32 bits of our
* result). AC is not needed since it starts on the 65th bit of the
* result.
*/
unsigned int q_regs = regs_written(inst);
4216
unsigned int d_regs = (q_regs + 1) / 2;
4217
4218
fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
4219
fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4220
fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4221
4222
/* Here we need the full 64 bit result for 32b * 32b. */
4223
if (devinfo->has_integer_dword_mul) {
4224
ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4225
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4226
} else {
4227
fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4228
fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4229
fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
4230
4231
fs_inst *mul = ibld.MUL(acc,
4232
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4233
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4234
mul->writes_accumulator = true;
4235
4236
ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4237
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4238
ibld.MOV(bd_low, acc);
4239
4240
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
4241
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
4242
}
4243
4244
ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
4245
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4246
ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4247
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
4248
4249
ibld.ADD(ad, ad, bc);
4250
ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
4251
subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
4252
4253
if (devinfo->has_64bit_int) {
4254
ibld.MOV(inst->dst, bd);
4255
} else {
4256
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
4257
subscript(bd, BRW_REGISTER_TYPE_UD, 0));
4258
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
4259
subscript(bd, BRW_REGISTER_TYPE_UD, 1));
4260
}
4261
}
4262
4263
void
4264
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
4265
{
4266
const fs_builder ibld(this, block, inst);
4267
4268
/* According to the BDW+ BSpec page for the "Multiply Accumulate
4269
* High" instruction:
4270
*
4271
* "An added preliminary mov is required for source modification on
4272
* src1:
4273
* mov (8) r3.0<1>:d -r3<8;8,1>:d
4274
* mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4275
* mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4276
*/
4277
if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
4278
lower_src_modifiers(this, block, inst, 1);
4279
4280
/* Should have been lowered to 8-wide. */
4281
assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
4282
const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type);
4283
fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4284
fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4285
4286
if (devinfo->ver >= 8) {
4287
/* Until Gfx8, integer multiplies read 32 bits from one source and
* 16 bits from the other, relying on the MACH instruction to
* generate the high bits of the result.
4290
*
4291
* On Gfx8, the multiply instruction does a full 32x32-bit
4292
* multiply, but in order to do a 64-bit multiply we can simulate
4293
* the previous behavior and then use a MACH instruction.
4294
*/
4295
assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
4296
mul->src[1].type == BRW_REGISTER_TYPE_UD);
4297
mul->src[1].type = BRW_REGISTER_TYPE_UW;
4298
mul->src[1].stride *= 2;
4299
4300
if (mul->src[1].file == IMM) {
4301
mul->src[1] = brw_imm_uw(mul->src[1].ud);
4302
}
4303
} else if (devinfo->verx10 == 70 &&
4304
inst->group > 0) {
4305
/* Among other things the quarter control bits influence which
4306
* accumulator register is used by the hardware for instructions
4307
* that access the accumulator implicitly (e.g. MACH). A
4308
* second-half instruction would normally map to acc1, which
4309
* doesn't exist on Gfx7 and up (the hardware does emulate it for
4310
* floating-point instructions *only* by taking advantage of the
4311
* extra precision of acc0 not normally used for floating point
4312
* arithmetic).
4313
*
4314
* HSW and up are careful enough not to try to access an
4315
* accumulator register that doesn't exist, but on earlier Gfx7
4316
* hardware we need to make sure that the quarter control bits are
4317
* zero to avoid non-deterministic behaviour and emit an extra MOV
4318
* to get the result masked correctly according to the current
4319
* channel enables.
4320
*/
4321
mach->group = 0;
4322
mach->force_writemask_all = true;
4323
mach->dst = ibld.vgrf(inst->dst.type);
4324
ibld.MOV(inst->dst, mach->dst);
4325
}
4326
}
4327
4328
bool
4329
fs_visitor::lower_integer_multiplication()
4330
{
4331
bool progress = false;
4332
4333
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4334
if (inst->opcode == BRW_OPCODE_MUL) {
4335
/* If the instruction is already in a form that does not need lowering,
4336
* return early.
4337
*/
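/* A multiply is natively supported when one of the sources is narrower
* than 32 bits; which source that has to be differs between generations
* (src0 on Gen <= 6, src1 on Gen >= 7), matching the checks below.
*/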
4338
if (devinfo->ver >= 7) {
4339
if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
4340
continue;
4341
} else {
4342
if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
4343
continue;
4344
}
4345
4346
if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
4347
inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
4348
(inst->src[0].type == BRW_REGISTER_TYPE_Q ||
4349
inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
4350
(inst->src[1].type == BRW_REGISTER_TYPE_Q ||
4351
inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
4352
lower_mul_qword_inst(inst, block);
4353
inst->remove(block);
4354
progress = true;
4355
} else if (!inst->dst.is_accumulator() &&
4356
(inst->dst.type == BRW_REGISTER_TYPE_D ||
4357
inst->dst.type == BRW_REGISTER_TYPE_UD) &&
4358
(!devinfo->has_integer_dword_mul ||
4359
devinfo->verx10 >= 125)) {
4360
lower_mul_dword_inst(inst, block);
4361
inst->remove(block);
4362
progress = true;
4363
}
4364
} else if (inst->opcode == SHADER_OPCODE_MULH) {
4365
lower_mulh_inst(inst, block);
4366
inst->remove(block);
4367
progress = true;
4368
}
4369
4370
}
4371
4372
if (progress)
4373
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4374
4375
return progress;
4376
}
4377
4378
bool
4379
fs_visitor::lower_minmax()
4380
{
4381
assert(devinfo->ver < 6);
4382
4383
bool progress = false;
4384
4385
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4386
const fs_builder ibld(this, block, inst);
4387
4388
if (inst->opcode == BRW_OPCODE_SEL &&
4389
inst->predicate == BRW_PREDICATE_NONE) {
4390
/* If src1 is an immediate value that is not NaN, then it can't be
4391
* NaN. In that case, emit CMP because it is much better for cmod
4392
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
4393
* support HF or DF, so it is not necessary to check for those.
4394
*/
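/* Either way the SEL with a conditional mod (the form MIN and MAX take in
* this backend) becomes a flag-writing CMP or CMPN followed by the same
* SEL predicated on that flag.
*/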
4395
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
4396
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4397
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4398
inst->conditional_mod);
4399
} else {
4400
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4401
inst->conditional_mod);
4402
}
4403
inst->predicate = BRW_PREDICATE_NORMAL;
4404
inst->conditional_mod = BRW_CONDITIONAL_NONE;
4405
4406
progress = true;
4407
}
4408
}
4409
4410
if (progress)
4411
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4412
4413
return progress;
4414
}
4415
4416
bool
4417
fs_visitor::lower_sub_sat()
4418
{
4419
bool progress = false;
4420
4421
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4422
const fs_builder ibld(this, block, inst);
4423
4424
if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
4425
inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4426
/* The fundamental problem is the hardware performs source negation
4427
* at the bit width of the source. If the source is 0x80000000D, the
4428
* negation is 0x80000000D. As a result, subtractSaturate(0,
4429
* 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
4430
* are at least three ways to resolve this:
4431
*
4432
* 1. Use the accumulator for the negated source. The accumulator is
4433
* 33 bits, so our source 0x80000000 is sign-extended to
* 0x180000000, whose negation is 0x080000000. This
4435
* doesn't help for 64-bit integers (which are already bigger than
4436
* 33 bits). There are also only 8 accumulators, so SIMD16 or
4437
* SIMD32 instructions would have to be split into multiple SIMD8
4438
* instructions.
4439
*
4440
* 2. Use slightly different math. For any n-bit value x, (x >> 1) can
* never be the most negative n-bit value, so negating it is always
* exact. We can use this fact to only do subtractions involving
* (x >> 1): subtractSaturate(a, b) ==
* subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4444
*
4445
* 3. For unsigned sources, it is sufficient to replace the
4446
* subtractSaturate with (a > b) ? a - b : 0.
4447
*
4448
* It may also be possible to use the SUBB instruction. This
4449
* implicitly writes the accumulator, so it could only be used in the
4450
* same situations as #1 above. It is further limited by only
4451
* allowing UD sources.
4452
*/
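/* Following approach #1 with subtractSaturate(0, 0x80000000) as signed
* integers: the accumulator holds src1 sign-extended to 33 bits,
* 0x180000000, whose 33-bit negation is +0x080000000, so the saturating
* ADD below yields the correct 0x7fffffff.
*/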
4453
if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
4454
inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
4455
fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);
4456
4457
ibld.MOV(acc, inst->src[1]);
4458
fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4459
add->saturate = true;
4460
add->src[0].negate = true;
4461
} else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4462
/* tmp = src1 >> 1;
4463
* dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4464
*/
4465
fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4466
fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4467
fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4468
fs_inst *add;
4469
4470
ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));
4471
4472
add = ibld.ADD(tmp2, inst->src[1], tmp1);
4473
add->src[1].negate = true;
4474
4475
add = ibld.ADD(tmp3, inst->src[0], tmp1);
4476
add->src[1].negate = true;
4477
add->saturate = true;
4478
4479
add = ibld.ADD(inst->dst, tmp3, tmp2);
4480
add->src[1].negate = true;
4481
add->saturate = true;
4482
} else {
4483
/* a > b ? a - b : 0 */
4484
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4485
BRW_CONDITIONAL_G);
4486
4487
fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4488
add->src[1].negate = !add->src[1].negate;
4489
4490
ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
4491
->predicate = BRW_PREDICATE_NORMAL;
4492
}
4493
4494
inst->remove(block);
4495
progress = true;
4496
}
4497
}
4498
4499
if (progress)
4500
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4501
4502
return progress;
4503
}
4504
4505
/**
4506
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
4507
* by discard. Due to the layout of the sample mask in the fragment shader
4508
* thread payload, \p bld is required to have a dispatch_width() not greater
4509
* than 16 for fragment shaders.
4510
*/
4511
static fs_reg
4512
sample_mask_reg(const fs_builder &bld)
4513
{
4514
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
4515
4516
if (v->stage != MESA_SHADER_FRAGMENT) {
4517
return brw_imm_ud(0xffffffff);
4518
} else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
4519
assert(bld.dispatch_width() <= 16);
4520
return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
4521
} else {
4522
assert(v->devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4523
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4524
BRW_REGISTER_TYPE_UW);
4525
}
4526
}
4527
4528
static void
4529
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
4530
fs_reg *dst, fs_reg color, unsigned components)
4531
{
4532
if (key->clamp_fragment_color) {
4533
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
4534
assert(color.type == BRW_REGISTER_TYPE_F);
4535
4536
for (unsigned i = 0; i < components; i++)
4537
set_saturate(true,
4538
bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
4539
4540
color = tmp;
4541
}
4542
4543
for (unsigned i = 0; i < components; i++)
4544
dst[i] = offset(color, bld, i);
4545
}
4546
4547
uint32_t
4548
brw_fb_write_msg_control(const fs_inst *inst,
4549
const struct brw_wm_prog_data *prog_data)
4550
{
4551
uint32_t mctl;
4552
4553
if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
4554
assert(inst->group == 0 && inst->exec_size == 16);
4555
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4556
} else if (prog_data->dual_src_blend) {
4557
assert(inst->exec_size == 8);
4558
4559
if (inst->group % 16 == 0)
4560
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4561
else if (inst->group % 16 == 8)
4562
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4563
else
4564
unreachable("Invalid dual-source FB write instruction group");
4565
} else {
4566
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4567
4568
if (inst->exec_size == 16)
4569
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4570
else if (inst->exec_size == 8)
4571
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4572
else
4573
unreachable("Invalid FB write execution size");
4574
}
4575
4576
return mctl;
4577
}
4578
4579
static void
4580
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
4581
const struct brw_wm_prog_data *prog_data,
4582
const brw_wm_prog_key *key,
4583
const fs_visitor::thread_payload &payload)
4584
{
4585
assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
4586
const intel_device_info *devinfo = bld.shader->devinfo;
4587
const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
4588
const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
4589
const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
4590
const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
4591
const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
4592
const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
4593
fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
4594
const unsigned components =
4595
inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
4596
4597
assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
4598
4599
/* We can potentially have a message length of up to 15, so we have to set
4600
* base_mrf to either 0 or 1 in order to fit in m0..m15.
4601
*/
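/* Payload assembled below, in order (optional parts may be skipped):
* header (0 or 2 registers), AA/dest-stencil, src0 alpha (one register
* per SIMD8 half), sample mask (oMask), color0 (4 components), color1
* (4 components), source depth, destination depth and source stencil.
*/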
4602
fs_reg sources[15];
4603
int header_size = 2, payload_header_size;
4604
unsigned length = 0;
4605
4606
if (devinfo->ver < 6) {
4607
/* TODO: Support SIMD32 on gfx4-5 */
4608
assert(bld.group() < 16);
4609
4610
/* For gfx4-5, we always have a header consisting of g0 and g1. We have
4611
* an implied MOV from g0,g1 to the start of the message. The MOV from
4612
* g0 is handled by the hardware and the MOV from g1 is provided by the
4613
* generator. This is required because, on gfx4-5, the generator may
4614
* generate two write messages with different message lengths in order
4615
* to handle AA data properly.
4616
*
4617
* Also, since the pixel mask goes in the g0 portion of the message and
4618
* since render target writes are the last thing in the shader, we write
4619
* the pixel mask directly into g0 and it will get copied as part of the
4620
* implied write.
4621
*/
4622
if (prog_data->uses_kill) {
4623
bld.exec_all().group(1, 0)
4624
.MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
4625
sample_mask_reg(bld));
4626
}
4627
4628
assert(length == 0);
4629
length = 2;
4630
} else if ((devinfo->verx10 <= 70 &&
4631
prog_data->uses_kill) ||
4632
(devinfo->ver < 11 &&
4633
(color1.file != BAD_FILE || key->nr_color_regions > 1))) {
4634
/* From the Sandy Bridge PRM, volume 4, page 198:
4635
*
4636
* "Dispatched Pixel Enables. One bit per pixel indicating
4637
* which pixels were originally enabled when the thread was
4638
* dispatched. This field is only required for the end-of-
4639
* thread message and on all dual-source messages."
4640
*/
4641
const fs_builder ubld = bld.exec_all().group(8, 0);
4642
4643
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4644
if (bld.group() < 16) {
4645
/* The header starts off as g0 and g1 for the first half */
4646
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4647
BRW_REGISTER_TYPE_UD));
4648
} else {
4649
/* The header starts off as g0 and g2 for the second half */
4650
assert(bld.group() < 32);
4651
const fs_reg header_sources[2] = {
4652
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4653
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
4654
};
4655
ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
4656
4657
/* Gfx12 will require additional fix-ups if we ever hit this path. */
4658
assert(devinfo->ver < 12);
4659
}
4660
4661
uint32_t g00_bits = 0;
4662
4663
/* Set "Source0 Alpha Present to RenderTarget" bit in message
4664
* header.
4665
*/
4666
if (src0_alpha.file != BAD_FILE)
4667
g00_bits |= 1 << 11;
4668
4669
/* Set computes stencil to render target */
4670
if (prog_data->computed_stencil)
4671
g00_bits |= 1 << 14;
4672
4673
if (g00_bits) {
4674
/* OR extra bits into g0.0 */
4675
ubld.group(1, 0).OR(component(header, 0),
4676
retype(brw_vec1_grf(0, 0),
4677
BRW_REGISTER_TYPE_UD),
4678
brw_imm_ud(g00_bits));
4679
}
4680
4681
/* Set the render target index for choosing BLEND_STATE. */
4682
if (inst->target > 0) {
4683
ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
4684
}
4685
4686
if (prog_data->uses_kill) {
4687
ubld.group(1, 0).MOV(retype(component(header, 15),
4688
BRW_REGISTER_TYPE_UW),
4689
sample_mask_reg(bld));
4690
}
4691
4692
assert(length == 0);
4693
sources[0] = header;
4694
sources[1] = horiz_offset(header, 8);
4695
length = 2;
4696
}
4697
assert(length == 0 || length == 2);
4698
header_size = length;
4699
4700
if (payload.aa_dest_stencil_reg[0]) {
4701
assert(inst->group < 16);
4702
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
4703
bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
4704
.MOV(sources[length],
4705
fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
4706
length++;
4707
}
4708
4709
if (src0_alpha.file != BAD_FILE) {
4710
for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
4711
const fs_builder &ubld = bld.exec_all().group(8, i)
4712
.annotate("FB write src0 alpha");
4713
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
4714
ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
4715
setup_color_payload(ubld, key, &sources[length], tmp, 1);
4716
length++;
4717
}
4718
}
4719
4720
if (sample_mask.file != BAD_FILE) {
4721
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
4722
BRW_REGISTER_TYPE_UD);
4723
4724
/* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
4725
* relevant. Since it's unsigned single words, one vgrf is always
4726
* 16-wide, but only the lower or higher 8 channels will be used by the
4727
* hardware when doing a SIMD8 write depending on whether we have
4728
* selected the subspans for the first or second half respectively.
4729
*/
4730
assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
4731
sample_mask.type = BRW_REGISTER_TYPE_UW;
4732
sample_mask.stride *= 2;
4733
4734
bld.exec_all().annotate("FB write oMask")
4735
.MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
4736
inst->group % 16),
4737
sample_mask);
4738
length++;
4739
}
4740
4741
payload_header_size = length;
4742
4743
setup_color_payload(bld, key, &sources[length], color0, components);
4744
length += 4;
4745
4746
if (color1.file != BAD_FILE) {
4747
setup_color_payload(bld, key, &sources[length], color1, components);
4748
length += 4;
4749
}
4750
4751
if (src_depth.file != BAD_FILE) {
4752
sources[length] = src_depth;
4753
length++;
4754
}
4755
4756
if (dst_depth.file != BAD_FILE) {
4757
sources[length] = dst_depth;
4758
length++;
4759
}
4760
4761
if (src_stencil.file != BAD_FILE) {
4762
assert(devinfo->ver >= 9);
4763
assert(bld.dispatch_width() == 8);
4764
4765
/* XXX: src_stencil is only available on gfx9+. dst_depth is never
4766
* available on gfx9+. As such it's impossible to have both enabled at the
4767
* same time and therefore length cannot overrun the array.
4768
*/
4769
assert(length < 15);
4770
4771
sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4772
bld.exec_all().annotate("FB write OS")
4773
.MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
4774
subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
4775
length++;
4776
}
4777
4778
fs_inst *load;
4779
if (devinfo->ver >= 7) {
4780
/* Send from the GRF */
4781
fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
4782
load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
4783
payload.nr = bld.shader->alloc.allocate(regs_written(load));
4784
load->dst = payload;
4785
4786
uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
4787
4788
inst->desc =
4789
(inst->group / 16) << 11 | /* rt slot group */
4790
brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
4791
prog_data->per_coarse_pixel_dispatch);
4792
4793
uint32_t ex_desc = 0;
4794
if (devinfo->ver >= 11) {
4795
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
4796
* in the extended message descriptor, in lieu of using a header.
4797
*/
4798
ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
4799
4800
if (key->nr_color_regions == 0)
4801
ex_desc |= 1 << 20; /* Null Render Target */
4802
}
4803
inst->ex_desc = ex_desc;
4804
4805
inst->opcode = SHADER_OPCODE_SEND;
4806
inst->resize_sources(3);
4807
inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
4808
inst->src[0] = brw_imm_ud(0);
4809
inst->src[1] = brw_imm_ud(0);
4810
inst->src[2] = payload;
4811
inst->mlen = regs_written(load);
4812
inst->ex_mlen = 0;
4813
inst->header_size = header_size;
4814
inst->check_tdr = true;
4815
inst->send_has_side_effects = true;
4816
} else {
4817
/* Send from the MRF */
4818
load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
4819
sources, length, payload_header_size);
4820
4821
/* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
4822
* will do this for us if we just give it a COMPR4 destination.
4823
*/
4824
if (devinfo->ver < 6 && bld.dispatch_width() == 16)
4825
load->dst.nr |= BRW_MRF_COMPR4;
4826
4827
if (devinfo->ver < 6) {
4828
/* Set up src[0] for the implied MOV from grf0-1 */
4829
inst->resize_sources(1);
4830
inst->src[0] = brw_vec8_grf(0, 0);
4831
} else {
4832
inst->resize_sources(0);
4833
}
4834
inst->base_mrf = 1;
4835
inst->opcode = FS_OPCODE_FB_WRITE;
4836
inst->mlen = regs_written(load);
4837
inst->header_size = header_size;
4838
}
4839
}
4840
4841
static void
4842
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
4843
{
4844
const intel_device_info *devinfo = bld.shader->devinfo;
4845
const fs_builder &ubld = bld.exec_all().group(8, 0);
4846
const unsigned length = 2;
4847
const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
4848
4849
if (bld.group() < 16) {
4850
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4851
BRW_REGISTER_TYPE_UD));
4852
} else {
4853
assert(bld.group() < 32);
4854
const fs_reg header_sources[] = {
4855
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4856
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
4857
};
4858
ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
4859
4860
if (devinfo->ver >= 12) {
4861
/* On Gfx12 the Viewport and Render Target Array Index fields (AKA
4862
* Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
4863
* target message header format was updated accordingly -- However
4864
* the updated format only works for the lower 16 channels in a
4865
* SIMD32 thread, since the higher 16 channels want the subspan data
4866
* from r2 instead of r1, so we need to copy over the contents of
4867
* r1.1 in order to fix things up.
4868
*/
4869
ubld.group(1, 0).MOV(component(header, 9),
4870
retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
4871
}
4872
}
4873
4874
inst->resize_sources(1);
4875
inst->src[0] = header;
4876
inst->opcode = FS_OPCODE_FB_READ;
4877
inst->mlen = length;
4878
inst->header_size = length;
4879
}
4880
4881
static void
4882
lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
4883
const fs_reg &coordinate,
4884
const fs_reg &shadow_c,
4885
const fs_reg &lod, const fs_reg &lod2,
4886
const fs_reg &surface,
4887
const fs_reg &sampler,
4888
unsigned coord_components,
4889
unsigned grad_components)
4890
{
4891
const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
4892
op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
4893
fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
4894
fs_reg msg_end = msg_begin;
4895
4896
/* g0 header. */
4897
msg_end = offset(msg_end, bld.group(8, 0), 1);
4898
4899
for (unsigned i = 0; i < coord_components; i++)
4900
bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
4901
offset(coordinate, bld, i));
4902
4903
msg_end = offset(msg_end, bld, coord_components);
4904
4905
/* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
4906
* require all three components to be present and zero if they are unused.
4907
*/
4908
if (coord_components > 0 &&
4909
(has_lod || shadow_c.file != BAD_FILE ||
4910
(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
4911
assert(coord_components <= 3);
4912
for (unsigned i = 0; i < 3 - coord_components; i++)
4913
bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
4914
4915
msg_end = offset(msg_end, bld, 3 - coord_components);
4916
}
4917
4918
if (op == SHADER_OPCODE_TXD) {
4919
/* TXD unsupported in SIMD16 mode. */
4920
assert(bld.dispatch_width() == 8);
4921
4922
/* the slots for u and v are always present, but r is optional */
4923
if (coord_components < 2)
4924
msg_end = offset(msg_end, bld, 2 - coord_components);
4925
4926
/* P = u, v, r
4927
* dPdx = dudx, dvdx, drdx
4928
* dPdy = dudy, dvdy, drdy
4929
*
4930
* 1-arg: Does not exist.
4931
*
4932
* 2-arg: dudx dvdx dudy dvdy
4933
* dPdx.x dPdx.y dPdy.x dPdy.y
4934
* m4 m5 m6 m7
4935
*
4936
* 3-arg: dudx dvdx drdx dudy dvdy drdy
4937
* dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
4938
* m5 m6 m7 m8 m9 m10
4939
*/
4940
for (unsigned i = 0; i < grad_components; i++)
4941
bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
4942
4943
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4944
4945
for (unsigned i = 0; i < grad_components; i++)
4946
bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
4947
4948
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4949
}
4950
4951
if (has_lod) {
4952
/* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
4953
* shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
4954
*/
4955
assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
4956
bld.dispatch_width() == 16);
4957
4958
const brw_reg_type type =
4959
(op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
4960
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
4961
bld.MOV(retype(msg_end, type), lod);
4962
msg_end = offset(msg_end, bld, 1);
4963
}
4964
4965
if (shadow_c.file != BAD_FILE) {
4966
if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
4967
/* There's no plain shadow compare message, so we use shadow
4968
* compare with a bias of 0.0.
4969
*/
4970
bld.MOV(msg_end, brw_imm_f(0.0f));
4971
msg_end = offset(msg_end, bld, 1);
4972
}
4973
4974
bld.MOV(msg_end, shadow_c);
4975
msg_end = offset(msg_end, bld, 1);
4976
}
4977
4978
inst->opcode = op;
4979
inst->src[0] = reg_undef;
4980
inst->src[1] = surface;
4981
inst->src[2] = sampler;
4982
inst->resize_sources(3);
4983
inst->base_mrf = msg_begin.nr;
4984
inst->mlen = msg_end.nr - msg_begin.nr;
4985
inst->header_size = 1;
4986
}
4987
4988
static void
4989
lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
4990
const fs_reg &coordinate,
4991
const fs_reg &shadow_c,
4992
const fs_reg &lod, const fs_reg &lod2,
4993
const fs_reg &sample_index,
4994
const fs_reg &surface,
4995
const fs_reg &sampler,
4996
unsigned coord_components,
4997
unsigned grad_components)
4998
{
4999
fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
5000
fs_reg msg_coords = message;
5001
unsigned header_size = 0;
5002
5003
if (inst->offset != 0) {
5004
/* The offsets set up by the visitor are in the m1 header, so we can't
5005
* go headerless.
5006
*/
5007
header_size = 1;
5008
message.nr--;
5009
}
5010
5011
for (unsigned i = 0; i < coord_components; i++)
5012
bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
5013
offset(coordinate, bld, i));
5014
5015
fs_reg msg_end = offset(msg_coords, bld, coord_components);
5016
fs_reg msg_lod = offset(msg_coords, bld, 4);
5017
5018
if (shadow_c.file != BAD_FILE) {
5019
fs_reg msg_shadow = msg_lod;
5020
bld.MOV(msg_shadow, shadow_c);
5021
msg_lod = offset(msg_shadow, bld, 1);
5022
msg_end = msg_lod;
5023
}
5024
5025
switch (op) {
5026
case SHADER_OPCODE_TXL:
5027
case FS_OPCODE_TXB:
5028
bld.MOV(msg_lod, lod);
5029
msg_end = offset(msg_lod, bld, 1);
5030
break;
5031
case SHADER_OPCODE_TXD:
5032
/**
5033
* P = u, v, r
5034
* dPdx = dudx, dvdx, drdx
5035
* dPdy = dudy, dvdy, drdy
5036
*
5037
* Load up these values:
5038
* - dudx dudy dvdx dvdy drdx drdy
5039
* - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
5040
*/
5041
msg_end = msg_lod;
5042
for (unsigned i = 0; i < grad_components; i++) {
5043
bld.MOV(msg_end, offset(lod, bld, i));
5044
msg_end = offset(msg_end, bld, 1);
5045
5046
bld.MOV(msg_end, offset(lod2, bld, i));
5047
msg_end = offset(msg_end, bld, 1);
5048
}
5049
break;
5050
case SHADER_OPCODE_TXS:
5051
msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
5052
bld.MOV(msg_lod, lod);
5053
msg_end = offset(msg_lod, bld, 1);
5054
break;
5055
case SHADER_OPCODE_TXF:
5056
msg_lod = offset(msg_coords, bld, 3);
5057
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
5058
msg_end = offset(msg_lod, bld, 1);
5059
break;
5060
case SHADER_OPCODE_TXF_CMS:
5061
msg_lod = offset(msg_coords, bld, 3);
5062
/* lod */
5063
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
5064
/* sample index */
5065
bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
5066
msg_end = offset(msg_lod, bld, 2);
5067
break;
5068
default:
5069
break;
5070
}
5071
5072
inst->opcode = op;
5073
inst->src[0] = reg_undef;
5074
inst->src[1] = surface;
5075
inst->src[2] = sampler;
5076
inst->resize_sources(3);
5077
inst->base_mrf = message.nr;
5078
inst->mlen = msg_end.nr - message.nr;
5079
inst->header_size = header_size;
5080
5081
/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5082
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5083
}
5084
5085
static bool
5086
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
5087
{
5088
if (devinfo->verx10 <= 70)
5089
return false;
5090
5091
return sampler.file != IMM || sampler.ud >= 16;
5092
}
5093
5094
static unsigned
5095
sampler_msg_type(const intel_device_info *devinfo,
5096
opcode opcode, bool shadow_compare)
5097
{
5098
assert(devinfo->ver >= 5);
5099
switch (opcode) {
5100
case SHADER_OPCODE_TEX:
5101
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
5102
GFX5_SAMPLER_MESSAGE_SAMPLE;
5103
case FS_OPCODE_TXB:
5104
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
5105
GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
5106
case SHADER_OPCODE_TXL:
5107
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
5108
GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
5109
case SHADER_OPCODE_TXL_LZ:
5110
return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
5111
GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
5112
case SHADER_OPCODE_TXS:
5113
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5114
return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
5115
case SHADER_OPCODE_TXD:
5116
assert(!shadow_compare || devinfo->verx10 >= 75);
5117
return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
5118
GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
5119
case SHADER_OPCODE_TXF:
5120
return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5121
case SHADER_OPCODE_TXF_LZ:
5122
assert(devinfo->ver >= 9);
5123
return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
5124
case SHADER_OPCODE_TXF_CMS_W:
5125
assert(devinfo->ver >= 9);
5126
return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
5127
case SHADER_OPCODE_TXF_CMS:
5128
return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
5129
GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5130
case SHADER_OPCODE_TXF_UMS:
5131
assert(devinfo->ver >= 7);
5132
return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
5133
case SHADER_OPCODE_TXF_MCS:
5134
assert(devinfo->ver >= 7);
5135
return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
5136
case SHADER_OPCODE_LOD:
5137
return GFX5_SAMPLER_MESSAGE_LOD;
5138
case SHADER_OPCODE_TG4:
5139
assert(devinfo->ver >= 7);
5140
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
5141
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
5142
break;
5143
case SHADER_OPCODE_TG4_OFFSET:
5144
assert(devinfo->ver >= 7);
5145
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
5146
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
5147
case SHADER_OPCODE_SAMPLEINFO:
5148
return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
5149
default:
5150
unreachable("not reached");
5151
}
5152
}
5153
5154
static void
5155
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
5156
const fs_reg &coordinate,
5157
const fs_reg &shadow_c,
5158
fs_reg lod, const fs_reg &lod2,
5159
const fs_reg &min_lod,
5160
const fs_reg &sample_index,
5161
const fs_reg &mcs,
5162
const fs_reg &surface,
5163
const fs_reg &sampler,
5164
const fs_reg &surface_handle,
5165
const fs_reg &sampler_handle,
5166
const fs_reg &tg4_offset,
5167
unsigned coord_components,
5168
unsigned grad_components)
5169
{
5170
const intel_device_info *devinfo = bld.shader->devinfo;
5171
const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
5172
unsigned reg_width = bld.dispatch_width() / 8;
5173
unsigned header_size = 0, length = 0;
5174
fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
5175
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
5176
sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
5177
5178
/* We must have exactly one of surface/sampler and surface/sampler_handle */
5179
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5180
assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
5181
5182
if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
5183
inst->offset != 0 || inst->eot ||
5184
op == SHADER_OPCODE_SAMPLEINFO ||
5185
sampler_handle.file != BAD_FILE ||
5186
is_high_sampler(devinfo, sampler)) {
5187
/* For general texture offsets (no txf workaround), we need a header to
5188
* put them in.
5189
*
5190
* TG4 needs to place its channel select in the header, for interaction
5191
* with ARB_texture_swizzle. The sampler index is only 4-bits, so for
5192
* larger sampler numbers we need to offset the Sampler State Pointer in
5193
* the header.
5194
*/
5195
fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
5196
header_size = 1;
5197
length++;
5198
5199
/* If we're requesting fewer than four channels worth of response,
5200
* and we have an explicit header, we need to set up the sampler
5201
* writemask. It's reversed from normal: 1 means "don't write".
5202
*/
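/* For example, a SIMD8 message whose destination covers only two registers
 * (regs_written == 2, reg_width == 1) yields mask == 0xc below, which tells
 * the sampler not to return the z and w channels.
 */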
5203
if (!inst->eot && regs_written(inst) != 4 * reg_width) {
5204
assert(regs_written(inst) % reg_width == 0);
5205
unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
5206
inst->offset |= mask << 12;
5207
}
5208
5209
/* Build the actual header */
5210
const fs_builder ubld = bld.exec_all().group(8, 0);
5211
const fs_builder ubld1 = ubld.group(1, 0);
5212
ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
5213
if (inst->offset) {
5214
ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
5215
} else if (bld.shader->stage != MESA_SHADER_VERTEX &&
5216
bld.shader->stage != MESA_SHADER_FRAGMENT) {
5217
/* The vertex and fragment stages have g0.2 set to 0, so
5218
* header0.2 is 0 when g0 is copied. Other stages may not, so we
5219
* must set it to 0 to avoid setting undesirable bits in the
5220
* message.
5221
*/
5222
ubld1.MOV(component(header, 2), brw_imm_ud(0));
5223
}
5224
5225
if (sampler_handle.file != BAD_FILE) {
5226
/* Bindless sampler handles aren't relative to the sampler state
5227
* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
5228
* Instead, it's an absolute pointer relative to dynamic state base
5229
* address.
5230
*
5231
* Sampler states are 16 bytes each and the pointer we give here has
5232
* to be 32-byte aligned. In order to avoid more indirect messages
5233
* than required, we assume that all bindless sampler states are
5234
* 32-byte aligned. This sacrifices a bit of general state base
5235
* address space but means we can do something more efficient in the
5236
* shader.
5237
*/
5238
ubld1.MOV(component(header, 3), sampler_handle);
5239
} else if (is_high_sampler(devinfo, sampler)) {
5240
fs_reg sampler_state_ptr =
5241
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
5242
5243
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
5244
* with the ones included in g0.3 bits 4:0. Mask them out.
5245
*/
5246
if (devinfo->ver >= 11) {
5247
sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5248
ubld1.AND(sampler_state_ptr,
5249
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5250
brw_imm_ud(INTEL_MASK(31, 5)));
5251
}
5252
5253
if (sampler.file == BRW_IMMEDIATE_VALUE) {
5254
assert(sampler.ud >= 16);
5255
const int sampler_state_size = 16; /* 16 bytes */
5256
5257
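/* Offset the sampler state pointer by whole groups of 16 samplers; e.g. an
 * immediate sampler index of 17 falls in group 1 and adds 16 * 16 = 256
 * bytes to the pointer taken from g0.3.
 */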
ubld1.ADD(component(header, 3), sampler_state_ptr,
5258
brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
5259
} else {
5260
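/* Same group offset for a dynamic sampler index: keep the group bits
 * (sampler & 0xf0) and multiply by the 16-byte sampler state size with the
 * shift below.
 */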
fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5261
ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
5262
ubld1.SHL(tmp, tmp, brw_imm_ud(4));
5263
ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
5264
}
5265
} else if (devinfo->ver >= 11) {
5266
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
5267
* with the ones included in g0.3 bits 4:0. Mask them out.
5268
*/
5269
ubld1.AND(component(header, 3),
5270
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5271
brw_imm_ud(INTEL_MASK(31, 5)));
5272
}
5273
}
5274
5275
if (shadow_c.file != BAD_FILE) {
5276
bld.MOV(sources[length], shadow_c);
5277
length++;
5278
}
5279
5280
bool coordinate_done = false;
5281
5282
/* Set up the LOD info */
5283
switch (op) {
5284
case FS_OPCODE_TXB:
5285
case SHADER_OPCODE_TXL:
5286
if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
5287
op = SHADER_OPCODE_TXL_LZ;
5288
break;
5289
}
5290
bld.MOV(sources[length], lod);
5291
length++;
5292
break;
5293
case SHADER_OPCODE_TXD:
5294
/* TXD should have been lowered in SIMD16 mode. */
5295
assert(bld.dispatch_width() == 8);
5296
5297
/* Load dPdx and the coordinate together:
5298
* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
5299
*/
5300
for (unsigned i = 0; i < coord_components; i++) {
5301
bld.MOV(sources[length++], offset(coordinate, bld, i));
5302
5303
/* For cube map array, the coordinate is (u,v,r,ai) but there are
5304
* only derivatives for (u, v, r).
5305
*/
5306
if (i < grad_components) {
5307
bld.MOV(sources[length++], offset(lod, bld, i));
5308
bld.MOV(sources[length++], offset(lod2, bld, i));
5309
}
5310
}
5311
5312
coordinate_done = true;
5313
break;
5314
case SHADER_OPCODE_TXS:
5315
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
5316
length++;
5317
break;
5318
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5319
/* We need an LOD; just use 0 */
5320
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
5321
length++;
5322
break;
5323
case SHADER_OPCODE_TXF:
5324
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
5325
* On Gfx9 they are u, v, lod, r
5326
*/
5327
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
5328
5329
if (devinfo->ver >= 9) {
5330
if (coord_components >= 2) {
5331
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
5332
offset(coordinate, bld, 1));
5333
} else {
5334
sources[length] = brw_imm_d(0);
5335
}
5336
length++;
5337
}
5338
5339
if (devinfo->ver >= 9 && lod.is_zero()) {
5340
op = SHADER_OPCODE_TXF_LZ;
5341
} else {
5342
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
5343
length++;
5344
}
5345
5346
for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
5347
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5348
offset(coordinate, bld, i));
5349
5350
coordinate_done = true;
5351
break;
5352
5353
case SHADER_OPCODE_TXF_CMS:
5354
case SHADER_OPCODE_TXF_CMS_W:
5355
case SHADER_OPCODE_TXF_UMS:
5356
case SHADER_OPCODE_TXF_MCS:
5357
if (op == SHADER_OPCODE_TXF_UMS ||
5358
op == SHADER_OPCODE_TXF_CMS ||
5359
op == SHADER_OPCODE_TXF_CMS_W) {
5360
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
5361
length++;
5362
}
5363
5364
if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
5365
/* Data from the multisample control surface. */
5366
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
5367
length++;
5368
5369
/* On Gfx9+ we'll use ld2dms_w instead which has two registers for
5370
* the MCS data.
5371
*/
5372
if (op == SHADER_OPCODE_TXF_CMS_W) {
5373
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
5374
mcs.file == IMM ?
5375
mcs :
5376
offset(mcs, bld, 1));
5377
length++;
5378
}
5379
}
5380
5381
/* There is no offsetting for this message; just copy in the integer
5382
* texture coordinates.
5383
*/
5384
for (unsigned i = 0; i < coord_components; i++)
5385
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5386
offset(coordinate, bld, i));
5387
5388
coordinate_done = true;
5389
break;
5390
case SHADER_OPCODE_TG4_OFFSET:
5391
/* More crazy intermixing */
5392
for (unsigned i = 0; i < 2; i++) /* u, v */
5393
bld.MOV(sources[length++], offset(coordinate, bld, i));
5394
5395
for (unsigned i = 0; i < 2; i++) /* offu, offv */
5396
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5397
offset(tg4_offset, bld, i));
5398
5399
if (coord_components == 3) /* r if present */
5400
bld.MOV(sources[length++], offset(coordinate, bld, 2));
5401
5402
coordinate_done = true;
5403
break;
5404
default:
5405
break;
5406
}
5407
5408
/* Set up the coordinate (except for cases where it was done above) */
5409
if (!coordinate_done) {
5410
for (unsigned i = 0; i < coord_components; i++)
5411
bld.MOV(sources[length++], offset(coordinate, bld, i));
5412
}
5413
5414
if (min_lod.file != BAD_FILE) {
5415
/* Account for all of the missing coordinate sources */
5416
length += 4 - coord_components;
5417
if (op == SHADER_OPCODE_TXD)
5418
length += (3 - grad_components) * 2;
5419
5420
bld.MOV(sources[length++], min_lod);
5421
}
5422
5423
unsigned mlen;
5424
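/* Each payload slot is reg_width registers wide except the header, which
 * always occupies a single register, so subtract the extra register it
 * would otherwise be counted as in SIMD16.
 */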
if (reg_width == 2)
5425
mlen = length * reg_width - header_size;
5426
else
5427
mlen = length * reg_width;
5428
5429
const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
5430
BRW_REGISTER_TYPE_F);
5431
bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
5432
5433
/* Generate the SEND. */
5434
inst->opcode = SHADER_OPCODE_SEND;
5435
inst->mlen = mlen;
5436
inst->header_size = header_size;
5437
5438
const unsigned msg_type =
5439
sampler_msg_type(devinfo, op, inst->shadow_compare);
5440
const unsigned simd_mode =
5441
inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
5442
BRW_SAMPLER_SIMD_MODE_SIMD16;
5443
5444
uint32_t base_binding_table_index;
5445
switch (op) {
5446
case SHADER_OPCODE_TG4:
5447
case SHADER_OPCODE_TG4_OFFSET:
5448
base_binding_table_index = prog_data->binding_table.gather_texture_start;
5449
break;
5450
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5451
base_binding_table_index = prog_data->binding_table.image_start;
5452
break;
5453
default:
5454
base_binding_table_index = prog_data->binding_table.texture_start;
5455
break;
5456
}
5457
5458
inst->sfid = BRW_SFID_SAMPLER;
5459
if (surface.file == IMM &&
5460
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
5461
inst->desc = brw_sampler_desc(devinfo,
5462
surface.ud + base_binding_table_index,
5463
sampler.file == IMM ? sampler.ud % 16 : 0,
5464
msg_type,
5465
simd_mode,
5466
0 /* return_format unused on gfx7+ */);
5467
inst->src[0] = brw_imm_ud(0);
5468
inst->src[1] = brw_imm_ud(0);
5469
} else if (surface_handle.file != BAD_FILE) {
5470
/* Bindless surface */
5471
assert(devinfo->ver >= 9);
5472
inst->desc = brw_sampler_desc(devinfo,
5473
GFX9_BTI_BINDLESS,
5474
sampler.file == IMM ? sampler.ud % 16 : 0,
5475
msg_type,
5476
simd_mode,
5477
0 /* return_format unused on gfx7+ */);
5478
5479
/* For bindless samplers, the entire address is included in the message
5480
* header so we can leave the portion in the message descriptor 0.
5481
*/
5482
if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
5483
inst->src[0] = brw_imm_ud(0);
5484
} else {
5485
const fs_builder ubld = bld.group(1, 0).exec_all();
5486
fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5487
ubld.SHL(desc, sampler, brw_imm_ud(8));
5488
inst->src[0] = desc;
5489
}
5490
5491
/* We assume that the driver provided the handle in the top 20 bits so
5492
* we can use the surface handle directly as the extended descriptor.
5493
*/
5494
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5495
} else {
5496
/* Immediate portion of the descriptor */
5497
inst->desc = brw_sampler_desc(devinfo,
5498
0, /* surface */
5499
0, /* sampler */
5500
msg_type,
5501
simd_mode,
5502
0 /* return_format unused on gfx7+ */);
5503
const fs_builder ubld = bld.group(1, 0).exec_all();
5504
fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5505
if (surface.equals(sampler)) {
5506
/* This case is common in GL */
5507
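/* With surface == sampler == n, n * 0x101 == n | (n << 8), packing both
 * indices into the descriptor with a single instruction.
 */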
ubld.MUL(desc, surface, brw_imm_ud(0x101));
5508
} else {
5509
if (sampler_handle.file != BAD_FILE) {
5510
ubld.MOV(desc, surface);
5511
} else if (sampler.file == IMM) {
5512
ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
5513
} else {
5514
ubld.SHL(desc, sampler, brw_imm_ud(8));
5515
ubld.OR(desc, desc, surface);
5516
}
5517
}
5518
if (base_binding_table_index)
5519
ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
5520
ubld.AND(desc, desc, brw_imm_ud(0xfff));
5521
5522
inst->src[0] = component(desc, 0);
5523
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5524
}
5525
5526
inst->ex_desc = 0;
5527
5528
inst->src[2] = src_payload;
5529
inst->resize_sources(3);
5530
5531
if (inst->eot) {
5532
/* EOT sampler messages don't make sense to split because it would
5533
* involve ending half of the thread early.
5534
*/
5535
assert(inst->group == 0);
5536
/* We need to use SENDC for EOT sampler messages */
5537
inst->check_tdr = true;
5538
inst->send_has_side_effects = true;
5539
}
5540
5541
/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5542
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5543
}
5544
5545
static void
5546
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
5547
{
5548
const intel_device_info *devinfo = bld.shader->devinfo;
5549
const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
5550
const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
5551
const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
5552
const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
5553
const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
5554
const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
5555
const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
5556
const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
5557
const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
5558
const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
5559
const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
5560
const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
5561
assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
5562
const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
5563
assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
5564
const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
5565
5566
if (devinfo->ver >= 7) {
5567
lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
5568
shadow_c, lod, lod2, min_lod,
5569
sample_index,
5570
mcs, surface, sampler,
5571
surface_handle, sampler_handle,
5572
tg4_offset,
5573
coord_components, grad_components);
5574
} else if (devinfo->ver >= 5) {
5575
lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
5576
shadow_c, lod, lod2, sample_index,
5577
surface, sampler,
5578
coord_components, grad_components);
5579
} else {
5580
lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
5581
shadow_c, lod, lod2,
5582
surface, sampler,
5583
coord_components, grad_components);
5584
}
5585
}
5586
5587
/**
5588
* Predicate the specified instruction on the sample mask.
5589
*/
5590
static void
5591
emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
5592
{
5593
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
5594
bld.group() == inst->group &&
5595
bld.dispatch_width() == inst->exec_size);
5596
5597
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
5598
const fs_reg sample_mask = sample_mask_reg(bld);
5599
const unsigned subreg = sample_mask_flag_subreg(v);
5600
5601
if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
5602
assert(sample_mask.file == ARF &&
5603
sample_mask.nr == brw_flag_subreg(subreg).nr &&
5604
sample_mask.subnr == brw_flag_subreg(
5605
subreg + inst->group / 16).subnr);
5606
} else {
5607
bld.group(1, 0).exec_all()
5608
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
5609
}
5610
5611
if (inst->predicate) {
5612
assert(inst->predicate == BRW_PREDICATE_NORMAL);
5613
assert(!inst->predicate_inverse);
5614
assert(inst->flag_subreg == 0);
5615
/* Combine the sample mask with the existing predicate by using a
5616
* vertical predication mode.
5617
*/
5618
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
5619
} else {
5620
inst->flag_subreg = subreg;
5621
inst->predicate = BRW_PREDICATE_NORMAL;
5622
inst->predicate_inverse = false;
5623
}
5624
}
5625
5626
static void
5627
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
5628
const fs_reg &surface, const fs_reg &surface_handle)
5629
{
5630
const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
5631
5632
/* We must have exactly one of surface and surface_handle */
5633
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5634
5635
if (surface.file == IMM) {
5636
inst->desc = desc | (surface.ud & 0xff);
5637
inst->src[0] = brw_imm_ud(0);
5638
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5639
} else if (surface_handle.file != BAD_FILE) {
5640
/* Bindless surface */
5641
assert(devinfo->ver >= 9);
5642
inst->desc = desc | GFX9_BTI_BINDLESS;
5643
inst->src[0] = brw_imm_ud(0);
5644
5645
/* We assume that the driver provided the handle in the top 20 bits so
5646
* we can use the surface handle directly as the extended descriptor.
5647
*/
5648
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5649
} else {
5650
inst->desc = desc;
5651
const fs_builder ubld = bld.exec_all().group(1, 0);
5652
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5653
ubld.AND(tmp, surface, brw_imm_ud(0xff));
5654
inst->src[0] = component(tmp, 0);
5655
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5656
}
5657
}
5658
5659
static void
5660
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5661
{
5662
const intel_device_info *devinfo = bld.shader->devinfo;
5663
5664
/* Get the logical send arguments. */
5665
const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5666
const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5667
const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5668
const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5669
const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5670
const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5671
const fs_reg &allow_sample_mask =
5672
inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
5673
assert(arg.file == IMM);
5674
assert(allow_sample_mask.file == IMM);
5675
5676
/* Calculate the total number of components of the payload. */
5677
const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5678
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5679
5680
const bool is_typed_access =
5681
inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
5682
inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
5683
inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
5684
5685
const bool is_surface_access = is_typed_access ||
5686
inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
5687
inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
5688
inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
5689
5690
const bool is_stateless =
5691
surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
5692
surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
5693
5694
const bool has_side_effects = inst->has_side_effects();
5695
5696
fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
5697
fs_reg(brw_imm_d(0xffff));
5698
5699
/* From the BDW PRM Volume 7, page 147:
5700
*
5701
* "For the Data Cache Data Port*, the header must be present for the
5702
* following message types: [...] Typed read/write/atomics"
5703
*
5704
* Earlier generations have a similar wording. Because of this restriction
5705
* we don't attempt to implement sample masks via predication for such
5706
* messages prior to Gfx9, since we have to provide a header anyway. On
5707
* Gfx11+ the header has been removed so we can only use predication.
5708
*
5709
* For all stateless A32 messages, we also need a header
5710
*/
5711
fs_reg header;
5712
if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
5713
fs_builder ubld = bld.exec_all().group(8, 0);
5714
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5715
if (is_stateless) {
5716
assert(!is_surface_access);
5717
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
5718
} else {
5719
ubld.MOV(header, brw_imm_d(0));
5720
if (is_surface_access)
5721
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
5722
}
5723
}
5724
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
5725
5726
fs_reg payload, payload2;
5727
unsigned mlen, ex_mlen = 0;
5728
if (devinfo->ver >= 9 &&
5729
(src.file == BAD_FILE || header.file == BAD_FILE)) {
5730
/* We have split sends on gfx9 and above */
5731
if (header.file == BAD_FILE) {
5732
payload = bld.move_to_vgrf(addr, addr_sz);
5733
payload2 = bld.move_to_vgrf(src, src_sz);
5734
mlen = addr_sz * (inst->exec_size / 8);
5735
ex_mlen = src_sz * (inst->exec_size / 8);
5736
} else {
5737
assert(src.file == BAD_FILE);
5738
payload = header;
5739
payload2 = bld.move_to_vgrf(addr, addr_sz);
5740
mlen = header_sz;
5741
ex_mlen = addr_sz * (inst->exec_size / 8);
5742
}
5743
} else {
5744
/* Allocate space for the payload. */
5745
const unsigned sz = header_sz + addr_sz + src_sz;
5746
payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
5747
fs_reg *const components = new fs_reg[sz];
5748
unsigned n = 0;
5749
5750
/* Construct the payload. */
5751
if (header.file != BAD_FILE)
5752
components[n++] = header;
5753
5754
for (unsigned i = 0; i < addr_sz; i++)
5755
components[n++] = offset(addr, bld, i);
5756
5757
for (unsigned i = 0; i < src_sz; i++)
5758
components[n++] = offset(src, bld, i);
5759
5760
bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
5761
mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
5762
5763
delete[] components;
5764
}
5765
5766
/* Predicate the instruction on the sample mask if no header is
5767
* provided.
5768
*/
5769
if ((header.file == BAD_FILE || !is_surface_access) &&
5770
sample_mask.file != BAD_FILE && sample_mask.file != IMM)
5771
emit_predicate_on_sample_mask(bld, inst);
5772
5773
uint32_t sfid;
5774
switch (inst->opcode) {
5775
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5776
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5777
/* Byte scattered opcodes go through the normal data cache */
5778
sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
5779
break;
5780
5781
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5782
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5783
sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
5784
devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
5785
BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
5786
break;
5787
5788
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5789
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5790
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5791
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5792
/* Untyped Surface messages go through the data cache but the SFID value
5793
* changed on Haswell.
5794
*/
5795
sfid = (devinfo->verx10 >= 75 ?
5796
HSW_SFID_DATAPORT_DATA_CACHE_1 :
5797
GFX7_SFID_DATAPORT_DATA_CACHE);
5798
break;
5799
5800
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5801
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5802
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5803
/* Typed surface messages go through the render cache on IVB and the
5804
* data cache on HSW+.
5805
*/
5806
sfid = (devinfo->verx10 >= 75 ?
5807
HSW_SFID_DATAPORT_DATA_CACHE_1 :
5808
GFX6_SFID_DATAPORT_RENDER_CACHE);
5809
break;
5810
5811
default:
5812
unreachable("Unsupported surface opcode");
5813
}
5814
5815
uint32_t desc;
5816
switch (inst->opcode) {
5817
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5818
desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5819
arg.ud, /* num_channels */
5820
false /* write */);
5821
break;
5822
5823
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5824
desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5825
arg.ud, /* num_channels */
5826
true /* write */);
5827
break;
5828
5829
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5830
desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5831
arg.ud, /* bit_size */
5832
false /* write */);
5833
break;
5834
5835
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5836
desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5837
arg.ud, /* bit_size */
5838
true /* write */);
5839
break;
5840
5841
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5842
assert(arg.ud == 32); /* bit_size */
5843
desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5844
false /* write */);
5845
break;
5846
5847
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5848
assert(arg.ud == 32); /* bit_size */
5849
desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5850
true /* write */);
5851
break;
5852
5853
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5854
desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
5855
arg.ud, /* atomic_op */
5856
!inst->dst.is_null());
5857
break;
5858
5859
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5860
desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
5861
arg.ud, /* atomic_op */
5862
!inst->dst.is_null());
5863
break;
5864
5865
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5866
desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5867
arg.ud, /* num_channels */
5868
false /* write */);
5869
break;
5870
5871
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5872
desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5873
arg.ud, /* num_channels */
5874
true /* write */);
5875
break;
5876
5877
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5878
desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
5879
arg.ud, /* atomic_op */
5880
!inst->dst.is_null());
5881
break;
5882
5883
default:
5884
unreachable("Unknown surface logical instruction");
5885
}
5886
5887
/* Update the original instruction. */
5888
inst->opcode = SHADER_OPCODE_SEND;
5889
inst->mlen = mlen;
5890
inst->ex_mlen = ex_mlen;
5891
inst->header_size = header_sz;
5892
inst->send_has_side_effects = has_side_effects;
5893
inst->send_is_volatile = !has_side_effects;
5894
5895
/* Set up SFID and descriptors */
5896
inst->sfid = sfid;
5897
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
5898
5899
/* Finally, the payload */
5900
inst->src[2] = payload;
5901
inst->src[3] = payload2;
5902
5903
inst->resize_sources(4);
5904
}
5905
5906
static enum lsc_opcode
5907
brw_atomic_op_to_lsc_atomic_op(unsigned op)
5908
{
5909
switch(op) {
5910
case BRW_AOP_AND:
5911
return LSC_OP_ATOMIC_AND;
5912
case BRW_AOP_OR:
5913
return LSC_OP_ATOMIC_OR;
5914
case BRW_AOP_XOR:
5915
return LSC_OP_ATOMIC_XOR;
5916
case BRW_AOP_MOV:
5917
return LSC_OP_ATOMIC_STORE;
5918
case BRW_AOP_INC:
5919
return LSC_OP_ATOMIC_INC;
5920
case BRW_AOP_DEC:
5921
return LSC_OP_ATOMIC_DEC;
5922
case BRW_AOP_ADD:
5923
return LSC_OP_ATOMIC_ADD;
5924
case BRW_AOP_SUB:
5925
return LSC_OP_ATOMIC_SUB;
5926
case BRW_AOP_IMAX:
5927
return LSC_OP_ATOMIC_MAX;
5928
case BRW_AOP_IMIN:
5929
return LSC_OP_ATOMIC_MIN;
5930
case BRW_AOP_UMAX:
5931
return LSC_OP_ATOMIC_UMAX;
5932
case BRW_AOP_UMIN:
5933
return LSC_OP_ATOMIC_UMIN;
5934
case BRW_AOP_CMPWR:
5935
return LSC_OP_ATOMIC_CMPXCHG;
5936
default:
5937
assert(false);
5938
unreachable("invalid atomic opcode");
5939
}
5940
}
5941
5942
static enum lsc_opcode
5943
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
5944
{
5945
switch(aop) {
5946
case BRW_AOP_FMAX:
5947
return LSC_OP_ATOMIC_FMAX;
5948
case BRW_AOP_FMIN:
5949
return LSC_OP_ATOMIC_FMIN;
5950
case BRW_AOP_FCMPWR:
5951
return LSC_OP_ATOMIC_FCMPXCHG;
5952
default:
5953
unreachable("Unsupported float atomic opcode");
5954
}
5955
}
5956
5957
static enum lsc_data_size
5958
lsc_bits_to_data_size(unsigned bit_size)
5959
{
5960
switch (bit_size / 8) {
5961
case 1: return LSC_DATA_SIZE_D8U32;
5962
case 2: return LSC_DATA_SIZE_D16U32;
5963
case 4: return LSC_DATA_SIZE_D32;
5964
case 8: return LSC_DATA_SIZE_D64;
5965
default:
5966
unreachable("Unsupported data size.");
5967
}
5968
}
5969
5970
static void
5971
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5972
{
5973
const intel_device_info *devinfo = bld.shader->devinfo;
5974
assert(devinfo->has_lsc);
5975
5976
/* Get the logical send arguments. */
5977
const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5978
const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5979
const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5980
const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5981
const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5982
const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5983
const fs_reg allow_sample_mask =
5984
inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
5985
assert(arg.file == IMM);
5986
assert(allow_sample_mask.file == IMM);
5987
5988
/* Calculate the total number of components of the payload. */
5989
const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5990
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5991
5992
const bool has_side_effects = inst->has_side_effects();
5993
5994
unsigned ex_mlen = 0;
5995
fs_reg payload, payload2;
5996
payload = bld.move_to_vgrf(addr, addr_sz);
5997
if (src.file != BAD_FILE) {
5998
payload2 = bld.move_to_vgrf(src, src_sz);
5999
ex_mlen = src_sz * (inst->exec_size / 8);
6000
}
6001
6002
/* Predicate the instruction on the sample mask if needed */
6003
fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
6004
fs_reg(brw_imm_d(0xffff));
6005
if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
6006
emit_predicate_on_sample_mask(bld, inst);
6007
6008
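/* Shared local memory accesses go through the SLM data port; everything
 * else uses the untyped global memory (UGM) port.
 */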
if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6009
inst->sfid = GFX12_SFID_SLM;
6010
else
6011
inst->sfid = GFX12_SFID_UGM;
6012
6013
/* We must have exactly one of surface and surface_handle */
6014
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
6015
6016
enum lsc_addr_surface_type surf_type;
6017
if (surface_handle.file != BAD_FILE)
6018
surf_type = LSC_ADDR_SURFTYPE_BSS;
6019
else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6020
surf_type = LSC_ADDR_SURFTYPE_FLAT;
6021
else
6022
surf_type = LSC_ADDR_SURFTYPE_BTI;
6023
6024
switch (inst->opcode) {
6025
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
6026
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6027
surf_type, LSC_ADDR_SIZE_A32,
6028
1 /* num_coordinates */,
6029
LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6030
false /* transpose */,
6031
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6032
true /* has_dest */);
6033
break;
6034
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
6035
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6036
surf_type, LSC_ADDR_SIZE_A32,
6037
1 /* num_coordinates */,
6038
LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6039
false /* transpose */,
6040
LSC_CACHE_STORE_L1STATE_L3MOCS,
6041
false /* has_dest */);
6042
break;
6043
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
6044
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
6045
/* Bspec: Atomic instruction -> Cache section:
6046
*
6047
* Atomic messages are always forced to "un-cacheable" in the L1
6048
* cache.
6049
*/
6050
enum lsc_opcode opcode =
6051
inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
6052
brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
6053
brw_atomic_op_to_lsc_atomic_op(arg.ud);
6054
inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6055
surf_type, LSC_ADDR_SIZE_A32,
6056
1 /* num_coordinates */,
6057
LSC_DATA_SIZE_D32, 1 /* num_channels */,
6058
false /* transpose */,
6059
LSC_CACHE_STORE_L1UC_L3WB,
6060
!inst->dst.is_null());
6061
break;
6062
}
6063
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
6064
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6065
surf_type, LSC_ADDR_SIZE_A32,
6066
1 /* num_coordinates */,
6067
lsc_bits_to_data_size(arg.ud),
6068
1 /* num_channels */,
6069
false /* transpose */,
6070
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6071
true /* has_dest */);
6072
break;
6073
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
6074
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6075
surf_type, LSC_ADDR_SIZE_A32,
6076
1 /* num_coordinates */,
6077
lsc_bits_to_data_size(arg.ud),
6078
1 /* num_channels */,
6079
false /* transpose */,
6080
LSC_CACHE_STORE_L1STATE_L3MOCS,
6081
false /* has_dest */);
6082
break;
6083
default:
6084
unreachable("Unknown surface logical instruction");
6085
}
6086
6087
inst->src[0] = brw_imm_ud(0);
6088
6089
/* Set up extended descriptors */
6090
switch (surf_type) {
6091
case LSC_ADDR_SURFTYPE_FLAT:
6092
inst->src[1] = brw_imm_ud(0);
6093
break;
6094
case LSC_ADDR_SURFTYPE_BSS:
6095
/* We assume that the driver provided the handle in the top 20 bits so
6096
* we can use the surface handle directly as the extended descriptor.
6097
*/
6098
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
6099
break;
6100
case LSC_ADDR_SURFTYPE_BTI:
6101
if (surface.file == IMM) {
6102
inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
6103
} else {
6104
const fs_builder ubld = bld.exec_all().group(1, 0);
6105
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6106
ubld.SHL(tmp, surface, brw_imm_ud(24));
6107
inst->src[1] = component(tmp, 0);
6108
}
6109
break;
6110
default:
6111
unreachable("Unknown surface type");
6112
}
6113
6114
/* Update the original instruction. */
6115
inst->opcode = SHADER_OPCODE_SEND;
6116
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6117
inst->ex_mlen = ex_mlen;
6118
inst->header_size = 0;
6119
inst->send_has_side_effects = has_side_effects;
6120
inst->send_is_volatile = !has_side_effects;
6121
6122
/* Finally, the payload */
6123
inst->src[2] = payload;
6124
inst->src[3] = payload2;
6125
6126
inst->resize_sources(4);
6127
}
6128
6129
static void
6130
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
6131
{
6132
const intel_device_info *devinfo = bld.shader->devinfo;
6133
assert(devinfo->ver >= 9);
6134
6135
/* Get the logical send arguments. */
6136
const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
6137
const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
6138
const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
6139
const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
6140
const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
6141
assert(arg.file == IMM);
6142
assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
6143
assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
6144
6145
const bool is_stateless =
6146
surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
6147
surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
6148
6149
const bool has_side_effects = inst->has_side_effects();
6150
6151
const bool align_16B =
6152
inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
6153
6154
const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
6155
6156
/* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
6157
fs_builder ubld = bld.exec_all().group(8, 0);
6158
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6159
6160
if (is_stateless)
6161
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
6162
else
6163
ubld.MOV(header, brw_imm_d(0));
6164
6165
/* Address in OWord units when aligned to OWords. */
6166
if (align_16B)
6167
ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
6168
else
6169
ubld.group(1, 0).MOV(component(header, 2), addr);
6170
6171
fs_reg data;
6172
unsigned ex_mlen = 0;
6173
if (write) {
6174
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
6175
data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
6176
ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
6177
}
6178
6179
inst->opcode = SHADER_OPCODE_SEND;
6180
inst->mlen = 1;
6181
inst->ex_mlen = ex_mlen;
6182
inst->header_size = 1;
6183
inst->send_has_side_effects = has_side_effects;
6184
inst->send_is_volatile = !has_side_effects;
6185
6186
inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6187
6188
const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
6189
arg.ud, write);
6190
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
6191
6192
inst->src[2] = header;
6193
inst->src[3] = data;
6194
6195
inst->resize_sources(4);
6196
}
6197
6198
static fs_reg
6199
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
6200
{
6201
const fs_builder ubld = bld.exec_all().group(8, 0);
6202
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6203
ubld.MOV(header, brw_imm_ud(0));
6204
6205
/* Use a 2-wide MOV to fill out the address */
6206
assert(type_sz(addr.type) == 8 && addr.stride == 0);
6207
fs_reg addr_vec2 = addr;
6208
addr_vec2.type = BRW_REGISTER_TYPE_UD;
6209
addr_vec2.stride = 1;
6210
ubld.group(2, 0).MOV(header, addr_vec2);
6211
6212
return header;
6213
}
6214
6215
static void
6216
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6217
{
6218
const intel_device_info *devinfo = bld.shader->devinfo;
6219
6220
/* Get the logical send arguments. */
6221
const fs_reg &addr = inst->src[0];
6222
const fs_reg &src = inst->src[1];
6223
const unsigned src_sz = type_sz(src.type);
6224
6225
const unsigned src_comps = inst->components_read(1);
6226
assert(inst->src[2].file == IMM);
6227
const unsigned arg = inst->src[2].ud;
6228
const bool has_side_effects = inst->has_side_effects();
6229
6230
/* If the surface message has side effects and we're a fragment shader, we
6231
* have to predicate with the sample mask to avoid helper invocations.
6232
*/
6233
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6234
emit_predicate_on_sample_mask(bld, inst);
6235
6236
fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6237
fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
6238
BRW_REGISTER_TYPE_UD);
6239
unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
6240
6241
switch (inst->opcode) {
6242
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6243
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6244
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6245
1 /* num_coordinates */,
6246
LSC_DATA_SIZE_D32, arg /* num_channels */,
6247
false /* transpose */,
6248
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6249
true /* has_dest */);
6250
break;
6251
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6252
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6253
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6254
1 /* num_coordinates */,
6255
LSC_DATA_SIZE_D32, arg /* num_channels */,
6256
false /* transpose */,
6257
LSC_CACHE_STORE_L1STATE_L3MOCS,
6258
false /* has_dest */);
6259
break;
6260
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6261
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6262
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6263
1 /* num_coordinates */,
6264
lsc_bits_to_data_size(arg),
6265
1 /* num_channels */,
6266
false /* transpose */,
6267
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6268
true /* has_dest */);
6269
break;
6270
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6271
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6272
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6273
1 /* num_coordinates */,
6274
lsc_bits_to_data_size(arg),
6275
1 /* num_channels */,
6276
false /* transpose */,
6277
LSC_CACHE_STORE_L1STATE_L3MOCS,
6278
false /* has_dest */);
6279
break;
6280
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6281
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6282
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6283
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6284
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: {
6285
/* Bspec: Atomic instruction -> Cache section:
6286
*
6287
* Atomic messages are always forced to "un-cacheable" in the L1
6288
* cache.
6289
*/
6290
enum lsc_opcode opcode =
6291
(inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
6292
inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
6293
inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
6294
brw_atomic_op_to_lsc_atomic_op(arg) :
6295
brw_atomic_op_to_lsc_fatomic_op(arg);
6296
inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6297
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6298
1 /* num_coordinates */,
6299
lsc_bits_to_data_size(src_sz * 8),
6300
1 /* num_channels */,
6301
false /* transpose */,
6302
LSC_CACHE_STORE_L1UC_L3WB,
6303
!inst->dst.is_null());
6304
break;
6305
}
6306
default:
6307
unreachable("Unknown A64 logical instruction");
6308
}
6309
6310
/* Update the original instruction. */
6311
inst->opcode = SHADER_OPCODE_SEND;
6312
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6313
inst->ex_mlen = ex_mlen;
6314
inst->header_size = 0;
6315
inst->send_has_side_effects = has_side_effects;
6316
inst->send_is_volatile = !has_side_effects;
6317
6318
/* Set up SFID and descriptors */
6319
inst->sfid = GFX12_SFID_UGM;
6320
inst->resize_sources(4);
6321
inst->src[0] = brw_imm_ud(0); /* desc */
6322
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6323
inst->src[2] = payload;
6324
inst->src[3] = payload2;
6325
}
6326
6327
static void
6328
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6329
{
6330
const intel_device_info *devinfo = bld.shader->devinfo;
6331
6332
const fs_reg &addr = inst->src[0];
6333
const fs_reg &src = inst->src[1];
6334
const unsigned src_comps = inst->components_read(1);
6335
assert(inst->src[2].file == IMM);
6336
const unsigned arg = inst->src[2].ud;
6337
const bool has_side_effects = inst->has_side_effects();
6338
6339
/* If the surface message has side effects and we're a fragment shader, we
6340
* have to predicate with the sample mask to avoid helper invocations.
6341
*/
6342
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6343
emit_predicate_on_sample_mask(bld, inst);
6344
6345
fs_reg payload, payload2;
6346
unsigned mlen, ex_mlen = 0, header_size = 0;
6347
if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
6348
inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
6349
inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
6350
assert(devinfo->ver >= 9);
6351
6352
/* OWORD messages only take a scalar address in a header */
6353
mlen = 1;
6354
header_size = 1;
6355
payload = emit_a64_oword_block_header(bld, addr);
6356
6357
if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
6358
ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6359
payload2 = retype(bld.move_to_vgrf(src, src_comps),
6360
BRW_REGISTER_TYPE_UD);
6361
}
6362
} else if (devinfo->ver >= 9) {
6363
/* On Skylake and above, we have SENDS */
6364
mlen = 2 * (inst->exec_size / 8);
6365
ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6366
payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6367
payload2 = retype(bld.move_to_vgrf(src, src_comps),
6368
BRW_REGISTER_TYPE_UD);
6369
} else {
6370
/* Add two because the address is 64-bit */
6371
const unsigned dwords = 2 + src_comps;
6372
mlen = dwords * (inst->exec_size / 8);
6373
6374
fs_reg sources[5];
6375
6376
sources[0] = addr;
6377
6378
for (unsigned i = 0; i < src_comps; i++)
6379
sources[1 + i] = offset(src, bld, i);
6380
6381
payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
6382
bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
6383
}
6384
6385
uint32_t desc;
6386
switch (inst->opcode) {
6387
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6388
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6389
arg, /* num_channels */
6390
false /* write */);
6391
break;
6392
6393
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6394
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6395
arg, /* num_channels */
6396
true /* write */);
6397
break;
6398
6399
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
6400
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6401
true, /* align_16B */
6402
arg, /* num_dwords */
6403
false /* write */);
6404
break;
6405
6406
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6407
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6408
false, /* align_16B */
6409
arg, /* num_dwords */
6410
false /* write */);
6411
break;
6412
6413
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
6414
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6415
true, /* align_16B */
6416
arg, /* num_dwords */
6417
true /* write */);
6418
break;
6419
6420
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6421
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6422
arg, /* bit_size */
6423
false /* write */);
6424
break;
6425
6426
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6427
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6428
arg, /* bit_size */
6429
true /* write */);
6430
break;
6431
6432
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6433
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
6434
arg, /* atomic_op */
6435
!inst->dst.is_null());
6436
break;
6437
6438
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6439
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
6440
arg, /* atomic_op */
6441
!inst->dst.is_null());
6442
break;
6443
6444
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6445
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
6446
arg, /* atomic_op */
6447
!inst->dst.is_null());
6448
break;
6449
6450
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6451
desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6452
16, /* bit_size */
6453
arg, /* atomic_op */
6454
!inst->dst.is_null());
6455
break;
6456
6457
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6458
desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6459
32, /* bit_size */
6460
arg, /* atomic_op */
6461
!inst->dst.is_null());
6462
break;
6463
6464
default:
6465
unreachable("Unknown A64 logical instruction");
6466
}
6467
6468
/* Update the original instruction. */
6469
inst->opcode = SHADER_OPCODE_SEND;
6470
inst->mlen = mlen;
6471
inst->ex_mlen = ex_mlen;
6472
inst->header_size = header_size;
6473
inst->send_has_side_effects = has_side_effects;
6474
inst->send_is_volatile = !has_side_effects;
6475
6476
/* Set up SFID and descriptors */
6477
inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
6478
inst->desc = desc;
6479
inst->resize_sources(4);
6480
inst->src[0] = brw_imm_ud(0); /* desc */
6481
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6482
inst->src[2] = payload;
6483
inst->src[3] = payload2;
6484
}
6485
6486
static void
6487
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
6488
fs_inst *inst)
6489
{
6490
const intel_device_info *devinfo = bld.shader->devinfo;
6491
ASSERTED const brw_compiler *compiler = bld.shader->compiler;
6492
6493
fs_reg index = inst->src[0];
6494
6495
/* We are switching the instruction from an ALU-like instruction to a
6496
* send-from-grf instruction. Since sends can't handle strides or
6497
* source modifiers, we have to make a copy of the offset source.
6498
*/
6499
fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);
6500
6501
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6502
unsigned alignment = inst->src[2].ud;
6503
6504
inst->opcode = SHADER_OPCODE_SEND;
6505
inst->sfid = GFX12_SFID_UGM;
6506
inst->resize_sources(3);
6507
inst->src[0] = brw_imm_ud(0);
6508
6509
if (index.file == IMM) {
6510
inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
6511
} else {
6512
const fs_builder ubld = bld.exec_all().group(1, 0);
6513
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6514
ubld.SHL(tmp, index, brw_imm_ud(24));
6515
inst->src[1] = component(tmp, 0);
6516
}
6517
6518
assert(!compiler->indirect_ubos_use_sampler);
6519
6520
inst->src[2] = ubo_offset; /* payload */
6521
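/* Dword-aligned offsets can be fetched as a single D32 vec4 load; lower
 * alignments fall back to four scalar D32 loads below.
 */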
if (alignment >= 4) {
6522
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6523
LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6524
1 /* num_coordinates */,
6525
LSC_DATA_SIZE_D32,
6526
4 /* num_channels */,
6527
false /* transpose */,
6528
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6529
true /* has_dest */);
6530
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6531
} else {
6532
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6533
LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6534
1 /* num_coordinates */,
6535
LSC_DATA_SIZE_D32,
6536
1 /* num_channels */,
6537
false /* transpose */,
6538
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6539
true /* has_dest */);
6540
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6541
/* The byte scattered messages can only read one dword at a time so
6542
* we have to duplicate the message 4 times to read the full vec4.
6543
* Hopefully, dead code will clean up the mess if some of them aren't
6544
* needed.
6545
*/
6546
assert(inst->size_written == 16 * inst->exec_size);
6547
inst->size_written /= 4;
6548
for (unsigned c = 1; c < 4; c++) {
6549
/* Emit a copy of the instruction because we're about to modify
6550
* it. Because this loop starts at 1, we will emit copies for the
6551
* first 3 and the final one will be the modified instruction.
6552
*/
6553
bld.emit(*inst);
6554
6555
/* Offset the source */
6556
inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6557
bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6558
6559
/* Offset the destination */
6560
inst->dst = offset(inst->dst, bld, 1);
6561
}
6562
}
6563
}
6564
6565
static void
6566
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
6567
{
6568
const intel_device_info *devinfo = bld.shader->devinfo;
6569
const brw_compiler *compiler = bld.shader->compiler;
6570
6571
if (devinfo->ver >= 7) {
6572
fs_reg index = inst->src[0];
6573
/* We are switching the instruction from an ALU-like instruction to a
6574
* send-from-grf instruction. Since sends can't handle strides or
6575
* source modifiers, we have to make a copy of the offset source.
6576
*/
6577
fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
6578
bld.MOV(ubo_offset, inst->src[1]);
6579
6580
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6581
unsigned alignment = inst->src[2].ud;
6582
6583
inst->opcode = SHADER_OPCODE_SEND;
6584
inst->mlen = inst->exec_size / 8;
6585
inst->resize_sources(3);
6586
6587
if (index.file == IMM) {
6588
inst->desc = index.ud & 0xff;
6589
inst->src[0] = brw_imm_ud(0);
6590
} else {
6591
inst->desc = 0;
6592
const fs_builder ubld = bld.exec_all().group(1, 0);
6593
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6594
ubld.AND(tmp, index, brw_imm_ud(0xff));
6595
inst->src[0] = component(tmp, 0);
6596
}
6597
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6598
inst->src[2] = ubo_offset; /* payload */
6599
6600
if (compiler->indirect_ubos_use_sampler) {
6601
const unsigned simd_mode =
6602
inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
6603
BRW_SAMPLER_SIMD_MODE_SIMD16;
6604
6605
inst->sfid = BRW_SFID_SAMPLER;
6606
inst->desc |= brw_sampler_desc(devinfo, 0, 0,
6607
GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
6608
simd_mode, 0);
6609
} else if (alignment >= 4) {
6610
inst->sfid = (devinfo->verx10 >= 75 ?
6611
HSW_SFID_DATAPORT_DATA_CACHE_1 :
6612
GFX7_SFID_DATAPORT_DATA_CACHE);
6613
inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
6614
4, /* num_channels */
6615
false /* write */);
6616
} else {
6617
inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6618
inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
6619
32, /* bit_size */
6620
false /* write */);
6621
/* The byte scattered messages can only read one dword at a time so
6622
* we have to duplicate the message 4 times to read the full vec4.
6623
* Hopefully, dead code will clean up the mess if some of them aren't
6624
* needed.
6625
*/
6626
assert(inst->size_written == 16 * inst->exec_size);
6627
inst->size_written /= 4;
6628
for (unsigned c = 1; c < 4; c++) {
6629
/* Emit a copy of the instruction because we're about to modify
6630
* it. Because this loop starts at 1, we will emit copies for the
6631
* first 3 and the final one will be the modified instruction.
6632
*/
6633
bld.emit(*inst);
6634
6635
/* Offset the source */
6636
inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6637
bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6638
6639
/* Offset the destination */
6640
inst->dst = offset(inst->dst, bld, 1);
6641
}
6642
}
6643
} else {
6644
const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
6645
BRW_REGISTER_TYPE_UD);
6646
6647
bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
6648
6649
inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
6650
inst->resize_sources(1);
6651
inst->base_mrf = payload.nr;
6652
inst->header_size = 1;
6653
inst->mlen = 1 + inst->exec_size / 8;
6654
}
6655
}
6656
6657
static void
6658
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
6659
{
6660
assert(bld.shader->devinfo->ver < 6);
6661
6662
inst->base_mrf = 2;
6663
inst->mlen = inst->sources * inst->exec_size / 8;
6664
6665
if (inst->sources > 1) {
6666
/* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
6667
* "Message Payload":
6668
*
6669
* "Operand0[7]. For the INT DIV functions, this operand is the
6670
* denominator."
6671
* ...
6672
* "Operand1[7]. For the INT DIV functions, this operand is the
6673
* numerator."
6674
*/
6675
const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
6676
const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
6677
const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
6678
6679
inst->resize_sources(1);
6680
inst->src[0] = src0;
6681
6682
assert(inst->exec_size == 8);
6683
bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
6684
}
6685
}
6686
6687
static void
6688
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
6689
{
6690
const intel_device_info *devinfo = bld.shader->devinfo;
6691
fs_reg global_addr = inst->src[0];
6692
const fs_reg &btd_record = inst->src[1];
6693
6694
const unsigned mlen = 2;
6695
const fs_builder ubld = bld.exec_all().group(8, 0);
6696
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
6697
6698
ubld.MOV(header, brw_imm_ud(0));
6699
switch (inst->opcode) {
6700
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
6701
assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
6702
global_addr.type = BRW_REGISTER_TYPE_UD;
6703
global_addr.stride = 1;
6704
ubld.group(2, 0).MOV(header, global_addr);
6705
break;
6706
6707
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
6708
/* The bottom bit is the Stack ID release bit */
6709
ubld.group(1, 0).MOV(header, brw_imm_ud(1));
6710
break;
6711
6712
default:
6713
unreachable("Invalid BTD message");
6714
}
6715
6716
/* Stack IDs are always in R1 regardless of whether we're coming from a
6717
* bindless shader or a regular compute shader.
6718
*/
6719
fs_reg stack_ids =
6720
retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
6721
bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
6722
6723
unsigned ex_mlen = 0;
6724
fs_reg payload;
6725
if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
6726
ex_mlen = 2 * (inst->exec_size / 8);
6727
payload = bld.move_to_vgrf(btd_record, 1);
6728
} else {
6729
assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
6730
/* All these messages take a BTD and things complain if we don't provide
6731
* one for RETIRE. However, it shouldn't ever actually get used so fill
6732
* it with zero.
6733
*/
6734
ex_mlen = 2 * (inst->exec_size / 8);
6735
payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
6736
}
6737
6738
/* Update the original instruction. */
6739
inst->opcode = SHADER_OPCODE_SEND;
6740
inst->mlen = mlen;
6741
inst->ex_mlen = ex_mlen;
6742
inst->header_size = 0; /* HW docs require has_header = false */
6743
inst->send_has_side_effects = true;
6744
inst->send_is_volatile = false;
6745
6746
/* Set up SFID and descriptors */
6747
inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
6748
inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
6749
GEN_RT_BTD_MESSAGE_SPAWN);
6750
inst->resize_sources(4);
6751
inst->src[0] = brw_imm_ud(0); /* desc */
6752
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6753
inst->src[2] = header;
6754
inst->src[3] = payload;
6755
}
6756
6757
static void
6758
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
6759
{
6760
const intel_device_info *devinfo = bld.shader->devinfo;
6761
const fs_reg &bvh_level = inst->src[0];
6762
assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
6763
const uint32_t trace_ray_control = inst->src[1].ud;
6764
6765
const unsigned mlen = 1;
6766
const fs_builder ubld = bld.exec_all().group(8, 0);
6767
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6768
ubld.MOV(header, brw_imm_ud(0));
6769
ubld.group(2, 0).MOV(header,
6770
retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
6771
/* TODO: Bit 128 is ray_query */
6772
6773
const unsigned ex_mlen = inst->exec_size / 8;
6774
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
6775
const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
6776
if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
6777
bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
6778
} else {
6779
bld.AND(payload, bvh_level, brw_imm_ud(0x7));
6780
if (trc_bits != 0)
6781
bld.OR(payload, payload, brw_imm_ud(trc_bits));
6782
}
6783
bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
6784
retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
6785
brw_imm_uw(0x7ff));
6786
6787
/* Update the original instruction. */
6788
inst->opcode = SHADER_OPCODE_SEND;
6789
inst->mlen = mlen;
6790
inst->ex_mlen = ex_mlen;
6791
inst->header_size = 0; /* HW docs require has_header = false */
6792
inst->send_has_side_effects = true;
6793
inst->send_is_volatile = false;
6794
6795
/* Set up SFID and descriptors */
6796
inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
6797
inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
6798
inst->resize_sources(4);
6799
inst->src[0] = brw_imm_ud(0); /* desc */
6800
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6801
inst->src[2] = header;
6802
inst->src[3] = payload;
6803
}
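/* Illustrative aside (not part of the Mesa build): a hedged sketch of the
 * per-channel payload dword assembled above, inferred from the masks and
 * offsets in the code (AND 0x7, SET_BITS(..., 9, 8), UW subword 1 masked
 * with 0x7ff) and assuming SET_BITS(value, high, low) places the value at
 * bits high:low.  The helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <stdint.h>

/* BVH level in the low 3 bits, trace-ray control in bits 9:8, and the
 * per-channel stack ID (taken from R1) in the low 11 bits of the high
 * word. */
static uint32_t
trace_ray_payload_dw(uint32_t bvh_level, uint32_t trace_ray_control,
                     uint16_t stack_id)
{
   return (bvh_level & 0x7) |
          ((trace_ray_control & 0x3) << 8) |
          (uint32_t(stack_id & 0x7ff) << 16);
}
#endif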
6804
6805
bool
6806
fs_visitor::lower_logical_sends()
6807
{
6808
bool progress = false;
6809
6810
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
6811
const fs_builder ibld(this, block, inst);
6812
6813
switch (inst->opcode) {
6814
case FS_OPCODE_FB_WRITE_LOGICAL:
6815
assert(stage == MESA_SHADER_FRAGMENT);
6816
lower_fb_write_logical_send(ibld, inst,
6817
brw_wm_prog_data(prog_data),
6818
(const brw_wm_prog_key *)key,
6819
payload);
6820
break;
6821
6822
case FS_OPCODE_FB_READ_LOGICAL:
6823
lower_fb_read_logical_send(ibld, inst);
6824
break;
6825
6826
case SHADER_OPCODE_TEX_LOGICAL:
6827
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
6828
break;
6829
6830
case SHADER_OPCODE_TXD_LOGICAL:
6831
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
6832
break;
6833
6834
case SHADER_OPCODE_TXF_LOGICAL:
6835
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
6836
break;
6837
6838
case SHADER_OPCODE_TXL_LOGICAL:
6839
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
6840
break;
6841
6842
case SHADER_OPCODE_TXS_LOGICAL:
6843
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
6844
break;
6845
6846
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
6847
lower_sampler_logical_send(ibld, inst,
6848
SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
6849
break;
6850
6851
case FS_OPCODE_TXB_LOGICAL:
6852
lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
6853
break;
6854
6855
case SHADER_OPCODE_TXF_CMS_LOGICAL:
6856
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
6857
break;
6858
6859
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
6860
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
6861
break;
6862
6863
case SHADER_OPCODE_TXF_UMS_LOGICAL:
6864
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
6865
break;
6866
6867
case SHADER_OPCODE_TXF_MCS_LOGICAL:
6868
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
6869
break;
6870
6871
case SHADER_OPCODE_LOD_LOGICAL:
6872
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
6873
break;
6874
6875
case SHADER_OPCODE_TG4_LOGICAL:
6876
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
6877
break;
6878
6879
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
6880
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
6881
break;
6882
6883
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
6884
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
6885
break;
6886
6887
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
6888
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
6889
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
6890
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
6891
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
6892
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
6893
if (devinfo->has_lsc) {
6894
lower_lsc_surface_logical_send(ibld, inst);
6895
break;
6896
}
6897
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
6898
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
6899
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
6900
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
6901
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
6902
lower_surface_logical_send(ibld, inst);
6903
break;
6904
6905
case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
6906
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6907
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
6908
lower_surface_block_logical_send(ibld, inst);
6909
break;
6910
6911
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6912
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6913
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6914
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6915
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6916
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6917
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6918
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6919
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6920
if (devinfo->has_lsc) {
6921
lower_lsc_a64_logical_send(ibld, inst);
6922
break;
6923
}
6924
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
6925
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6926
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
6927
lower_a64_logical_send(ibld, inst);
6928
break;
6929
6930
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
6931
if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
6932
lower_lsc_varying_pull_constant_logical_send(ibld, inst);
6933
else
6934
lower_varying_pull_constant_logical_send(ibld, inst);
6935
break;
6936
6937
case SHADER_OPCODE_RCP:
6938
case SHADER_OPCODE_RSQ:
6939
case SHADER_OPCODE_SQRT:
6940
case SHADER_OPCODE_EXP2:
6941
case SHADER_OPCODE_LOG2:
6942
case SHADER_OPCODE_SIN:
6943
case SHADER_OPCODE_COS:
6944
case SHADER_OPCODE_POW:
6945
case SHADER_OPCODE_INT_QUOTIENT:
6946
case SHADER_OPCODE_INT_REMAINDER:
6947
/* The math opcodes are overloaded for the send-like and
6948
* expression-like instructions which seems kind of icky. Gfx6+ has
6949
* a native (but rather quirky) MATH instruction so we don't need to
6950
* do anything here. On Gfx4-5 we'll have to lower the Gfx6-like
6951
* logical instructions (which we can easily recognize because they
6952
* have mlen = 0) into send-like virtual instructions.
6953
*/
6954
if (devinfo->ver < 6 && inst->mlen == 0) {
6955
lower_math_logical_send(ibld, inst);
6956
break;
6957
6958
} else {
6959
continue;
6960
}
6961
6962
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
6963
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
6964
lower_btd_logical_send(ibld, inst);
6965
break;
6966
6967
case RT_OPCODE_TRACE_RAY_LOGICAL:
6968
lower_trace_ray_logical_send(ibld, inst);
6969
break;
6970
6971
default:
6972
continue;
6973
}
6974
6975
progress = true;
6976
}
6977
6978
if (progress)
6979
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6980
6981
return progress;
6982
}
6983
6984
static bool
6985
is_mixed_float_with_fp32_dst(const fs_inst *inst)
6986
{
6987
/* This opcode sometimes uses :W type on the source even if the operand is
6988
* a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
6989
*/
6990
if (inst->opcode == BRW_OPCODE_F16TO32)
6991
return true;
6992
6993
if (inst->dst.type != BRW_REGISTER_TYPE_F)
6994
return false;
6995
6996
for (int i = 0; i < inst->sources; i++) {
6997
if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
6998
return true;
6999
}
7000
7001
return false;
7002
}
7003
7004
static bool
7005
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
7006
{
7007
/* This opcode sometimes uses :W type on the destination even if the
7008
* destination is a :HF, because in gfx7 there is no support for :HF, and
7009
* thus it uses :W.
7010
*/
7011
if (inst->opcode == BRW_OPCODE_F32TO16 &&
7012
inst->dst.stride == 1)
7013
return true;
7014
7015
if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
7016
inst->dst.stride != 1)
7017
return false;
7018
7019
for (int i = 0; i < inst->sources; i++) {
7020
if (inst->src[i].type == BRW_REGISTER_TYPE_F)
7021
return true;
7022
}
7023
7024
return false;
7025
}
7026
7027
/**
7028
* Get the closest allowed SIMD width for instruction \p inst accounting for
7029
* some common regioning and execution control restrictions that apply to FPU
7030
* instructions. These restrictions don't necessarily have any relevance to
7031
* instructions not executed by the FPU pipeline like extended math, control
7032
* flow or send message instructions.
7033
*
7034
* For virtual opcodes it's really up to the instruction -- In some cases
7035
* (e.g. where a virtual instruction unrolls into a simple sequence of FPU
7036
* instructions) it may simplify virtual instruction lowering if we can
7037
* enforce FPU-like regioning restrictions already on the virtual instruction,
7038
* in other cases (e.g. virtual send-like instructions) this may be
7039
* excessively restrictive.
7040
*/
7041
static unsigned
7042
get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,
7043
const fs_inst *inst)
7044
{
7045
/* Maximum execution size representable in the instruction controls. */
7046
unsigned max_width = MIN2(32, inst->exec_size);
7047
7048
/* According to the PRMs:
7049
* "A. In Direct Addressing mode, a source cannot span more than 2
7050
* adjacent GRF registers.
7051
* B. A destination cannot span more than 2 adjacent GRF registers."
7052
*
7053
* Look for the source or destination with the largest register region
7054
* which is the one that is going to limit the overall execution size of
7055
* the instruction due to this rule.
7056
*/
7057
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
7058
7059
for (unsigned i = 0; i < inst->sources; i++)
7060
reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
7061
7062
/* Calculate the maximum execution size of the instruction based on the
7063
* factor by which it goes over the hardware limit of 2 GRFs.
7064
*/
7065
if (reg_count > 2)
7066
max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
7067
7068
/* According to the IVB PRMs:
7069
* "When destination spans two registers, the source MUST span two
7070
* registers. The exception to the above rule:
7071
*
7072
* - When source is scalar, the source registers are not incremented.
7073
* - When source is packed integer Word and destination is packed
7074
* integer DWord, the source register is not incremented but the
7075
* source sub register is incremented."
7076
*
7077
* The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
7078
* restrictions. The code below intentionally doesn't check whether the
7079
* destination type is integer because empirically the hardware doesn't
7080
* seem to care what the actual type is as long as it's dword-aligned.
7081
*/
7082
if (devinfo->ver < 8) {
7083
for (unsigned i = 0; i < inst->sources; i++) {
7084
/* IVB implements DF scalars as <0;2,1> regions. */
7085
const bool is_scalar_exception = is_uniform(inst->src[i]) &&
7086
(devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
7087
const bool is_packed_word_exception =
7088
type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
7089
type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
7090
7091
/* We check size_read(i) against size_written instead of REG_SIZE
7092
* because we want to properly handle SIMD32. In SIMD32, you can end
7093
* up with writes to 4 registers and a source that reads 2 registers
7094
* and we may still need to lower all the way to SIMD8 in that case.
7095
*/
7096
if (inst->size_written > REG_SIZE &&
7097
inst->size_read(i) != 0 &&
7098
inst->size_read(i) < inst->size_written &&
7099
!is_scalar_exception && !is_packed_word_exception) {
7100
const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
7101
max_width = MIN2(max_width, inst->exec_size / reg_count);
7102
}
7103
}
7104
}
7105
7106
if (devinfo->ver < 6) {
7107
/* From the G45 PRM, Volume 4 Page 361:
7108
*
7109
* "Operand Alignment Rule: With the exceptions listed below, a
7110
* source/destination operand in general should be aligned to even
7111
* 256-bit physical register with a region size equal to two 256-bit
7112
* physical registers."
7113
*
7114
* Normally we enforce this by allocating virtual registers to the
7115
* even-aligned class. But we need to handle payload registers.
7116
*/
7117
for (unsigned i = 0; i < inst->sources; i++) {
7118
if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
7119
inst->size_read(i) > REG_SIZE) {
7120
max_width = MIN2(max_width, 8);
7121
}
7122
}
7123
}
7124
7125
/* From the IVB PRMs:
7126
* "When an instruction is SIMD32, the low 16 bits of the execution mask
7127
* are applied for both halves of the SIMD32 instruction. If different
7128
* execution mask channels are required, split the instruction into two
7129
* SIMD16 instructions."
7130
*
7131
* There is similar text in the HSW PRMs. Gfx4-6 don't even implement
7132
* 32-wide control flow support in hardware and will behave similarly.
7133
*/
7134
if (devinfo->ver < 8 && !inst->force_writemask_all)
7135
max_width = MIN2(max_width, 16);
7136
7137
/* From the IVB PRMs (applies to HSW too):
7138
* "Instructions with condition modifiers must not use SIMD32."
7139
*
7140
* From the BDW PRMs (applies to later hardware too):
7141
* "Ternary instruction with condition modifiers must not use SIMD32."
7142
*/
7143
if (inst->conditional_mod && (devinfo->ver < 8 || inst->is_3src(devinfo)))
7144
max_width = MIN2(max_width, 16);
7145
7146
/* From the IVB PRMs (applies to other devices that don't have the
7147
* intel_device_info::supports_simd16_3src flag set):
7148
* "In Align16 access mode, SIMD16 is not allowed for DW operations and
7149
* SIMD8 is not allowed for DF operations."
7150
*/
7151
if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
7152
max_width = MIN2(max_width, inst->exec_size / reg_count);
7153
7154
/* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
7155
* the 8-bit quarter of the execution mask signals specified in the
7156
* instruction control fields) for the second compressed half of any
7157
* single-precision instruction (for double-precision instructions
7158
* it's hardwired to use NibCtrl+1, at least on HSW), which means that
7159
* the EU will apply the wrong execution controls for the second
7160
* sequential GRF write if the number of channels per GRF is not exactly
7161
* eight in single-precision mode (or four in double-float mode).
7162
*
7163
* In this situation we calculate the maximum size of the split
7164
* instructions so they only ever write to a single register.
7165
*/
7166
if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
7167
!inst->force_writemask_all) {
7168
const unsigned channels_per_grf = inst->exec_size /
7169
DIV_ROUND_UP(inst->size_written, REG_SIZE);
7170
const unsigned exec_type_size = get_exec_type_size(inst);
7171
assert(exec_type_size);
7172
7173
/* The hardware shifts exactly 8 channels per compressed half of the
7174
* instruction in single-precision mode and exactly 4 in double-precision.
7175
*/
7176
if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
7177
max_width = MIN2(max_width, channels_per_grf);
7178
7179
/* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
7180
* because HW applies the same channel enable signals to both halves of
7181
* the compressed instruction which will be just wrong under
7182
* non-uniform control flow.
7183
*/
7184
if (devinfo->verx10 == 70 &&
7185
(exec_type_size == 8 || type_sz(inst->dst.type) == 8))
7186
max_width = MIN2(max_width, 4);
7187
}
7188
7189
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
7190
* Float Operations:
7191
*
7192
* "No SIMD16 in mixed mode when destination is f32. Instruction
7193
* execution size must be no more than 8."
7194
*
7195
* FIXME: the simulator doesn't seem to complain if we don't do this and
7196
* empirical testing with existing CTS tests show that they pass just fine
7197
* without implementing this, however, since our interpretation of the PRM
7198
* is that conversion MOVs between HF and F are still mixed-float
7199
* instructions (and therefore subject to this restriction) we decided to
7200
* split them to be safe. Might be useful to do additional investigation to
7201
* lift the restriction if we can ensure that it is safe though, since these
7202
* conversions are common when half-float types are involved since many
7203
* instructions do not support HF types and conversions from/to F are
7204
* required.
7205
*/
7206
if (is_mixed_float_with_fp32_dst(inst))
7207
max_width = MIN2(max_width, 8);
7208
7209
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
7210
* Float Operations:
7211
*
7212
* "No SIMD16 in mixed mode when destination is packed f16 for both
7213
* Align1 and Align16."
7214
*/
7215
if (is_mixed_float_with_packed_fp16_dst(inst))
7216
max_width = MIN2(max_width, 8);
7217
7218
/* Only power-of-two execution sizes are representable in the instruction
7219
* control fields.
7220
*/
7221
return 1 << util_logbase2(max_width);
7222
}
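/* Illustrative aside (not part of the Mesa build): a standalone worked
 * example of the two core calculations above -- the 2-GRF span rule and
 * the final power-of-two rounding -- assuming 32-byte GRFs and ignoring
 * the generation-specific exceptions.  The function names are
 * hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <assert.h>

/* A SIMD16 operation on a 64-bit type writes 16 * 8 = 128 bytes = 4 GRFs,
 * so it must be lowered by a factor of ceil(4 / 2) = 2, i.e. to SIMD8. */
static unsigned
fpu_width_for_span(unsigned exec_size, unsigned type_size)
{
   const unsigned reg_size = 32;
   const unsigned reg_count =
      (exec_size * type_size + reg_size - 1) / reg_size;
   unsigned max_width = exec_size;
   if (reg_count > 2)
      max_width = exec_size / ((reg_count + 1) / 2);
   /* Only power-of-two widths are encodable; round down. */
   unsigned width = 1;
   while (width * 2 <= max_width)
      width *= 2;
   return width;
}

static void fpu_width_examples(void)
{
   assert(fpu_width_for_span(16, 8) == 8);  /* SIMD16 DF -> SIMD8   */
   assert(fpu_width_for_span(16, 4) == 16); /* SIMD16 F fits 2 GRFs */
   assert(fpu_width_for_span(32, 4) == 16); /* SIMD32 F -> SIMD16   */
}
#endif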
7223
7224
/**
7225
* Get the maximum allowed SIMD width for instruction \p inst accounting for
7226
* various payload size restrictions that apply to sampler message
7227
* instructions.
7228
*
7229
* This is only intended to provide a maximum theoretical bound for the
7230
* execution size of the message based on the number of argument components
7231
* alone, which in most cases will determine whether the SIMD8 or SIMD16
7232
* variant of the message can be used, though some messages may have
7233
* additional restrictions not accounted for here (e.g. pre-ILK hardware uses
7234
* the message length to determine the exact SIMD width and argument count,
7235
* which makes a number of sampler message combinations impossible to
7236
* represent).
7237
*/
7238
static unsigned
7239
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
7240
const fs_inst *inst)
7241
{
7242
/* If we have a min_lod parameter on anything other than a simple sample
7243
* message, it will push it over 5 arguments and we have to fall back to
7244
* SIMD8.
7245
*/
7246
if (inst->opcode != SHADER_OPCODE_TEX &&
7247
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
7248
return 8;
7249
7250
/* Calculate the number of coordinate components that have to be present
7251
* assuming that additional arguments follow the texel coordinates in the
7252
* message payload. On IVB+ there is no need for padding, on ILK-SNB we
7253
* need to pad to four or three components depending on the message,
7254
* pre-ILK we need to pad to at most three components.
7255
*/
7256
const unsigned req_coord_components =
7257
(devinfo->ver >= 7 ||
7258
!inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
7259
(devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
7260
inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
7261
3;
7262
7263
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
7264
* variant of the TXL or TXF message.
7265
*/
7266
const bool implicit_lod = devinfo->ver >= 9 &&
7267
(inst->opcode == SHADER_OPCODE_TXL ||
7268
inst->opcode == SHADER_OPCODE_TXF) &&
7269
inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
7270
7271
/* Calculate the total number of argument components that need to be passed
7272
* to the sampler unit.
7273
*/
7274
const unsigned num_payload_components =
7275
MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
7276
req_coord_components) +
7277
inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
7278
(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
7279
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
7280
inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
7281
(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
7282
inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
7283
inst->components_read(TEX_LOGICAL_SRC_MCS);
7284
7285
/* SIMD16 messages with more than five arguments exceed the maximum message
7286
* size supported by the sampler, regardless of whether a header is
7287
* provided or not.
7288
*/
7289
return MIN2(inst->exec_size,
7290
num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
7291
}
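/* Illustrative aside (not part of the Mesa build): a worked example of the
 * argument-count rule above, assuming the SIMD16 limit works out to five
 * arguments (MAX_SAMPLER_MESSAGE_SIZE / 2 with the usual value of 11).
 * A TXL on a 2D-array shadow sampler on IVB+ needs coord(3) + shadow_c(1)
 * + lod(1) = 5 components, which still fits SIMD16, while an extra
 * gradient or min_lod argument pushes it past 5 and forces SIMD8.  The
 * helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static unsigned
sampler_width_for_components(unsigned exec_size,
                             unsigned num_payload_components)
{
   const unsigned simd16_arg_limit = 5;
   const unsigned cap = num_payload_components > simd16_arg_limit ? 8 : 16;
   return exec_size < cap ? exec_size : cap;
}
#endif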
7292
7293
/**
7294
* Get the closest native SIMD width supported by the hardware for instruction
7295
* \p inst. The instruction will be left untouched by
7296
* fs_visitor::lower_simd_width() if the returned value is equal to the
7297
* original execution size.
7298
*/
7299
static unsigned
7300
get_lowered_simd_width(const struct intel_device_info *devinfo,
7301
const fs_inst *inst)
7302
{
7303
switch (inst->opcode) {
7304
case BRW_OPCODE_MOV:
7305
case BRW_OPCODE_SEL:
7306
case BRW_OPCODE_NOT:
7307
case BRW_OPCODE_AND:
7308
case BRW_OPCODE_OR:
7309
case BRW_OPCODE_XOR:
7310
case BRW_OPCODE_SHR:
7311
case BRW_OPCODE_SHL:
7312
case BRW_OPCODE_ASR:
7313
case BRW_OPCODE_ROR:
7314
case BRW_OPCODE_ROL:
7315
case BRW_OPCODE_CMPN:
7316
case BRW_OPCODE_CSEL:
7317
case BRW_OPCODE_F32TO16:
7318
case BRW_OPCODE_F16TO32:
7319
case BRW_OPCODE_BFREV:
7320
case BRW_OPCODE_BFE:
7321
case BRW_OPCODE_ADD:
7322
case BRW_OPCODE_MUL:
7323
case BRW_OPCODE_AVG:
7324
case BRW_OPCODE_FRC:
7325
case BRW_OPCODE_RNDU:
7326
case BRW_OPCODE_RNDD:
7327
case BRW_OPCODE_RNDE:
7328
case BRW_OPCODE_RNDZ:
7329
case BRW_OPCODE_LZD:
7330
case BRW_OPCODE_FBH:
7331
case BRW_OPCODE_FBL:
7332
case BRW_OPCODE_CBIT:
7333
case BRW_OPCODE_SAD2:
7334
case BRW_OPCODE_MAD:
7335
case BRW_OPCODE_LRP:
7336
case FS_OPCODE_PACK:
7337
case SHADER_OPCODE_SEL_EXEC:
7338
case SHADER_OPCODE_CLUSTER_BROADCAST:
7339
case SHADER_OPCODE_MOV_RELOC_IMM:
7340
return get_fpu_lowered_simd_width(devinfo, inst);
7341
7342
case BRW_OPCODE_CMP: {
7343
/* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
7344
* when the destination is a GRF the dependency-clear bit on the flag
7345
* register is cleared early.
7346
*
7347
* Suggested workarounds are to disable coissuing CMP instructions
7348
* or to split CMP(16) instructions into two CMP(8) instructions.
7349
*
7350
* We choose to split into CMP(8) instructions since disabling
7351
* coissuing would affect CMP instructions not otherwise affected by
7352
* the errata.
7353
*/
7354
const unsigned max_width = (devinfo->verx10 == 70 &&
7355
!inst->dst.is_null() ? 8 : ~0);
7356
return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
7357
}
7358
case BRW_OPCODE_BFI1:
7359
case BRW_OPCODE_BFI2:
7360
/* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
7361
* should
7362
* "Force BFI instructions to be executed always in SIMD8."
7363
*/
7364
return MIN2(devinfo->is_haswell ? 8 : ~0u,
7365
get_fpu_lowered_simd_width(devinfo, inst));
7366
7367
case BRW_OPCODE_IF:
7368
assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
7369
return inst->exec_size;
7370
7371
case SHADER_OPCODE_RCP:
7372
case SHADER_OPCODE_RSQ:
7373
case SHADER_OPCODE_SQRT:
7374
case SHADER_OPCODE_EXP2:
7375
case SHADER_OPCODE_LOG2:
7376
case SHADER_OPCODE_SIN:
7377
case SHADER_OPCODE_COS: {
7378
/* Unary extended math instructions are limited to SIMD8 on Gfx4 and
7379
* Gfx6. Extended Math Function is limited to SIMD8 with half-float.
7380
*/
7381
if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))
7382
return MIN2(8, inst->exec_size);
7383
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7384
return MIN2(8, inst->exec_size);
7385
return MIN2(16, inst->exec_size);
7386
}
7387
7388
case SHADER_OPCODE_POW: {
7389
/* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
7390
* to SIMD8 with half-float
7391
*/
7392
if (devinfo->ver < 7)
7393
return MIN2(8, inst->exec_size);
7394
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7395
return MIN2(8, inst->exec_size);
7396
return MIN2(16, inst->exec_size);
7397
}
7398
7399
case SHADER_OPCODE_USUB_SAT:
7400
case SHADER_OPCODE_ISUB_SAT:
7401
return get_fpu_lowered_simd_width(devinfo, inst);
7402
7403
case SHADER_OPCODE_INT_QUOTIENT:
7404
case SHADER_OPCODE_INT_REMAINDER:
7405
/* Integer division is limited to SIMD8 on all generations. */
7406
return MIN2(8, inst->exec_size);
7407
7408
case FS_OPCODE_LINTERP:
7409
case SHADER_OPCODE_GET_BUFFER_SIZE:
7410
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
7411
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
7412
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7413
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7414
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
7415
return MIN2(16, inst->exec_size);
7416
7417
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
7418
/* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
7419
* message used to implement varying pull constant loads, so expand it
7420
* to SIMD16. An alternative with longer message payload length but
7421
* shorter return payload would be to use the SIMD8 sampler message that
7422
* takes (header, u, v, r) as parameters instead of (header, u).
7423
*/
7424
return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
7425
7426
case FS_OPCODE_DDX_COARSE:
7427
case FS_OPCODE_DDX_FINE:
7428
case FS_OPCODE_DDY_COARSE:
7429
case FS_OPCODE_DDY_FINE:
7430
/* The implementation of this virtual opcode may require emitting
7431
* compressed Align16 instructions, which are severely limited on some
7432
* generations.
7433
*
7434
* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
7435
* Region Restrictions):
7436
*
7437
* "In Align16 access mode, SIMD16 is not allowed for DW operations
7438
* and SIMD8 is not allowed for DF operations."
7439
*
7440
* In this context, "DW operations" means "operations acting on 32-bit
7441
* values", so it includes operations on floats.
7442
*
7443
* Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
7444
* (Instruction Compression -> Rules and Restrictions):
7445
*
7446
* "A compressed instruction must be in Align1 access mode. Align16
7447
* mode instructions cannot be compressed."
7448
*
7449
* Similar text exists in the g45 PRM.
7450
*
7451
* Empirically, compressed align16 instructions using odd register
7452
* numbers don't appear to work on Sandybridge either.
7453
*/
7454
return (devinfo->ver == 4 || devinfo->ver == 6 ||
7455
(devinfo->verx10 == 70) ?
7456
MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
7457
7458
case SHADER_OPCODE_MULH:
7459
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
7460
* is 8-wide on Gfx7+.
7461
*/
7462
return (devinfo->ver >= 7 ? 8 :
7463
get_fpu_lowered_simd_width(devinfo, inst));
7464
7465
case FS_OPCODE_FB_WRITE_LOGICAL:
7466
/* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
7467
* here.
7468
*/
7469
assert(devinfo->ver != 6 ||
7470
inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
7471
inst->exec_size == 8);
7472
/* Dual-source FB writes are unsupported in SIMD16 mode. */
7473
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
7474
8 : MIN2(16, inst->exec_size));
7475
7476
case FS_OPCODE_FB_READ_LOGICAL:
7477
return MIN2(16, inst->exec_size);
7478
7479
case SHADER_OPCODE_TEX_LOGICAL:
7480
case SHADER_OPCODE_TXF_CMS_LOGICAL:
7481
case SHADER_OPCODE_TXF_UMS_LOGICAL:
7482
case SHADER_OPCODE_TXF_MCS_LOGICAL:
7483
case SHADER_OPCODE_LOD_LOGICAL:
7484
case SHADER_OPCODE_TG4_LOGICAL:
7485
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
7486
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
7487
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
7488
return get_sampler_lowered_simd_width(devinfo, inst);
7489
7490
case SHADER_OPCODE_TXD_LOGICAL:
7491
/* TXD is unsupported in SIMD16 mode. */
7492
return 8;
7493
7494
case SHADER_OPCODE_TXL_LOGICAL:
7495
case FS_OPCODE_TXB_LOGICAL:
7496
/* Only one execution size is representable pre-ILK depending on whether
7497
* the shadow reference argument is present.
7498
*/
7499
if (devinfo->ver == 4)
7500
return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
7501
else
7502
return get_sampler_lowered_simd_width(devinfo, inst);
7503
7504
case SHADER_OPCODE_TXF_LOGICAL:
7505
case SHADER_OPCODE_TXS_LOGICAL:
7506
/* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
7507
* messages. Use SIMD16 instead.
7508
*/
7509
if (devinfo->ver == 4)
7510
return 16;
7511
else
7512
return get_sampler_lowered_simd_width(devinfo, inst);
7513
7514
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
7515
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
7516
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
7517
return 8;
7518
7519
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
7520
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
7521
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
7522
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
7523
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
7524
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
7525
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
7526
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
7527
return MIN2(16, inst->exec_size);
7528
7529
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
7530
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
7531
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
7532
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
7533
return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
7534
7535
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
7536
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
7537
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
7538
assert(inst->exec_size <= 16);
7539
return inst->exec_size;
7540
7541
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
7542
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
7543
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
7544
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
7545
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
7546
return 8;
7547
7548
case SHADER_OPCODE_URB_READ_SIMD8:
7549
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
7550
case SHADER_OPCODE_URB_WRITE_SIMD8:
7551
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
7552
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
7553
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
7554
return MIN2(8, inst->exec_size);
7555
7556
case SHADER_OPCODE_QUAD_SWIZZLE: {
7557
const unsigned swiz = inst->src[1].ud;
7558
return (is_uniform(inst->src[0]) ?
7559
get_fpu_lowered_simd_width(devinfo, inst) :
7560
devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
7561
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
7562
get_fpu_lowered_simd_width(devinfo, inst));
7563
}
7564
case SHADER_OPCODE_MOV_INDIRECT: {
7565
/* From IVB and HSW PRMs:
7566
*
7567
* "2.When the destination requires two registers and the sources are
7568
* indirect, the sources must use 1x1 regioning mode.
7569
*
7570
* In case of DF instructions in HSW/IVB, the exec_size is limited by
7571
* the EU decompression logic not handling VxH indirect addressing
7572
* correctly.
7573
*/
7574
const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
7575
/* Prior to Broadwell, we only have 8 address subregisters. */
7576
return MIN3(devinfo->ver >= 8 ? 16 : 8,
7577
max_size / (inst->dst.stride * type_sz(inst->dst.type)),
7578
inst->exec_size);
7579
}
7580
7581
case SHADER_OPCODE_LOAD_PAYLOAD: {
7582
const unsigned reg_count =
7583
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
7584
7585
if (reg_count > 2) {
7586
/* Only LOAD_PAYLOAD instructions with per-channel destination region
7587
* can be easily lowered (which excludes headers and heterogeneous
7588
* types).
7589
*/
7590
assert(!inst->header_size);
7591
for (unsigned i = 0; i < inst->sources; i++)
7592
assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
7593
inst->src[i].file == BAD_FILE);
7594
7595
return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
7596
} else {
7597
return inst->exec_size;
7598
}
7599
}
7600
default:
7601
return inst->exec_size;
7602
}
7603
}
7604
7605
/**
7606
* Return true if splitting out the group of channels of instruction \p inst
7607
* given by lbld.group() requires allocating a temporary for the i-th source
7608
* of the lowered instruction.
7609
*/
7610
static inline bool
7611
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
7612
{
7613
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
7614
(inst->components_read(i) == 1 &&
7615
lbld.dispatch_width() <= inst->exec_size)) ||
7616
(inst->flags_written(lbld.shader->devinfo) &
7617
flag_mask(inst->src[i], type_sz(inst->src[i].type)));
7618
}
7619
7620
/**
7621
* Extract the data that would be consumed by the channel group given by
7622
* lbld.group() from the i-th source region of instruction \p inst and return
7623
* it as result in packed form.
7624
*/
7625
static fs_reg
7626
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
7627
{
7628
assert(lbld.group() >= inst->group);
7629
7630
/* Specified channel group from the source region. */
7631
const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
7632
7633
if (needs_src_copy(lbld, inst, i)) {
7634
/* Builder of the right width to perform the copy avoiding uninitialized
7635
* data if the lowered execution size is greater than the original
7636
* execution size of the instruction.
7637
*/
7638
const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
7639
inst->exec_size), 0);
7640
const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
7641
7642
for (unsigned k = 0; k < inst->components_read(i); ++k)
7643
cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
7644
7645
return tmp;
7646
7647
} else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
7648
/* The source is invariant for all dispatch_width-wide groups of the
7649
* original region.
7650
*/
7651
return inst->src[i];
7652
7653
} else {
7654
/* We can just point the lowered instruction at the right channel group
7655
* from the original region.
7656
*/
7657
return src;
7658
}
7659
}
7660
7661
/**
7662
* Return true if splitting out the group of channels of instruction \p inst
7663
* given by lbld.group() requires allocating a temporary for the destination
7664
* of the lowered instruction and copying the data back to the original
7665
* destination region.
7666
*/
7667
static inline bool
7668
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
7669
{
7670
/* If the instruction writes more than one component we'll have to shuffle
7671
* the results of multiple lowered instructions in order to make sure that
7672
* they end up arranged correctly in the original destination region.
7673
*/
7674
if (inst->size_written > inst->dst.component_size(inst->exec_size))
7675
return true;
7676
7677
/* If the lowered execution size is larger than the original the result of
7678
* the instruction won't fit in the original destination, so we'll have to
7679
* allocate a temporary in any case.
7680
*/
7681
if (lbld.dispatch_width() > inst->exec_size)
7682
return true;
7683
7684
for (unsigned i = 0; i < inst->sources; i++) {
7685
/* If we already made a copy of the source for other reasons there won't
7686
* be any overlap with the destination.
7687
*/
7688
if (needs_src_copy(lbld, inst, i))
7689
continue;
7690
7691
/* In order to keep the logic simple we emit a copy whenever the
7692
* destination region doesn't exactly match an overlapping source, which
7693
* may point at the source and destination not being aligned group by
7694
* group which could cause one of the lowered instructions to overwrite
7695
* the data read from the same source by other lowered instructions.
7696
*/
7697
if (regions_overlap(inst->dst, inst->size_written,
7698
inst->src[i], inst->size_read(i)) &&
7699
!inst->dst.equals(inst->src[i]))
7700
return true;
7701
}
7702
7703
return false;
7704
}
7705
7706
/**
7707
* Insert data from a packed temporary into the channel group given by
7708
* lbld.group() of the destination region of instruction \p inst and return
7709
* the temporary as result. Any copy instructions that are required for
7710
* unzipping the previous value (in the case of partial writes) will be
7711
* inserted using \p lbld_before and any copy instructions required for
7712
* zipping up the destination of \p inst will be inserted using \p lbld_after.
7713
*/
7714
static fs_reg
7715
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
7716
fs_inst *inst)
7717
{
7718
assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
7719
assert(lbld_before.group() == lbld_after.group());
7720
assert(lbld_after.group() >= inst->group);
7721
7722
/* Specified channel group from the destination region. */
7723
const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
7724
const unsigned dst_size = inst->size_written /
7725
inst->dst.component_size(inst->exec_size);
7726
7727
if (needs_dst_copy(lbld_after, inst)) {
7728
const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
7729
7730
if (inst->predicate) {
7731
/* Handle predication by copying the original contents of
7732
* the destination into the temporary before emitting the
7733
* lowered instruction.
7734
*/
7735
const fs_builder gbld_before =
7736
lbld_before.group(MIN2(lbld_before.dispatch_width(),
7737
inst->exec_size), 0);
7738
for (unsigned k = 0; k < dst_size; ++k) {
7739
gbld_before.MOV(offset(tmp, lbld_before, k),
7740
offset(dst, inst->exec_size, k));
7741
}
7742
}
7743
7744
const fs_builder gbld_after =
7745
lbld_after.group(MIN2(lbld_after.dispatch_width(),
7746
inst->exec_size), 0);
7747
for (unsigned k = 0; k < dst_size; ++k) {
7748
/* Use a builder of the right width to perform the copy avoiding
7749
* uninitialized data if the lowered execution size is greater than
7750
* the original execution size of the instruction.
7751
*/
7752
gbld_after.MOV(offset(dst, inst->exec_size, k),
7753
offset(tmp, lbld_after, k));
7754
}
7755
7756
return tmp;
7757
7758
} else {
7759
/* No need to allocate a temporary for the lowered instruction, just
7760
* take the right group of channels from the original region.
7761
*/
7762
return dst;
7763
}
7764
}
7765
7766
bool
7767
fs_visitor::lower_simd_width()
7768
{
7769
bool progress = false;
7770
7771
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7772
const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
7773
7774
if (lower_width != inst->exec_size) {
7775
/* Builder matching the original instruction. We may also need to
7776
* emit an instruction of width larger than the original, set the
7777
* execution size of the builder to the highest of both for now so
7778
* we're sure that both cases can be handled.
7779
*/
7780
const unsigned max_width = MAX2(inst->exec_size, lower_width);
7781
const fs_builder ibld = bld.at(block, inst)
7782
.exec_all(inst->force_writemask_all)
7783
.group(max_width, inst->group / max_width);
7784
7785
/* Split the copies in chunks of the execution width of either the
7786
* original or the lowered instruction, whichever is lower.
7787
*/
7788
const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
7789
const unsigned dst_size = inst->size_written /
7790
inst->dst.component_size(inst->exec_size);
7791
7792
assert(!inst->writes_accumulator && !inst->mlen);
7793
7794
/* Inserting the zip, unzip, and duplicated instructions in all of
7795
* the right spots is somewhat tricky. All of the unzip and any
7796
* instructions from the zip which unzip the destination prior to
7797
* writing need to happen before all of the per-group instructions
7798
* and the zip instructions need to happen after. In order to sort
7799
* this all out, we insert the unzip instructions before \p inst,
7800
* insert the per-group instructions after \p inst (i.e. before
7801
* inst->next), and insert the zip instructions before the
7802
* instruction after \p inst. Since we are inserting instructions
7803
* after \p inst, inst->next is a moving target and we need to save
7804
* it off here so that we insert the zip instructions in the right
7805
* place.
7806
*
7807
* Since we're inserting split instructions after after_inst, the
7808
* instructions will end up in the reverse order that we insert them.
7809
* However, certain render target writes require that the low group
7810
* instructions come before the high group. From the Ivy Bridge PRM
7811
* Vol. 4, Pt. 1, Section 3.9.11:
7812
*
7813
* "If multiple SIMD8 Dual Source messages are delivered by the
7814
* pixel shader thread, each SIMD8_DUALSRC_LO message must be
7815
* issued before the SIMD8_DUALSRC_HI message with the same Slot
7816
* Group Select setting."
7817
*
7818
* And, from Section 3.9.11.1 of the same PRM:
7819
*
7820
* "When SIMD32 or SIMD16 PS threads send render target writes
7821
* with multiple SIMD8 and SIMD16 messages, the following must
7822
* hold:
7823
*
7824
* All the slots (as described above) must have a corresponding
7825
* render target write irrespective of the slot's validity. A slot
7826
* is considered valid when at least one sample is enabled. For
7827
* example, a SIMD16 PS thread must send two SIMD8 render target
7828
* writes to cover all the slots.
7829
*
7830
* PS thread must send SIMD render target write messages with
7831
* increasing slot numbers. For example, SIMD16 thread has
7832
* Slot[15:0] and if two SIMD8 render target writes are used, the
7833
* first SIMD8 render target write must send Slot[7:0] and the
7834
* next one must send Slot[15:8]."
7835
*
7836
* In order to make low group instructions come before high group
7837
* instructions (this is required for some render target writes), we
7838
* split from the highest group to lowest.
7839
*/
7840
exec_node *const after_inst = inst->next;
7841
for (int i = n - 1; i >= 0; i--) {
7842
/* Emit a copy of the original instruction with the lowered width.
7843
* If the EOT flag was set throw it away except for the last
7844
* instruction to avoid killing the thread prematurely.
7845
*/
7846
fs_inst split_inst = *inst;
7847
split_inst.exec_size = lower_width;
7848
split_inst.eot = inst->eot && i == int(n - 1);
7849
7850
/* Select the correct channel enables for the i-th group, then
7851
* transform the sources and destination and emit the lowered
7852
* instruction.
7853
*/
7854
const fs_builder lbld = ibld.group(lower_width, i);
7855
7856
for (unsigned j = 0; j < inst->sources; j++)
7857
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
7858
7859
split_inst.dst = emit_zip(lbld.at(block, inst),
7860
lbld.at(block, after_inst), inst);
7861
split_inst.size_written =
7862
split_inst.dst.component_size(lower_width) * dst_size;
7863
7864
lbld.at(block, inst->next).emit(split_inst);
7865
}
7866
7867
inst->remove(block);
7868
progress = true;
7869
}
7870
}
7871
7872
if (progress)
7873
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7874
7875
return progress;
7876
}
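/* Illustrative aside (not part of the Mesa build): a hedged standalone
 * sketch of the unzip/zip idea used by lower_simd_width() above, on plain
 * arrays.  A 16-wide operation is split into two 8-wide groups; each group
 * reads the matching channel range of the sources (unzip) and writes a
 * packed temporary that is copied back to the right channels of the
 * destination (zip).  Hardware regioning and the low-before-high ordering
 * constraints are ignored; the function name is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <stddef.h>

static void
lowered_add_simd16(const float src0[16], const float src1[16],
                   float dst[16])
{
   const unsigned lower_width = 8;
   for (unsigned group = 0; group < 16 / lower_width; group++) {
      float tmp[8];                                  /* packed temporary */
      const size_t base = group * lower_width;       /* unzip offset     */
      for (unsigned c = 0; c < lower_width; c++)
         tmp[c] = src0[base + c] + src1[base + c];   /* per-group op     */
      for (unsigned c = 0; c < lower_width; c++)
         dst[base + c] = tmp[c];                     /* zip back         */
   }
}
#endif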
7877
7878
/**
7879
* Transform barycentric vectors into the interleaved form expected by the PLN
7880
* instruction and returned by the Gfx7+ PI shared function.
7881
*
7882
* For channels 0-15 in SIMD16 mode they are expected to be laid out as
7883
* follows in the register file:
7884
*
7885
* rN+0: X[0-7]
7886
* rN+1: Y[0-7]
7887
* rN+2: X[8-15]
7888
* rN+3: Y[8-15]
7889
*
7890
* There is no need to handle SIMD32 here -- This is expected to be run after
7891
* SIMD lowering, since SIMD lowering relies on vectors having the standard
7892
* component layout.
7893
*/
7894
bool
7895
fs_visitor::lower_barycentrics()
7896
{
7897
const bool has_interleaved_layout = devinfo->has_pln || devinfo->ver >= 7;
7898
bool progress = false;
7899
7900
if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
7901
return false;
7902
7903
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7904
if (inst->exec_size < 16)
7905
continue;
7906
7907
const fs_builder ibld(this, block, inst);
7908
const fs_builder ubld = ibld.exec_all().group(8, 0);
7909
7910
switch (inst->opcode) {
7911
case FS_OPCODE_LINTERP : {
7912
assert(inst->exec_size == 16);
7913
const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
7914
fs_reg srcs[4];
7915
7916
for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
7917
srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
7918
8 * (i / 2));
7919
7920
ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
7921
7922
inst->src[0] = tmp;
7923
progress = true;
7924
break;
7925
}
7926
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7927
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7928
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
7929
assert(inst->exec_size == 16);
7930
const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
7931
7932
for (unsigned i = 0; i < 2; i++) {
7933
for (unsigned g = 0; g < inst->exec_size / 8; g++) {
7934
fs_inst *mov = ibld.at(block, inst->next).group(8, g)
7935
.MOV(horiz_offset(offset(inst->dst, ibld, i),
7936
8 * g),
7937
offset(tmp, ubld, 2 * g + i));
7938
mov->predicate = inst->predicate;
7939
mov->predicate_inverse = inst->predicate_inverse;
7940
mov->flag_subreg = inst->flag_subreg;
7941
}
7942
}
7943
7944
inst->dst = tmp;
7945
progress = true;
7946
break;
7947
}
7948
default:
7949
break;
7950
}
7951
}
7952
7953
if (progress)
7954
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7955
7956
return progress;
7957
}
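/* Illustrative aside (not part of the Mesa build): a standalone sketch of
 * the interleaved PLN layout described in the comment above.  SIMD16 X and
 * Y barycentric components are rearranged from the standard per-component
 * layout into X[0-7], Y[0-7], X[8-15], Y[8-15], one 8-channel register per
 * slot.  The helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static void
interleave_barycentrics(const float x[16], const float y[16],
                        float out[4][8])
{
   for (unsigned g = 0; g < 2; g++) {
      for (unsigned c = 0; c < 8; c++) {
         out[2 * g + 0][c] = x[8 * g + c];   /* rN + 2g    : X group g */
         out[2 * g + 1][c] = y[8 * g + c];   /* rN + 2g + 1: Y group g */
      }
   }
}
#endif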
7958
7959
/**
7960
* Lower a derivative instruction as the floating-point difference of two
7961
* swizzles of the source, specified as \p swz0 and \p swz1.
7962
*/
7963
static bool
7964
lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
7965
unsigned swz0, unsigned swz1)
7966
{
7967
const fs_builder ibld(v, block, inst);
7968
const fs_reg tmp0 = ibld.vgrf(inst->src[0].type);
7969
const fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
7970
7971
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
7972
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
7973
7974
inst->resize_sources(2);
7975
inst->src[0] = negate(tmp0);
7976
inst->src[1] = tmp1;
7977
inst->opcode = BRW_OPCODE_ADD;
7978
7979
return true;
7980
}
7981
7982
/**
7983
* Lower derivative instructions on platforms where codegen cannot implement
7984
* them efficiently (i.e. XeHP).
7985
*/
7986
bool
7987
fs_visitor::lower_derivatives()
7988
{
7989
bool progress = false;
7990
7991
if (devinfo->verx10 < 125)
7992
return false;
7993
7994
foreach_block_and_inst(block, fs_inst, inst, cfg) {
7995
if (inst->opcode == FS_OPCODE_DDX_COARSE)
7996
progress |= lower_derivative(this, block, inst,
7997
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
7998
7999
else if (inst->opcode == FS_OPCODE_DDX_FINE)
8000
progress |= lower_derivative(this, block, inst,
8001
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
8002
8003
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
8004
progress |= lower_derivative(this, block, inst,
8005
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
8006
8007
else if (inst->opcode == FS_OPCODE_DDY_FINE)
8008
progress |= lower_derivative(this, block, inst,
8009
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
8010
}
8011
8012
if (progress)
8013
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8014
8015
return progress;
8016
}
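/* Illustrative aside (not part of the Mesa build): a standalone sketch of
 * the swizzle-difference lowering above for a single 2x2 subspan, assuming
 * the usual channel layout (0 = top-left, 1 = top-right, 2 = bottom-left,
 * 3 = bottom-right).  Each derivative is the difference of two quad
 * swizzles of the source, exactly what the ADD(-swz0, swz1) rewrite
 * computes.  The helper names are hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static void
ddx_fine_quad(const float src[4], float dst[4])
{
   /* swz1 = YYWW, swz0 = XXZZ  ->  dst = src.yyww - src.xxzz */
   dst[0] = dst[1] = src[1] - src[0];
   dst[2] = dst[3] = src[3] - src[2];
}

static void
ddy_coarse_quad(const float src[4], float dst[4])
{
   /* swz1 = ZZZZ, swz0 = XXXX  ->  dst = src.zzzz - src.xxxx */
   dst[0] = dst[1] = dst[2] = dst[3] = src[2] - src[0];
}
#endif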
8017
8018
void
8019
fs_visitor::dump_instructions() const
8020
{
8021
dump_instructions(NULL);
8022
}
8023
8024
void
8025
fs_visitor::dump_instructions(const char *name) const
8026
{
8027
FILE *file = stderr;
8028
if (name && geteuid() != 0) {
8029
file = fopen(name, "w");
8030
if (!file)
8031
file = stderr;
8032
}
8033
8034
if (cfg) {
8035
const register_pressure &rp = regpressure_analysis.require();
8036
unsigned ip = 0, max_pressure = 0;
8037
foreach_block_and_inst(block, backend_instruction, inst, cfg) {
8038
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
8039
fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
8040
dump_instruction(inst, file);
8041
ip++;
8042
}
8043
fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
8044
} else {
8045
int ip = 0;
8046
foreach_in_list(backend_instruction, inst, &instructions) {
8047
fprintf(file, "%4d: ", ip++);
8048
dump_instruction(inst, file);
8049
}
8050
}
8051
8052
if (file != stderr) {
8053
fclose(file);
8054
}
8055
}
8056
8057
void
8058
fs_visitor::dump_instruction(const backend_instruction *be_inst) const
8059
{
8060
dump_instruction(be_inst, stderr);
8061
}
8062
8063
void
8064
fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const
8065
{
8066
const fs_inst *inst = (const fs_inst *)be_inst;
8067
8068
if (inst->predicate) {
8069
fprintf(file, "(%cf%d.%d) ",
8070
inst->predicate_inverse ? '-' : '+',
8071
inst->flag_subreg / 2,
8072
inst->flag_subreg % 2);
8073
}
8074
8075
fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
8076
if (inst->saturate)
8077
fprintf(file, ".sat");
8078
if (inst->conditional_mod) {
8079
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
8080
if (!inst->predicate &&
8081
(devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
8082
inst->opcode != BRW_OPCODE_CSEL &&
8083
inst->opcode != BRW_OPCODE_IF &&
8084
inst->opcode != BRW_OPCODE_WHILE))) {
8085
fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
8086
inst->flag_subreg % 2);
8087
}
8088
}
8089
fprintf(file, "(%d) ", inst->exec_size);
8090
8091
if (inst->mlen) {
8092
fprintf(file, "(mlen: %d) ", inst->mlen);
8093
}
8094
8095
if (inst->ex_mlen) {
8096
fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
8097
}
8098
8099
if (inst->eot) {
8100
fprintf(file, "(EOT) ");
8101
}
8102
8103
switch (inst->dst.file) {
8104
case VGRF:
8105
fprintf(file, "vgrf%d", inst->dst.nr);
8106
break;
8107
case FIXED_GRF:
8108
fprintf(file, "g%d", inst->dst.nr);
8109
break;
8110
case MRF:
8111
fprintf(file, "m%d", inst->dst.nr);
8112
break;
8113
case BAD_FILE:
8114
fprintf(file, "(null)");
8115
break;
8116
case UNIFORM:
8117
fprintf(file, "***u%d***", inst->dst.nr);
8118
break;
8119
case ATTR:
8120
fprintf(file, "***attr%d***", inst->dst.nr);
8121
break;
8122
case ARF:
8123
switch (inst->dst.nr) {
8124
case BRW_ARF_NULL:
8125
fprintf(file, "null");
8126
break;
8127
case BRW_ARF_ADDRESS:
8128
fprintf(file, "a0.%d", inst->dst.subnr);
8129
break;
8130
case BRW_ARF_ACCUMULATOR:
8131
fprintf(file, "acc%d", inst->dst.subnr);
8132
break;
8133
case BRW_ARF_FLAG:
8134
fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8135
break;
8136
default:
8137
fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8138
break;
8139
}
8140
break;
8141
case IMM:
8142
unreachable("not reached");
8143
}
8144
8145
if (inst->dst.offset ||
8146
(inst->dst.file == VGRF &&
8147
alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
8148
const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
8149
fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
8150
inst->dst.offset % reg_size);
8151
}
8152
8153
if (inst->dst.stride != 1)
8154
fprintf(file, "<%u>", inst->dst.stride);
8155
fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
8156
8157
for (int i = 0; i < inst->sources; i++) {
8158
if (inst->src[i].negate)
8159
fprintf(file, "-");
8160
if (inst->src[i].abs)
8161
fprintf(file, "|");
8162
switch (inst->src[i].file) {
8163
case VGRF:
8164
fprintf(file, "vgrf%d", inst->src[i].nr);
8165
break;
8166
case FIXED_GRF:
8167
fprintf(file, "g%d", inst->src[i].nr);
8168
break;
8169
case MRF:
8170
fprintf(file, "***m%d***", inst->src[i].nr);
8171
break;
8172
case ATTR:
8173
fprintf(file, "attr%d", inst->src[i].nr);
8174
break;
8175
case UNIFORM:
8176
fprintf(file, "u%d", inst->src[i].nr);
8177
break;
8178
case BAD_FILE:
8179
fprintf(file, "(null)");
8180
break;
8181
case IMM:
8182
switch (inst->src[i].type) {
8183
case BRW_REGISTER_TYPE_HF:
8184
fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
8185
break;
8186
case BRW_REGISTER_TYPE_F:
8187
fprintf(file, "%-gf", inst->src[i].f);
8188
break;
8189
case BRW_REGISTER_TYPE_DF:
8190
fprintf(file, "%fdf", inst->src[i].df);
8191
break;
8192
case BRW_REGISTER_TYPE_W:
8193
case BRW_REGISTER_TYPE_D:
8194
fprintf(file, "%dd", inst->src[i].d);
8195
break;
8196
case BRW_REGISTER_TYPE_UW:
8197
case BRW_REGISTER_TYPE_UD:
8198
fprintf(file, "%uu", inst->src[i].ud);
8199
break;
8200
case BRW_REGISTER_TYPE_Q:
8201
fprintf(file, "%" PRId64 "q", inst->src[i].d64);
8202
break;
8203
case BRW_REGISTER_TYPE_UQ:
8204
fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
8205
break;
8206
case BRW_REGISTER_TYPE_VF:
8207
fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
8208
brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
8209
brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
8210
brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
8211
brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
8212
break;
8213
case BRW_REGISTER_TYPE_V:
8214
case BRW_REGISTER_TYPE_UV:
8215
fprintf(file, "%08x%s", inst->src[i].ud,
8216
inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
8217
break;
8218
default:
8219
fprintf(file, "???");
8220
break;
8221
}
8222
break;
8223
case ARF:
8224
switch (inst->src[i].nr) {
8225
case BRW_ARF_NULL:
8226
fprintf(file, "null");
8227
break;
8228
case BRW_ARF_ADDRESS:
8229
fprintf(file, "a0.%d", inst->src[i].subnr);
8230
break;
8231
case BRW_ARF_ACCUMULATOR:
8232
fprintf(file, "acc%d", inst->src[i].subnr);
8233
break;
8234
case BRW_ARF_FLAG:
8235
fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8236
break;
8237
default:
8238
fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8239
break;
8240
}
8241
break;
8242
}
8243
8244
if (inst->src[i].offset ||
8245
(inst->src[i].file == VGRF &&
8246
alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
8247
const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
8248
fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
8249
inst->src[i].offset % reg_size);
8250
}
8251
8252
if (inst->src[i].abs)
8253
fprintf(file, "|");
8254
8255
if (inst->src[i].file != IMM) {
8256
unsigned stride;
8257
if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
8258
unsigned hstride = inst->src[i].hstride;
8259
stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
8260
} else {
8261
stride = inst->src[i].stride;
8262
}
8263
if (stride != 1)
8264
fprintf(file, "<%u>", stride);
8265
8266
fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
8267
}
8268
8269
if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
8270
fprintf(file, ", ");
8271
}
8272
8273
fprintf(file, " ");
8274
8275
if (inst->force_writemask_all)
8276
fprintf(file, "NoMask ");
8277
8278
if (inst->exec_size != dispatch_width)
8279
fprintf(file, "group%d ", inst->group);
8280
8281
fprintf(file, "\n");
8282
}
8283
8284
void
8285
fs_visitor::setup_fs_payload_gfx6()
8286
{
8287
assert(stage == MESA_SHADER_FRAGMENT);
8288
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
8289
const unsigned payload_width = MIN2(16, dispatch_width);
8290
assert(dispatch_width % payload_width == 0);
8291
assert(devinfo->ver >= 6);
8292
8293
/* R0: PS thread payload header. */
8294
payload.num_regs++;
8295
8296
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8297
/* R1: masks, pixel X/Y coordinates. */
8298
payload.subspan_coord_reg[j] = payload.num_regs++;
8299
}
8300
8301
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8302
/* R3-26: barycentric interpolation coordinates. These appear in the
8303
* same order that they appear in the brw_barycentric_mode enum. Each
8304
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
8305
* registers if dispatch width == 16. Coordinates only appear if they
8306
* were enabled using the "Barycentric Interpolation Mode" bits in
8307
* WM_STATE.
8308
*/
8309
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
8310
if (prog_data->barycentric_interp_modes & (1 << i)) {
8311
payload.barycentric_coord_reg[i][j] = payload.num_regs;
8312
payload.num_regs += payload_width / 4;
8313
}
8314
}
8315
8316
/* R27-28: interpolated depth if uses source depth */
8317
if (prog_data->uses_src_depth) {
8318
payload.source_depth_reg[j] = payload.num_regs;
8319
payload.num_regs += payload_width / 8;
8320
}
8321
8322
/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
8323
if (prog_data->uses_src_w) {
8324
payload.source_w_reg[j] = payload.num_regs;
8325
payload.num_regs += payload_width / 8;
8326
}
8327
8328
/* R31: MSAA position offsets. */
8329
if (prog_data->uses_pos_offset) {
8330
payload.sample_pos_reg[j] = payload.num_regs;
8331
payload.num_regs++;
8332
}
8333
8334
/* R32-33: MSAA input coverage mask */
8335
if (prog_data->uses_sample_mask) {
8336
assert(devinfo->ver >= 7);
8337
payload.sample_mask_in_reg[j] = payload.num_regs;
8338
payload.num_regs += payload_width / 8;
8339
}
8340
8341
/* R66: Source Depth and/or W Attribute Vertex Deltas */
8342
if (prog_data->uses_depth_w_coefficients) {
8343
payload.depth_w_coef_reg[j] = payload.num_regs;
8344
payload.num_regs++;
8345
}
8346
}
8347
8348
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
8349
source_depth_to_render_target = true;
8350
}
8351
}
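/* Illustrative example (not part of the original file): for a SIMD16
* fragment shader that only needs one barycentric mode (say perspective
* pixel) plus source depth, the loop above lays the payload out as
*
*    g0      PS thread payload header
*    g1      masks, pixel X/Y coordinates
*    g2-g5   barycentric coordinates   (payload_width / 4 == 4 GRFs)
*    g6-g7   interpolated source depth (payload_width / 8 == 2 GRFs)
*
* leaving payload.num_regs == 8.
*/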
8352
8353
void
8354
fs_visitor::setup_vs_payload()
8355
{
8356
/* R0: thread header, R1: urb handles */
8357
payload.num_regs = 2;
8358
}
8359
8360
void
8361
fs_visitor::setup_gs_payload()
8362
{
8363
assert(stage == MESA_SHADER_GEOMETRY);
8364
8365
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
8366
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
8367
8368
/* R0: thread header, R1: output URB handles */
8369
payload.num_regs = 2;
8370
8371
if (gs_prog_data->include_primitive_id) {
8372
/* R2: Primitive ID 0..7 */
8373
payload.num_regs++;
8374
}
8375
8376
/* Always enable VUE handles so we can safely use pull model if needed.
8377
*
8378
* The push model for a GS uses a ton of register space even for trivial
8379
* scenarios with just a few inputs, so just make things easier and a bit
8380
* safer by always having pull model available.
8381
*/
8382
gs_prog_data->base.include_vue_handles = true;
8383
8384
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
8385
payload.num_regs += nir->info.gs.vertices_in;
8386
8387
/* Use a maximum of 24 registers for push-model inputs. */
8388
const unsigned max_push_components = 24;
8389
8390
/* If pushing our inputs would take too many registers, reduce the URB read
8391
* length (which is in HWords, or 8 registers), and resort to pulling.
8392
*
8393
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
8394
* have to multiply by VerticesIn to obtain the total storage requirement.
8395
*/
8396
if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
8397
max_push_components) {
8398
vue_prog_data->urb_read_length =
8399
ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
8400
}
8401
}
8402
8403
void
8404
fs_visitor::setup_cs_payload()
8405
{
8406
assert(devinfo->ver >= 7);
8407
/* TODO: Fill out uses_btd_stack_ids automatically */
8408
payload.num_regs = 1 + brw_cs_prog_data(prog_data)->uses_btd_stack_ids;
8409
}
8410
8411
brw::register_pressure::register_pressure(const fs_visitor *v)
8412
{
8413
const fs_live_variables &live = v->live_analysis.require();
8414
const unsigned num_instructions = v->cfg->num_blocks ?
8415
v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
8416
8417
regs_live_at_ip = new unsigned[num_instructions]();
8418
8419
for (unsigned reg = 0; reg < v->alloc.count; reg++) {
8420
for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
8421
regs_live_at_ip[ip] += v->alloc.sizes[reg];
8422
}
8423
}
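/* Illustrative example (not part of the original file): a VGRF allocated as
* two GRFs whose live range spans ip 3 through ip 7 adds 2 to
* regs_live_at_ip[3..7], so each entry ends up holding the total number of
* GRFs live at that instruction.
*/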
8424
8425
brw::register_pressure::~register_pressure()
8426
{
8427
delete[] regs_live_at_ip;
8428
}
8429
8430
void
8431
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
8432
{
8433
backend_shader::invalidate_analysis(c);
8434
live_analysis.invalidate(c);
8435
regpressure_analysis.invalidate(c);
8436
}
8437
8438
void
8439
fs_visitor::optimize()
8440
{
8441
/* Start by validating the shader we currently have. */
8442
validate();
8443
8444
/* bld is the common builder object pointing at the end of the program we
8445
* used to translate it into i965 IR. For the optimization and lowering
8446
* passes coming next, any code added after the end of the program without
8447
* having explicitly called fs_builder::at() clearly points at a mistake.
8448
* Ideally optimization passes wouldn't be part of the visitor so they
8449
* wouldn't have access to bld at all, but they do, so just in case some
8450
* pass forgets to ask for a location explicitly set it to NULL here to
8451
* make it trip. The dispatch width is initialized to a bogus value to
8452
* make sure that optimizations set the execution controls explicitly to
8453
* match the code they are manipulating instead of relying on the defaults.
8454
*/
8455
bld = fs_builder(this, 64);
8456
8457
assign_constant_locations();
8458
lower_constant_loads();
8459
8460
validate();
8461
8462
split_virtual_grfs();
8463
validate();
8464
8465
#define OPT(pass, args...) ({ \
8466
pass_num++; \
8467
bool this_progress = pass(args); \
8468
\
8469
if ((INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
8470
char filename[64]; \
8471
snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
8472
stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
8473
\
8474
backend_shader::dump_instructions(filename); \
8475
} \
8476
\
8477
validate(); \
8478
\
8479
progress = progress || this_progress; \
8480
this_progress; \
8481
})
8482
8483
if (INTEL_DEBUG & DEBUG_OPTIMIZER) {
8484
char filename[64];
8485
snprintf(filename, 64, "%s%d-%s-00-00-start",
8486
stage_abbrev, dispatch_width, nir->info.name);
8487
8488
backend_shader::dump_instructions(filename);
8489
}
8490
8491
bool progress = false;
8492
int iteration = 0;
8493
int pass_num = 0;
8494
8495
/* Before anything else, eliminate dead code. The results of some NIR
8496
* instructions may effectively be calculated twice. Once when the
8497
* instruction is encountered, and again when the user of that result is
8498
* encountered. Wipe those away before algebraic optimizations and
8499
* especially copy propagation can mix things up.
8500
*/
8501
OPT(dead_code_eliminate);
8502
8503
OPT(remove_extra_rounding_modes);
8504
8505
do {
8506
progress = false;
8507
pass_num = 0;
8508
iteration++;
8509
8510
OPT(remove_duplicate_mrf_writes);
8511
8512
OPT(opt_algebraic);
8513
OPT(opt_cse);
8514
OPT(opt_copy_propagation);
8515
OPT(opt_predicated_break, this);
8516
OPT(opt_cmod_propagation);
8517
OPT(dead_code_eliminate);
8518
OPT(opt_peephole_sel);
8519
OPT(dead_control_flow_eliminate, this);
8520
OPT(opt_register_renaming);
8521
OPT(opt_saturate_propagation);
8522
OPT(register_coalesce);
8523
OPT(compute_to_mrf);
8524
OPT(eliminate_find_live_channel);
8525
8526
OPT(compact_virtual_grfs);
8527
} while (progress);
8528
8529
progress = false;
8530
pass_num = 0;
8531
8532
if (OPT(lower_pack)) {
8533
OPT(register_coalesce);
8534
OPT(dead_code_eliminate);
8535
}
8536
8537
OPT(lower_simd_width);
8538
OPT(lower_barycentrics);
8539
OPT(lower_logical_sends);
8540
8541
/* After logical SEND lowering. */
8542
OPT(fixup_nomask_control_flow);
8543
8544
if (progress) {
8545
OPT(opt_copy_propagation);
8546
/* Only run after logical send lowering because it's easier to implement
8547
* in terms of physical sends.
8548
*/
8549
if (OPT(opt_zero_samples))
8550
OPT(opt_copy_propagation);
8551
/* Run after logical send lowering to give it a chance to CSE the
8552
* LOAD_PAYLOAD instructions created to construct the payloads of
8553
* e.g. texturing messages in cases where it wasn't possible to CSE the
8554
* whole logical instruction.
8555
*/
8556
OPT(opt_cse);
8557
OPT(register_coalesce);
8558
OPT(compute_to_mrf);
8559
OPT(dead_code_eliminate);
8560
OPT(remove_duplicate_mrf_writes);
8561
OPT(opt_peephole_sel);
8562
}
8563
8564
OPT(opt_redundant_halt);
8565
8566
if (OPT(lower_load_payload)) {
8567
split_virtual_grfs();
8568
8569
/* Lower 64 bit MOVs generated by payload lowering. */
8570
if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)
8571
OPT(opt_algebraic);
8572
8573
OPT(register_coalesce);
8574
OPT(lower_simd_width);
8575
OPT(compute_to_mrf);
8576
OPT(dead_code_eliminate);
8577
}
8578
8579
OPT(opt_combine_constants);
8580
if (OPT(lower_integer_multiplication)) {
8581
/* If lower_integer_multiplication made progress, it may have produced
8582
* some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
8583
* one more time to clean those up if they exist.
8584
*/
8585
OPT(lower_integer_multiplication);
8586
}
8587
OPT(lower_sub_sat);
8588
8589
if (devinfo->ver <= 5 && OPT(lower_minmax)) {
8590
OPT(opt_cmod_propagation);
8591
OPT(opt_cse);
8592
OPT(opt_copy_propagation);
8593
OPT(dead_code_eliminate);
8594
}
8595
8596
progress = false;
8597
OPT(lower_derivatives);
8598
OPT(lower_regioning);
8599
if (progress) {
8600
OPT(opt_copy_propagation);
8601
OPT(dead_code_eliminate);
8602
OPT(lower_simd_width);
8603
}
8604
8605
OPT(fixup_sends_duplicate_payload);
8606
8607
lower_uniform_pull_constant_loads();
8608
8609
validate();
8610
}
8611
8612
/**
8613
* From the Skylake PRM Vol. 2a docs for sends:
8614
*
8615
* "It is required that the second block of GRFs does not overlap with the
8616
* first block."
8617
*
8618
* There are plenty of cases where we may accidentally violate this due to
8619
* having, for instance, both sources be the constant 0. This little pass
8620
* just adds a new vgrf for the second payload and copies it over.
8621
*/
8622
bool
8623
fs_visitor::fixup_sends_duplicate_payload()
8624
{
8625
bool progress = false;
8626
8627
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8628
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
8629
regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
8630
inst->src[3], inst->ex_mlen * REG_SIZE)) {
8631
fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
8632
BRW_REGISTER_TYPE_UD);
8633
/* Sadly, we've lost all notion of channels and bit sizes at this
8634
* point. Just WE_all it.
8635
*/
8636
const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
8637
fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
8638
fs_reg copy_dst = tmp;
8639
for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
8640
if (inst->ex_mlen == i + 1) {
8641
/* Only one register left; do SIMD8 */
8642
ibld.group(8, 0).MOV(copy_dst, copy_src);
8643
} else {
8644
ibld.MOV(copy_dst, copy_src);
8645
}
8646
copy_src = offset(copy_src, ibld, 1);
8647
copy_dst = offset(copy_dst, ibld, 1);
8648
}
8649
inst->src[3] = tmp;
8650
progress = true;
8651
}
8652
}
8653
8654
if (progress)
8655
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8656
8657
return progress;
8658
}
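/* Illustrative sketch of the fixup above: a SEND whose two payload blocks
* both start at the same VGRF (say mlen == ex_mlen == 2, both pointing at
* g10) gets a fresh ex_mlen-register temporary, the second payload is copied
* into it with NoMask SIMD16 UD MOVs (each MOV covering two GRFs), and
* src[3] is redirected to the copy so the two blocks no longer overlap.
*/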
8659
8660
/**
8661
* Three-source instructions must have a GRF/MRF destination register.
8662
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
8663
*/
8664
void
8665
fs_visitor::fixup_3src_null_dest()
8666
{
8667
bool progress = false;
8668
8669
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8670
if (inst->is_3src(devinfo) && inst->dst.is_null()) {
8671
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
8672
inst->dst.type);
8673
progress = true;
8674
}
8675
}
8676
8677
if (progress)
8678
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
8679
DEPENDENCY_VARIABLES);
8680
}
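/* Illustrative example: in a SIMD16 shader, a three-source instruction that
* ended up with a null destination gets a throwaway VGRF of
* dispatch_width / 8 == 2 GRFs here, satisfying the GRF destination
* requirement.
*/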
8681
8682
/**
8683
* Find the first instruction in the program that might start a region of
8684
* divergent control flow due to a HALT jump. There is no
8685
* find_halt_control_flow_region_end(); the region of divergence extends until
8686
* the only SHADER_OPCODE_HALT_TARGET in the program.
8687
*/
8688
static const fs_inst *
8689
find_halt_control_flow_region_start(const fs_visitor *v)
8690
{
8691
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
8692
if (inst->opcode == BRW_OPCODE_HALT ||
8693
inst->opcode == SHADER_OPCODE_HALT_TARGET)
8694
return inst;
8695
}
8696
8697
return NULL;
8698
}
8699
8700
/**
8701
* Work around the Gfx12 hardware bug filed as Wa_1407528679. EU fusion
8702
* can cause a BB to be executed with all channels disabled, which will lead
8703
* to the execution of any NoMask instructions in it, even though any
8704
* execution-masked instructions will be correctly shot down. This may break
8705
* assumptions of some NoMask SEND messages whose descriptor depends on data
8706
* generated by live invocations of the shader.
8707
*
8708
* This avoids the problem by predicating certain instructions on an ANY
8709
* horizontal predicate that makes sure that their execution is omitted when
8710
* all channels of the program are disabled.
8711
*/
8712
bool
8713
fs_visitor::fixup_nomask_control_flow()
8714
{
8715
if (devinfo->ver != 12)
8716
return false;
8717
8718
const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
8719
dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
8720
BRW_PREDICATE_ALIGN1_ANY8H;
8721
const fs_inst *halt_start = find_halt_control_flow_region_start(this);
8722
unsigned depth = 0;
8723
bool progress = false;
8724
8725
const fs_live_variables &live_vars = live_analysis.require();
8726
8727
/* Scan the program backwards in order to be able to easily determine
8728
* whether the flag register is live at any point.
8729
*/
8730
foreach_block_reverse_safe(block, cfg) {
8731
BITSET_WORD flag_liveout = live_vars.block_data[block->num]
8732
.flag_liveout[0];
8733
STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
8734
8735
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
8736
if (!inst->predicate && inst->exec_size >= 8)
8737
flag_liveout &= ~inst->flags_written(devinfo);
8738
8739
switch (inst->opcode) {
8740
case BRW_OPCODE_DO:
8741
case BRW_OPCODE_IF:
8742
/* Note that this doesn't handle BRW_OPCODE_HALT since only
8743
* the first one in the program closes the region of divergent
8744
* control flow due to any HALT instructions. Instead, this is
8745
* handled with the halt_start check below.
8746
*/
8747
depth--;
8748
break;
8749
8750
case BRW_OPCODE_WHILE:
8751
case BRW_OPCODE_ENDIF:
8752
case SHADER_OPCODE_HALT_TARGET:
8753
depth++;
8754
break;
8755
8756
default:
8757
/* Note that the vast majority of NoMask SEND instructions in the
8758
* program are harmless while executed in a block with all
8759
* channels disabled, since any instructions with side effects we
8760
* could hit here should be execution-masked.
8761
*
8762
* The main concern is NoMask SEND instructions where the message
8763
* descriptor or header depends on data generated by live
8764
* invocations of the shader (RESINFO and
8765
* FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
8766
* computed surface index seem to be the only examples right now
8767
* where this could easily lead to GPU hangs). Unfortunately we
8768
* have no straightforward way to detect that currently, so just
8769
* predicate any NoMask SEND instructions we find under control
8770
* flow.
8771
*
8772
* If this proves to have a measurable performance impact it can
8773
* be easily extended with a whitelist of messages we know we can
8774
* safely omit the predication for.
8775
*/
8776
if (depth && inst->force_writemask_all &&
8777
is_send(inst) && !inst->predicate) {
8778
/* We need to load the execution mask into the flag register by
8779
* using a builder with channel group matching the whole shader
8780
* (rather than the default which is derived from the original
8781
* instruction), in order to avoid getting a right-shifted
8782
* value.
8783
*/
8784
const fs_builder ubld = fs_builder(this, block, inst)
8785
.exec_all().group(dispatch_width, 0);
8786
const fs_reg flag = retype(brw_flag_reg(0, 0),
8787
BRW_REGISTER_TYPE_UD);
8788
8789
/* Due to the lack of flag register allocation we need to save
8790
* and restore the flag register if it's live.
8791
*/
8792
const bool save_flag = flag_liveout &
8793
flag_mask(flag, dispatch_width / 8);
8794
const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
8795
8796
if (save_flag)
8797
ubld.group(1, 0).MOV(tmp, flag);
8798
8799
ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
8800
8801
set_predicate(pred, inst);
8802
inst->flag_subreg = 0;
8803
8804
if (save_flag)
8805
ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
8806
8807
progress = true;
8808
}
8809
break;
8810
}
8811
8812
if (inst == halt_start)
8813
depth--;
8814
8815
flag_liveout |= inst->flags_read(devinfo);
8816
}
8817
}
8818
8819
if (progress)
8820
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8821
8822
return progress;
8823
}
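/* Illustrative sketch of the workaround above (pseudo-assembly, SIMD16,
* register numbers made up): a NoMask SEND found under divergent control
* flow ends up as
*
*    FS_OPCODE_LOAD_LIVE_CHANNELS                NoMask
*    (+f0.0.any16h) send(16) g120 ...            NoMask
*
* optionally bracketed by MOVs that save and restore f0.0 when the flag is
* live across this point.
*/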
8824
8825
void
8826
fs_visitor::allocate_registers(bool allow_spilling)
8827
{
8828
bool allocated;
8829
8830
static const enum instruction_scheduler_mode pre_modes[] = {
8831
SCHEDULE_PRE,
8832
SCHEDULE_PRE_NON_LIFO,
8833
SCHEDULE_PRE_LIFO,
8834
};
8835
8836
static const char *scheduler_mode_name[] = {
8837
"top-down",
8838
"non-lifo",
8839
"lifo"
8840
};
8841
8842
bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
8843
8844
/* Try each scheduling heuristic to see if it can successfully register
8845
* allocate without spilling. They should be ordered by decreasing
8846
* performance but increasing likelihood of allocating.
8847
*/
8848
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
8849
schedule_instructions(pre_modes[i]);
8850
this->shader_stats.scheduler_mode = scheduler_mode_name[i];
8851
8852
if (0) {
8853
assign_regs_trivial();
8854
allocated = true;
8855
break;
8856
}
8857
8858
/* Scheduling may create additional opportunities for CMOD propagation,
8859
* so let's do it again. If CMOD propagation made any progress,
8860
* eliminate dead code one more time.
8861
*/
8862
bool progress = false;
8863
const int iteration = 99;
8864
int pass_num = 0;
8865
8866
if (OPT(opt_cmod_propagation)) {
8867
/* dead_code_eliminate "undoes" the fixing done by
8868
* fixup_3src_null_dest, so we have to do it again if
8869
* dead_code_eliminate makes any progress.
8870
*/
8871
if (OPT(dead_code_eliminate))
8872
fixup_3src_null_dest();
8873
}
8874
8875
bool can_spill = allow_spilling &&
8876
(i == ARRAY_SIZE(pre_modes) - 1);
8877
8878
/* We should only spill registers on the last scheduling. */
8879
assert(!spilled_any_registers);
8880
8881
allocated = assign_regs(can_spill, spill_all);
8882
if (allocated)
8883
break;
8884
}
8885
8886
if (!allocated) {
8887
fail("Failure to register allocate. Reduce number of "
8888
"live scalar values to avoid this.");
8889
} else if (spilled_any_registers) {
8890
compiler->shader_perf_log(log_data,
8891
"%s shader triggered register spilling. "
8892
"Try reducing the number of live scalar "
8893
"values to improve performance.\n",
8894
stage_name);
8895
}
8896
8897
/* This must come after all optimization and register allocation, since
8898
* it inserts dead code that happens to have side effects, and it does
8899
* so based on the actual physical registers in use.
8900
*/
8901
insert_gfx4_send_dependency_workarounds();
8902
8903
if (failed)
8904
return;
8905
8906
opt_bank_conflicts();
8907
8908
schedule_instructions(SCHEDULE_POST);
8909
8910
if (last_scratch > 0) {
8911
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
8912
8913
prog_data->total_scratch = brw_get_scratch_size(last_scratch);
8914
8915
if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
8916
if (devinfo->is_haswell) {
8917
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8918
* field documentation, Haswell supports a minimum of 2kB of
8919
* scratch space for compute shaders, unlike every other stage
8920
* and platform.
8921
*/
8922
prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
8923
} else if (devinfo->ver <= 7) {
8924
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8925
* field documentation, platforms prior to Haswell measure scratch
8926
* size linearly with a range of [1kB, 12kB] and 1kB granularity.
8927
*/
8928
prog_data->total_scratch = ALIGN(last_scratch, 1024);
8929
max_scratch_size = 12 * 1024;
8930
}
8931
}
8932
8933
/* We currently only support up to 2MB of scratch space. If we
8934
* need to support more eventually, the documentation suggests
8935
* that we could allocate a larger buffer, and partition it out
8936
* ourselves. We'd just have to undo the hardware's address
8937
* calculation by subtracting (FFTID * Per Thread Scratch Space)
8938
* and then adding FFTID * (Larger Per Thread Scratch Space).
8939
*
8940
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
8941
* Thread Group Tracking > Local Memory/Scratch Space.
8942
*/
8943
assert(prog_data->total_scratch < max_scratch_size);
8944
}
8945
8946
lower_scoreboard();
8947
}
8948
8949
bool
8950
fs_visitor::run_vs()
8951
{
8952
assert(stage == MESA_SHADER_VERTEX);
8953
8954
setup_vs_payload();
8955
8956
if (shader_time_index >= 0)
8957
emit_shader_time_begin();
8958
8959
emit_nir_code();
8960
8961
if (failed)
8962
return false;
8963
8964
emit_urb_writes();
8965
8966
if (shader_time_index >= 0)
8967
emit_shader_time_end();
8968
8969
calculate_cfg();
8970
8971
optimize();
8972
8973
assign_curb_setup();
8974
assign_vs_urb_setup();
8975
8976
fixup_3src_null_dest();
8977
allocate_registers(true /* allow_spilling */);
8978
8979
return !failed;
8980
}
8981
8982
void
8983
fs_visitor::set_tcs_invocation_id()
8984
{
8985
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
8986
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
8987
8988
const unsigned instance_id_mask =
8989
devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
8990
const unsigned instance_id_shift =
8991
devinfo->ver >= 11 ? 16 : 17;
8992
8993
/* Get instance number from g0.2 bits 22:16 or 23:17 */
8994
fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
8995
bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
8996
brw_imm_ud(instance_id_mask));
8997
8998
invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
8999
9000
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
9001
/* gl_InvocationID is just the thread number */
9002
bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
9003
return;
9004
}
9005
9006
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
9007
9008
fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
9009
fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
9010
bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
9011
bld.MOV(channels_ud, channels_uw);
9012
9013
if (tcs_prog_data->instances == 1) {
9014
invocation_id = channels_ud;
9015
} else {
9016
fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
9017
bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
9018
bld.ADD(invocation_id, instance_times_8, channels_ud);
9019
}
9020
}
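/* Worked example (illustrative): in SINGLE_PATCH mode with more than one
* instance, the code above yields gl_InvocationID == instance * 8 + channel,
* so instance 2 covers invocations 16..23 across its eight channels.
*/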
9021
9022
bool
9023
fs_visitor::run_tcs()
9024
{
9025
assert(stage == MESA_SHADER_TESS_CTRL);
9026
9027
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
9028
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
9029
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
9030
9031
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
9032
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9033
9034
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
9035
/* r1-r4 contain the ICP handles. */
9036
payload.num_regs = 5;
9037
} else {
9038
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9039
assert(tcs_key->input_vertices > 0);
9040
/* r1 contains output handles, r2 may contain primitive ID, then the
9041
* ICP handles occupy the next 1-32 registers.
9042
*/
9043
payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
9044
tcs_key->input_vertices;
9045
}
9046
9047
if (shader_time_index >= 0)
9048
emit_shader_time_begin();
9049
9050
/* Initialize gl_InvocationID */
9051
set_tcs_invocation_id();
9052
9053
const bool fix_dispatch_mask =
9054
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
9055
(nir->info.tess.tcs_vertices_out % 8) != 0;
9056
9057
/* Fix the dispatch mask */
9058
if (fix_dispatch_mask) {
9059
bld.CMP(bld.null_reg_ud(), invocation_id,
9060
brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
9061
bld.IF(BRW_PREDICATE_NORMAL);
9062
}
9063
9064
emit_nir_code();
9065
9066
if (fix_dispatch_mask) {
9067
bld.emit(BRW_OPCODE_ENDIF);
9068
}
9069
9070
/* Emit EOT write; set TR DS Cache bit */
9071
fs_reg srcs[3] = {
9072
fs_reg(get_tcs_output_urb_handle()),
9073
fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
9074
fs_reg(brw_imm_ud(0)),
9075
};
9076
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
9077
bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
9078
9079
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
9080
bld.null_reg_ud(), payload);
9081
inst->mlen = 3;
9082
inst->eot = true;
9083
9084
if (shader_time_index >= 0)
9085
emit_shader_time_end();
9086
9087
if (failed)
9088
return false;
9089
9090
calculate_cfg();
9091
9092
optimize();
9093
9094
assign_curb_setup();
9095
assign_tcs_urb_setup();
9096
9097
fixup_3src_null_dest();
9098
allocate_registers(true /* allow_spilling */);
9099
9100
return !failed;
9101
}
9102
9103
bool
9104
fs_visitor::run_tes()
9105
{
9106
assert(stage == MESA_SHADER_TESS_EVAL);
9107
9108
/* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
9109
payload.num_regs = 5;
9110
9111
if (shader_time_index >= 0)
9112
emit_shader_time_begin();
9113
9114
emit_nir_code();
9115
9116
if (failed)
9117
return false;
9118
9119
emit_urb_writes();
9120
9121
if (shader_time_index >= 0)
9122
emit_shader_time_end();
9123
9124
calculate_cfg();
9125
9126
optimize();
9127
9128
assign_curb_setup();
9129
assign_tes_urb_setup();
9130
9131
fixup_3src_null_dest();
9132
allocate_registers(true /* allow_spilling */);
9133
9134
return !failed;
9135
}
9136
9137
bool
9138
fs_visitor::run_gs()
9139
{
9140
assert(stage == MESA_SHADER_GEOMETRY);
9141
9142
setup_gs_payload();
9143
9144
this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
9145
9146
if (gs_compile->control_data_header_size_bits > 0) {
9147
/* Create a VGRF to store accumulated control data bits. */
9148
this->control_data_bits = vgrf(glsl_type::uint_type);
9149
9150
/* If we're outputting more than 32 control data bits, then EmitVertex()
9151
* will set control_data_bits to 0 after emitting the first vertex.
9152
* Otherwise, we need to initialize it to 0 here.
9153
*/
9154
if (gs_compile->control_data_header_size_bits <= 32) {
9155
const fs_builder abld = bld.annotate("initialize control data bits");
9156
abld.MOV(this->control_data_bits, brw_imm_ud(0u));
9157
}
9158
}
9159
9160
if (shader_time_index >= 0)
9161
emit_shader_time_begin();
9162
9163
emit_nir_code();
9164
9165
emit_gs_thread_end();
9166
9167
if (shader_time_index >= 0)
9168
emit_shader_time_end();
9169
9170
if (failed)
9171
return false;
9172
9173
calculate_cfg();
9174
9175
optimize();
9176
9177
assign_curb_setup();
9178
assign_gs_urb_setup();
9179
9180
fixup_3src_null_dest();
9181
allocate_registers(true /* allow_spilling */);
9182
9183
return !failed;
9184
}
9185
9186
/* From the SKL PRM, Volume 16, Workarounds:
9187
*
9188
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
9189
* only header phases (R0-R2)
9190
*
9191
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
9192
* have been header only.
9193
*
9194
* Instead of enabling push constants, one can alternatively enable one of the
9195
* inputs. Here one simply chooses "layer", which shouldn't impose much
9196
* overhead.
9197
*/
9198
static void
9199
gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
9200
{
9201
if (wm_prog_data->num_varying_inputs)
9202
return;
9203
9204
if (wm_prog_data->base.curb_read_length)
9205
return;
9206
9207
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
9208
wm_prog_data->num_varying_inputs = 1;
9209
9210
brw_compute_urb_setup_index(wm_prog_data);
9211
}
9212
9213
bool
9214
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
9215
{
9216
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
9217
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
9218
9219
assert(stage == MESA_SHADER_FRAGMENT);
9220
9221
if (devinfo->ver >= 6)
9222
setup_fs_payload_gfx6();
9223
else
9224
setup_fs_payload_gfx4();
9225
9226
if (0) {
9227
emit_dummy_fs();
9228
} else if (do_rep_send) {
9229
assert(dispatch_width == 16);
9230
emit_repclear_shader();
9231
} else {
9232
if (shader_time_index >= 0)
9233
emit_shader_time_begin();
9234
9235
if (nir->info.inputs_read > 0 ||
9236
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
9237
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
9238
if (devinfo->ver < 6)
9239
emit_interpolation_setup_gfx4();
9240
else
9241
emit_interpolation_setup_gfx6();
9242
}
9243
9244
/* We handle discards by keeping track of the still-live pixels in f0.1.
9245
* Initialize it with the dispatched pixels.
9246
*/
9247
if (wm_prog_data->uses_kill) {
9248
const unsigned lower_width = MIN2(dispatch_width, 16);
9249
for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
9250
const fs_reg dispatch_mask =
9251
devinfo->ver >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
9252
brw_vec1_grf(0, 0);
9253
bld.exec_all().group(1, 0)
9254
.MOV(sample_mask_reg(bld.group(lower_width, i)),
9255
retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
9256
}
9257
}
9258
9259
if (nir->info.writes_memory)
9260
wm_prog_data->has_side_effects = true;
9261
9262
emit_nir_code();
9263
9264
if (failed)
9265
return false;
9266
9267
if (wm_key->alpha_test_func)
9268
emit_alpha_test();
9269
9270
emit_fb_writes();
9271
9272
if (shader_time_index >= 0)
9273
emit_shader_time_end();
9274
9275
calculate_cfg();
9276
9277
optimize();
9278
9279
assign_curb_setup();
9280
9281
if (devinfo->ver >= 9)
9282
gfx9_ps_header_only_workaround(wm_prog_data);
9283
9284
assign_urb_setup();
9285
9286
fixup_3src_null_dest();
9287
9288
allocate_registers(allow_spilling);
9289
9290
if (failed)
9291
return false;
9292
}
9293
9294
return !failed;
9295
}
9296
9297
bool
9298
fs_visitor::run_cs(bool allow_spilling)
9299
{
9300
assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9301
9302
setup_cs_payload();
9303
9304
if (shader_time_index >= 0)
9305
emit_shader_time_begin();
9306
9307
if (devinfo->is_haswell && prog_data->total_shared > 0) {
9308
/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
9309
const fs_builder abld = bld.exec_all().group(1, 0);
9310
abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
9311
suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
9312
}
9313
9314
emit_nir_code();
9315
9316
if (failed)
9317
return false;
9318
9319
emit_cs_terminate();
9320
9321
if (shader_time_index >= 0)
9322
emit_shader_time_end();
9323
9324
calculate_cfg();
9325
9326
optimize();
9327
9328
assign_curb_setup();
9329
9330
fixup_3src_null_dest();
9331
allocate_registers(allow_spilling);
9332
9333
if (failed)
9334
return false;
9335
9336
return !failed;
9337
}
9338
9339
bool
9340
fs_visitor::run_bs(bool allow_spilling)
9341
{
9342
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
9343
9344
/* R0: thread header, R1: stack IDs, R2: argument addresses */
9345
payload.num_regs = 3;
9346
9347
if (shader_time_index >= 0)
9348
emit_shader_time_begin();
9349
9350
emit_nir_code();
9351
9352
if (failed)
9353
return false;
9354
9355
/* TODO(RT): Perhaps rename this? */
9356
emit_cs_terminate();
9357
9358
if (shader_time_index >= 0)
9359
emit_shader_time_end();
9360
9361
calculate_cfg();
9362
9363
optimize();
9364
9365
assign_curb_setup();
9366
9367
fixup_3src_null_dest();
9368
allocate_registers(allow_spilling);
9369
9370
if (failed)
9371
return false;
9372
9373
return !failed;
9374
}
9375
9376
static bool
9377
is_used_in_not_interp_frag_coord(nir_ssa_def *def)
9378
{
9379
nir_foreach_use(src, def) {
9380
if (src->parent_instr->type != nir_instr_type_intrinsic)
9381
return true;
9382
9383
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr);
9384
if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
9385
return true;
9386
}
9387
9388
nir_foreach_if_use(src, def)
9389
return true;
9390
9391
return false;
9392
}
9393
9394
/**
9395
* Return a bitfield where bit n is set if barycentric interpolation mode n
9396
* (see enum brw_barycentric_mode) is needed by the fragment shader.
9397
*
9398
* We examine the load_barycentric intrinsics rather than looking at input
9399
* variables so that we catch interpolateAtCentroid() messages too, which
9400
* also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
9401
*/
9402
static unsigned
9403
brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
9404
const nir_shader *shader)
9405
{
9406
unsigned barycentric_interp_modes = 0;
9407
9408
nir_foreach_function(f, shader) {
9409
if (!f->impl)
9410
continue;
9411
9412
nir_foreach_block(block, f->impl) {
9413
nir_foreach_instr(instr, block) {
9414
if (instr->type != nir_instr_type_intrinsic)
9415
continue;
9416
9417
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9418
switch (intrin->intrinsic) {
9419
case nir_intrinsic_load_barycentric_pixel:
9420
case nir_intrinsic_load_barycentric_centroid:
9421
case nir_intrinsic_load_barycentric_sample:
9422
break;
9423
default:
9424
continue;
9425
}
9426
9427
/* Ignore WPOS; it doesn't require interpolation. */
9428
assert(intrin->dest.is_ssa);
9429
if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))
9430
continue;
9431
9432
enum glsl_interp_mode interp = (enum glsl_interp_mode)
9433
nir_intrinsic_interp_mode(intrin);
9434
nir_intrinsic_op bary_op = intrin->intrinsic;
9435
enum brw_barycentric_mode bary =
9436
brw_barycentric_mode(interp, bary_op);
9437
9438
barycentric_interp_modes |= 1 << bary;
9439
9440
if (devinfo->needs_unlit_centroid_workaround &&
9441
bary_op == nir_intrinsic_load_barycentric_centroid)
9442
barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
9443
}
9444
}
9445
}
9446
9447
return barycentric_interp_modes;
9448
}
9449
9450
static void
9451
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
9452
const nir_shader *shader)
9453
{
9454
prog_data->flat_inputs = 0;
9455
9456
nir_foreach_shader_in_variable(var, shader) {
9457
unsigned slots = glsl_count_attribute_slots(var->type, false);
9458
for (unsigned s = 0; s < slots; s++) {
9459
int input_index = prog_data->urb_setup[var->data.location + s];
9460
9461
if (input_index < 0)
9462
continue;
9463
9464
/* flat shading */
9465
if (var->data.interpolation == INTERP_MODE_FLAT)
9466
prog_data->flat_inputs |= 1 << input_index;
9467
}
9468
}
9469
}
9470
9471
static uint8_t
9472
computed_depth_mode(const nir_shader *shader)
9473
{
9474
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
9475
switch (shader->info.fs.depth_layout) {
9476
case FRAG_DEPTH_LAYOUT_NONE:
9477
case FRAG_DEPTH_LAYOUT_ANY:
9478
return BRW_PSCDEPTH_ON;
9479
case FRAG_DEPTH_LAYOUT_GREATER:
9480
return BRW_PSCDEPTH_ON_GE;
9481
case FRAG_DEPTH_LAYOUT_LESS:
9482
return BRW_PSCDEPTH_ON_LE;
9483
case FRAG_DEPTH_LAYOUT_UNCHANGED:
9484
return BRW_PSCDEPTH_OFF;
9485
}
9486
}
9487
return BRW_PSCDEPTH_OFF;
9488
}
9489
9490
/**
9491
* Move load_interpolated_input with simple (payload-based) barycentric modes
9492
* to the top of the program so we don't emit multiple PLNs for the same input.
9493
*
9494
* This works around CSE not being able to handle non-dominating cases
9495
* such as:
9496
*
9497
* if (...) {
9498
* interpolate input
9499
* } else {
9500
* interpolate the same exact input
9501
* }
9502
*
9503
* This should be replaced by global value numbering someday.
9504
*/
9505
bool
9506
brw_nir_move_interpolation_to_top(nir_shader *nir)
9507
{
9508
bool progress = false;
9509
9510
nir_foreach_function(f, nir) {
9511
if (!f->impl)
9512
continue;
9513
9514
nir_block *top = nir_start_block(f->impl);
9515
exec_node *cursor_node = NULL;
9516
9517
nir_foreach_block(block, f->impl) {
9518
if (block == top)
9519
continue;
9520
9521
nir_foreach_instr_safe(instr, block) {
9522
if (instr->type != nir_instr_type_intrinsic)
9523
continue;
9524
9525
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9526
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
9527
continue;
9528
nir_intrinsic_instr *bary_intrinsic =
9529
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
9530
nir_intrinsic_op op = bary_intrinsic->intrinsic;
9531
9532
/* Leave interpolateAtSample/Offset() where they are. */
9533
if (op == nir_intrinsic_load_barycentric_at_sample ||
9534
op == nir_intrinsic_load_barycentric_at_offset)
9535
continue;
9536
9537
nir_instr *move[3] = {
9538
&bary_intrinsic->instr,
9539
intrin->src[1].ssa->parent_instr,
9540
instr
9541
};
9542
9543
for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
9544
if (move[i]->block != top) {
9545
move[i]->block = top;
9546
exec_node_remove(&move[i]->node);
9547
if (cursor_node) {
9548
exec_node_insert_after(cursor_node, &move[i]->node);
9549
} else {
9550
exec_list_push_head(&top->instr_list, &move[i]->node);
9551
}
9552
cursor_node = &move[i]->node;
9553
progress = true;
9554
}
9555
}
9556
}
9557
}
9558
nir_metadata_preserve(f->impl, nir_metadata_block_index |
9559
nir_metadata_dominance);
9560
}
9561
9562
return progress;
9563
}
9564
9565
/**
9566
* Demote per-sample barycentric intrinsics to centroid.
9567
*
9568
* Useful when rendering to a non-multisampled buffer.
9569
*/
9570
bool
9571
brw_nir_demote_sample_qualifiers(nir_shader *nir)
9572
{
9573
bool progress = false;
9574
9575
nir_foreach_function(f, nir) {
9576
if (!f->impl)
9577
continue;
9578
9579
nir_builder b;
9580
nir_builder_init(&b, f->impl);
9581
9582
nir_foreach_block(block, f->impl) {
9583
nir_foreach_instr_safe(instr, block) {
9584
if (instr->type != nir_instr_type_intrinsic)
9585
continue;
9586
9587
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9588
if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
9589
intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
9590
continue;
9591
9592
b.cursor = nir_before_instr(instr);
9593
nir_ssa_def *centroid =
9594
nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
9595
nir_intrinsic_interp_mode(intrin));
9596
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
9597
centroid);
9598
nir_instr_remove(instr);
9599
progress = true;
9600
}
9601
}
9602
9603
nir_metadata_preserve(f->impl, nir_metadata_block_index |
9604
nir_metadata_dominance);
9605
}
9606
9607
return progress;
9608
}
9609
9610
void
9611
brw_nir_populate_wm_prog_data(const nir_shader *shader,
9612
const struct intel_device_info *devinfo,
9613
const struct brw_wm_prog_key *key,
9614
struct brw_wm_prog_data *prog_data)
9615
{
9616
/* key->alpha_test_func means simulating alpha testing via discards,
9617
* so the shader definitely kills pixels.
9618
*/
9619
prog_data->uses_kill = shader->info.fs.uses_discard ||
9620
key->alpha_test_func;
9621
prog_data->uses_omask = !key->ignore_sample_mask_out &&
9622
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
9623
prog_data->computed_depth_mode = computed_depth_mode(shader);
9624
prog_data->computed_stencil =
9625
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
9626
9627
prog_data->persample_dispatch =
9628
key->multisample_fbo &&
9629
(key->persample_interp ||
9630
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
9631
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
9632
shader->info.fs.uses_sample_qualifier ||
9633
shader->info.outputs_read);
9634
9635
if (devinfo->ver >= 6) {
9636
prog_data->uses_sample_mask =
9637
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
9638
9639
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
9640
*
9641
* "MSDISPMODE_PERSAMPLE is required in order to select
9642
* POSOFFSET_SAMPLE"
9643
*
9644
* So we can only really get sample positions if we are doing real
9645
* per-sample dispatch. If we need gl_SamplePosition and we don't have
9646
* persample dispatch, we hard-code it to 0.5.
9647
*/
9648
prog_data->uses_pos_offset = prog_data->persample_dispatch &&
9649
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
9650
}
9651
9652
prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
9653
9654
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
9655
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
9656
prog_data->inner_coverage = shader->info.fs.inner_coverage;
9657
9658
prog_data->barycentric_interp_modes =
9659
brw_compute_barycentric_interp_modes(devinfo, shader);
9660
9661
prog_data->per_coarse_pixel_dispatch =
9662
key->coarse_pixel &&
9663
!prog_data->persample_dispatch &&
9664
!prog_data->uses_sample_mask &&
9665
(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
9666
!prog_data->computed_stencil;
9667
9668
prog_data->uses_src_w =
9669
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
9670
prog_data->uses_src_depth =
9671
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9672
!prog_data->per_coarse_pixel_dispatch;
9673
prog_data->uses_depth_w_coefficients =
9674
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9675
prog_data->per_coarse_pixel_dispatch;
9676
9677
calculate_urb_setup(devinfo, key, prog_data, shader);
9678
brw_compute_flat_inputs(prog_data, shader);
9679
}
9680
9681
/**
9682
* Pre-gfx6, the register file of the EUs was shared between threads,
9683
* and each thread used some subset allocated on a 16-register block
9684
* granularity. The unit states wanted these block counts.
9685
*/
9686
static inline int
9687
brw_register_blocks(int reg_count)
9688
{
9689
return ALIGN(reg_count, 16) / 16 - 1;
9690
}
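/* Worked example (illustrative): a shader using 24 GRFs occupies two
* 16-register blocks, which the unit state encodes as the count minus one:
*
*    brw_register_blocks(24) == ALIGN(24, 16) / 16 - 1 == 1
*/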
9691
9692
const unsigned *
9693
brw_compile_fs(const struct brw_compiler *compiler,
9694
void *mem_ctx,
9695
struct brw_compile_fs_params *params)
9696
{
9697
struct nir_shader *nir = params->nir;
9698
const struct brw_wm_prog_key *key = params->key;
9699
struct brw_wm_prog_data *prog_data = params->prog_data;
9700
bool allow_spilling = params->allow_spilling;
9701
const bool debug_enabled =
9702
INTEL_DEBUG & (params->debug_flag ? params->debug_flag : DEBUG_WM);
9703
9704
prog_data->base.stage = MESA_SHADER_FRAGMENT;
9705
9706
const struct intel_device_info *devinfo = compiler->devinfo;
9707
const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
9708
9709
brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size, true);
9710
brw_nir_lower_fs_inputs(nir, devinfo, key);
9711
brw_nir_lower_fs_outputs(nir);
9712
9713
if (devinfo->ver < 6)
9714
brw_setup_vue_interpolation(params->vue_map, nir, prog_data);
9715
9716
/* From the SKL PRM, Volume 7, "Alpha Coverage":
9717
* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
9718
* hardware, regardless of the state setting for this feature."
9719
*/
9720
if (devinfo->ver > 6 && key->alpha_to_coverage) {
9721
/* Run constant fold optimization in order to get the correct source
9722
* offset to determine the render target 0 store instruction in the
9723
* emit_alpha_to_coverage pass.
9724
*/
9725
NIR_PASS_V(nir, nir_opt_constant_folding);
9726
NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage);
9727
}
9728
9729
if (!key->multisample_fbo)
9730
NIR_PASS_V(nir, brw_nir_demote_sample_qualifiers);
9731
NIR_PASS_V(nir, brw_nir_move_interpolation_to_top);
9732
brw_postprocess_nir(nir, compiler, true, debug_enabled,
9733
key->base.robust_buffer_access);
9734
9735
brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
9736
9737
fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
9738
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
9739
float throughput = 0;
9740
bool has_spilled = false;
9741
9742
v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9743
&prog_data->base, nir, 8,
9744
params->shader_time ? params->shader_time_index8 : -1,
9745
debug_enabled);
9746
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
9747
params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
9748
delete v8;
9749
return NULL;
9750
} else if (!(INTEL_DEBUG & DEBUG_NO8)) {
9751
simd8_cfg = v8->cfg;
9752
prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs;
9753
prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
9754
const performance &perf = v8->performance_analysis.require();
9755
throughput = MAX2(throughput, perf.throughput);
9756
has_spilled = v8->spilled_any_registers;
9757
allow_spilling = false;
9758
}
9759
9760
/* Limit dispatch width to simd8 with dual source blending on gfx8.
9761
* See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
9762
*/
9763
if (devinfo->ver == 8 && prog_data->dual_src_blend &&
9764
!(INTEL_DEBUG & DEBUG_NO8)) {
9765
assert(!params->use_rep_send);
9766
v8->limit_dispatch_width(8, "gfx8 workaround: "
9767
"using SIMD8 when dual src blending.\n");
9768
}
9769
9770
if (key->coarse_pixel) {
9771
if (prog_data->dual_src_blend) {
9772
v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
9773
" use SIMD8 messages.\n");
9774
}
9775
v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
9776
" pixel shading.\n");
9777
}
9778
9779
if (!has_spilled &&
9780
v8->max_dispatch_width >= 16 &&
9781
(!(INTEL_DEBUG & DEBUG_NO16) || params->use_rep_send)) {
9782
/* Try a SIMD16 compile */
9783
v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9784
&prog_data->base, nir, 16,
9785
params->shader_time ? params->shader_time_index16 : -1,
9786
debug_enabled);
9787
v16->import_uniforms(v8);
9788
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
9789
compiler->shader_perf_log(params->log_data,
9790
"SIMD16 shader failed to compile: %s",
9791
v16->fail_msg);
9792
} else {
9793
simd16_cfg = v16->cfg;
9794
prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs;
9795
prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
9796
const performance &perf = v16->performance_analysis.require();
9797
throughput = MAX2(throughput, perf.throughput);
9798
has_spilled = v16->spilled_any_registers;
9799
allow_spilling = false;
9800
}
9801
}
9802
9803
const bool simd16_failed = v16 && !simd16_cfg;
9804
9805
/* Currently, the compiler only supports SIMD32 on SNB+ */
9806
if (!has_spilled &&
9807
v8->max_dispatch_width >= 32 && !params->use_rep_send &&
9808
devinfo->ver >= 6 && !simd16_failed &&
9809
!(INTEL_DEBUG & DEBUG_NO32)) {
9810
/* Try a SIMD32 compile */
9811
v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9812
&prog_data->base, nir, 32,
9813
params->shader_time ? params->shader_time_index32 : -1,
9814
debug_enabled);
9815
v32->import_uniforms(v8);
9816
if (!v32->run_fs(allow_spilling, false)) {
9817
compiler->shader_perf_log(params->log_data,
9818
"SIMD32 shader failed to compile: %s",
9819
v32->fail_msg);
9820
} else {
9821
const performance &perf = v32->performance_analysis.require();
9822
9823
if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) {
9824
compiler->shader_perf_log(params->log_data, "SIMD32 shader inefficient\n");
9825
} else {
9826
simd32_cfg = v32->cfg;
9827
prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;
9828
prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
9829
throughput = MAX2(throughput, perf.throughput);
9830
}
9831
}
9832
}
9833
9834
/* When the caller requests a repclear shader, they want SIMD16-only */
9835
if (params->use_rep_send)
9836
simd8_cfg = NULL;
9837
9838
/* Prior to Iron Lake, the PS had a single shader offset with a jump table
9839
* at the top to select the shader. We've never implemented that.
9840
* Instead, we just give them exactly one shader and we pick the widest one
9841
* available.
9842
*/
9843
if (compiler->devinfo->ver < 5) {
9844
if (simd32_cfg || simd16_cfg)
9845
simd8_cfg = NULL;
9846
if (simd32_cfg)
9847
simd16_cfg = NULL;
9848
}
9849
9850
/* If computed depth is enabled, SNB only allows SIMD8. */
9851
if (compiler->devinfo->ver == 6 &&
9852
prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
9853
assert(simd16_cfg == NULL && simd32_cfg == NULL);
9854
9855
if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
9856
/* Iron Lake and earlier only have one Dispatch GRF start field. Make
9857
* the data available in the base prog data struct for convenience.
9858
*/
9859
if (simd16_cfg) {
9860
prog_data->base.dispatch_grf_start_reg =
9861
prog_data->dispatch_grf_start_reg_16;
9862
} else if (simd32_cfg) {
9863
prog_data->base.dispatch_grf_start_reg =
9864
prog_data->dispatch_grf_start_reg_32;
9865
}
9866
}
9867
9868
if (prog_data->persample_dispatch) {
9869
/* Starting with SandyBridge (where we first get MSAA), the different
9870
* pixel dispatch combinations are grouped into classifications A
9871
* through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On most hardware
9872
* generations, the only configurations supporting persample dispatch
9873
* are those in which only one dispatch width is enabled.
9874
*
9875
* The Gfx12 hardware spec has a similar dispatch grouping table, but
9876
* the following conflicting restriction applies (from the page on
9877
* "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader:
9878
*
9879
* "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also
9880
* enabled."
9881
*/
9882
if (simd32_cfg || simd16_cfg)
9883
simd8_cfg = NULL;
9884
if (simd32_cfg && devinfo->ver < 12)
9885
simd16_cfg = NULL;
9886
}
9887
9888
fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
9889
v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
9890
9891
if (unlikely(debug_enabled)) {
9892
g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
9893
nir->info.label ?
9894
nir->info.label : "unnamed",
9895
nir->info.name));
9896
}
9897
9898
struct brw_compile_stats *stats = params->stats;
9899
9900
if (simd8_cfg) {
9901
prog_data->dispatch_8 = true;
9902
g.generate_code(simd8_cfg, 8, v8->shader_stats,
9903
v8->performance_analysis.require(), stats);
9904
stats = stats ? stats + 1 : NULL;
9905
}
9906
9907
if (simd16_cfg) {
9908
prog_data->dispatch_16 = true;
9909
prog_data->prog_offset_16 = g.generate_code(
9910
simd16_cfg, 16, v16->shader_stats,
9911
v16->performance_analysis.require(), stats);
9912
stats = stats ? stats + 1 : NULL;
9913
}
9914
9915
if (simd32_cfg) {
9916
prog_data->dispatch_32 = true;
9917
prog_data->prog_offset_32 = g.generate_code(
9918
simd32_cfg, 32, v32->shader_stats,
9919
v32->performance_analysis.require(), stats);
9920
stats = stats ? stats + 1 : NULL;
9921
}
9922
9923
g.add_const_data(nir->constant_data, nir->constant_data_size);
9924
9925
delete v8;
9926
delete v16;
9927
delete v32;
9928
9929
return g.get_assembly();
9930
}
9931
9932
fs_reg *
9933
fs_visitor::emit_cs_work_group_id_setup()
9934
{
9935
assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9936
9937
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
9938
9939
struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
9940
struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
9941
struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
9942
9943
bld.MOV(*reg, r0_1);
9944
bld.MOV(offset(*reg, bld, 1), r0_6);
9945
bld.MOV(offset(*reg, bld, 2), r0_7);
9946
9947
return reg;
9948
}
9949
9950
unsigned
9951
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
9952
unsigned threads)
9953
{
9954
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
9955
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
9956
return cs_prog_data->push.per_thread.size * threads +
9957
cs_prog_data->push.cross_thread.size;
9958
}
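/* Worked example (illustrative): with a 32-byte per-thread block, a 64-byte
* cross-thread block and 8 HW threads, the total push constant allocation is
* 32 * 8 + 64 == 320 bytes.
*/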
9959
9960
static void
9961
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
9962
{
9963
block->dwords = dwords;
9964
block->regs = DIV_ROUND_UP(dwords, 8);
9965
block->size = block->regs * 32;
9966
}
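/* Worked example (illustrative): 20 push-constant dwords round up to
* DIV_ROUND_UP(20, 8) == 3 GRFs, i.e. block->size == 96 bytes.
*/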
9967
9968
static void
9969
cs_fill_push_const_info(const struct intel_device_info *devinfo,
9970
struct brw_cs_prog_data *cs_prog_data)
9971
{
9972
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
9973
int subgroup_id_index = get_subgroup_id_param_index(devinfo, prog_data);
9974
bool cross_thread_supported = devinfo->verx10 >= 75;
9975
9976
/* The thread ID should be stored in the last param dword */
9977
assert(subgroup_id_index == -1 ||
9978
subgroup_id_index == (int)prog_data->nr_params - 1);
9979
9980
unsigned cross_thread_dwords, per_thread_dwords;
9981
if (!cross_thread_supported) {
9982
cross_thread_dwords = 0u;
9983
per_thread_dwords = prog_data->nr_params;
9984
} else if (subgroup_id_index >= 0) {
9985
/* Fill all but the last register with cross-thread payload */
9986
cross_thread_dwords = 8 * (subgroup_id_index / 8);
9987
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
9988
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
9989
} else {
9990
/* Fill all data using cross-thread payload */
9991
cross_thread_dwords = prog_data->nr_params;
9992
per_thread_dwords = 0u;
9993
}
9994
9995
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
9996
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
9997
9998
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
9999
cs_prog_data->push.per_thread.size == 0);
10000
assert(cs_prog_data->push.cross_thread.dwords +
10001
cs_prog_data->push.per_thread.dwords ==
10002
prog_data->nr_params);
10003
}
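/* Worked example (illustrative): with cross-thread payload supported,
* nr_params == 18 and the subgroup ID in the last slot
* (subgroup_id_index == 17), the split above gives
* cross_thread_dwords == 8 * (17 / 8) == 16 and per_thread_dwords == 2:
* two GRFs shared by all threads plus one GRF replicated per thread.
*/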
10004
10005
static bool
10006
filter_simd(const nir_instr *instr, const void * /* options */)
10007
{
10008
if (instr->type != nir_instr_type_intrinsic)
10009
return false;
10010
10011
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10012
case nir_intrinsic_load_simd_width_intel:
10013
case nir_intrinsic_load_subgroup_id:
10014
return true;
10015
10016
default:
10017
return false;
10018
}
10019
}
10020
10021
static nir_ssa_def *
10022
lower_simd(nir_builder *b, nir_instr *instr, void *options)
10023
{
10024
uintptr_t simd_width = (uintptr_t)options;
10025
10026
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10027
case nir_intrinsic_load_simd_width_intel:
10028
return nir_imm_int(b, simd_width);
10029
10030
case nir_intrinsic_load_subgroup_id:
10031
/* If the whole workgroup fits in one thread, we can lower subgroup_id
10032
* to a constant zero.
10033
*/
10034
if (!b->shader->info.workgroup_size_variable) {
10035
unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
10036
b->shader->info.workgroup_size[1] *
10037
b->shader->info.workgroup_size[2];
10038
if (local_workgroup_size <= simd_width)
10039
return nir_imm_int(b, 0);
10040
}
10041
return NULL;
10042
10043
default:
10044
return NULL;
10045
}
10046
}
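/* Illustrative example (not part of the original file): a fixed 8x4x1
* workgroup has 32 invocations, so when compiled at dispatch_width == 32 the
* whole workgroup fits in a single thread and load_subgroup_id above folds
* to the constant 0.
*/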
10047
10048
static void
10049
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
10050
{
10051
nir_shader_lower_instructions(nir, filter_simd, lower_simd,
10052
(void *)(uintptr_t)dispatch_width);
10053
}
10054
10055
static nir_shader *
10056
compile_cs_to_nir(const struct brw_compiler *compiler,
10057
void *mem_ctx,
10058
const struct brw_cs_prog_key *key,
10059
const nir_shader *src_shader,
10060
unsigned dispatch_width,
10061
bool debug_enabled)
10062
{
10063
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
10064
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
10065
10066
NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
10067
10068
/* Clean up after the local index and ID calculations. */
10069
NIR_PASS_V(shader, nir_opt_constant_folding);
10070
NIR_PASS_V(shader, nir_opt_dce);
10071
10072
brw_postprocess_nir(shader, compiler, true, debug_enabled,
10073
key->base.robust_buffer_access);
10074
10075
return shader;
10076
}
10077
10078
const unsigned *
10079
brw_compile_cs(const struct brw_compiler *compiler,
10080
void *mem_ctx,
10081
struct brw_compile_cs_params *params)
10082
{
10083
const nir_shader *nir = params->nir;
10084
const struct brw_cs_prog_key *key = params->key;
10085
struct brw_cs_prog_data *prog_data = params->prog_data;
10086
int shader_time_index = params->shader_time ? params->shader_time_index : -1;
10087
10088
const bool debug_enabled = INTEL_DEBUG & DEBUG_CS;
10089
10090
prog_data->base.stage = MESA_SHADER_COMPUTE;
10091
prog_data->base.total_shared = nir->info.shared_size;
10092
10093
/* Generate code for all the possible SIMD variants. */
10094
bool generate_all;
10095
10096
unsigned min_dispatch_width;
10097
unsigned max_dispatch_width;
10098
10099
if (nir->info.workgroup_size_variable) {
10100
generate_all = true;
10101
min_dispatch_width = 8;
10102
max_dispatch_width = 32;
10103
} else {
10104
generate_all = false;
10105
prog_data->local_size[0] = nir->info.workgroup_size[0];
10106
prog_data->local_size[1] = nir->info.workgroup_size[1];
10107
prog_data->local_size[2] = nir->info.workgroup_size[2];
10108
unsigned local_workgroup_size = prog_data->local_size[0] *
10109
prog_data->local_size[1] *
10110
prog_data->local_size[2];
10111
10112
/* Limit max_threads to 64 for the GPGPU_WALKER command */
10113
const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
10114
min_dispatch_width = util_next_power_of_two(
10115
MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
10116
assert(min_dispatch_width <= 32);
10117
max_dispatch_width = 32;
10118
}
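/* Worked example (illustrative): a fixed 16x8x8 workgroup has 1024
* invocations; with max_threads clamped to 64 each thread must cover at
* least DIV_ROUND_UP(1024, 64) == 16 of them, so min_dispatch_width becomes
* util_next_power_of_two(16) == 16 and the SIMD8 variant is skipped below.
*/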
10119
10120
if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
10121
/* These enum values are expressly chosen to be equal to the subgroup
10122
* size that they require.
10123
*/
10124
const unsigned required_dispatch_width =
10125
(unsigned)key->base.subgroup_size_type;
10126
assert(required_dispatch_width == 8 ||
10127
required_dispatch_width == 16 ||
10128
required_dispatch_width == 32);
10129
if (required_dispatch_width < min_dispatch_width ||
10130
required_dispatch_width > max_dispatch_width) {
10131
params->error_str = ralloc_strdup(mem_ctx,
10132
"Cannot satisfy explicit subgroup size");
10133
return NULL;
10134
}
10135
min_dispatch_width = max_dispatch_width = required_dispatch_width;
10136
}
10137
10138
assert(min_dispatch_width <= max_dispatch_width);
10139
10140
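   /* prog_mask and prog_spilled use one bit per SIMD variant: bit 0 for
    * SIMD8, bit 1 for SIMD16 and bit 2 for SIMD32.
    * brw_cs_simd_size_for_group_size() consults these bits later to pick a
    * compiled variant for the actual group size.
    */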
   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
   fs_visitor *v = NULL;

   if (!(INTEL_DEBUG & DEBUG_NO8) &&
       min_dispatch_width <= 8 && max_dispatch_width >= 8) {
      nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
                                           nir, 8, debug_enabled);
      v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                          &prog_data->base,
                          nir8, 8, shader_time_index, debug_enabled);
      if (!v8->run_cs(true /* allow_spilling */)) {
         params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
         delete v8;
         return NULL;
      }

      /* We should always be able to do SIMD32 for compute shaders */
      assert(v8->max_dispatch_width >= 32);

      v = v8;
      prog_data->prog_mask |= 1 << 0;
      if (v8->spilled_any_registers)
         prog_data->prog_spilled |= 1 << 0;
      cs_fill_push_const_info(compiler->devinfo, prog_data);
   }

   if (!(INTEL_DEBUG & DEBUG_NO16) &&
       (generate_all || !prog_data->prog_spilled) &&
       min_dispatch_width <= 16 && max_dispatch_width >= 16) {
      /* Try a SIMD16 compile */
      nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            nir, 16, debug_enabled);
      v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir16, 16, shader_time_index, debug_enabled);
      if (v8)
         v16->import_uniforms(v8);

      const bool allow_spilling = generate_all || v == NULL;
      if (!v16->run_cs(allow_spilling)) {
         compiler->shader_perf_log(params->log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16->fail_msg);
         if (!v) {
            assert(v8 == NULL);
            params->error_str = ralloc_asprintf(
               mem_ctx, "Not enough threads for SIMD8 and "
               "couldn't generate SIMD16: %s", v16->fail_msg);
            delete v16;
            return NULL;
         }
      } else {
         /* We should always be able to do SIMD32 for compute shaders */
         assert(v16->max_dispatch_width >= 32);

         v = v16;
         prog_data->prog_mask |= 1 << 1;
         if (v16->spilled_any_registers)
            prog_data->prog_spilled |= 1 << 1;
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   /* SIMD32 is only enabled for the cases where it is needed, unless it is
    * forced.
    *
    * TODO: Use performance_analysis and drop this boolean.
    */
   const bool needs_32 = v == NULL ||
                         (INTEL_DEBUG & DEBUG_DO32) ||
                         generate_all;

   if (!(INTEL_DEBUG & DEBUG_NO32) &&
       (generate_all || !prog_data->prog_spilled) &&
       needs_32 &&
       min_dispatch_width <= 32 && max_dispatch_width >= 32) {
      /* Try a SIMD32 compile */
      nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            nir, 32, debug_enabled);
      v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir32, 32, shader_time_index, debug_enabled);
      if (v8)
         v32->import_uniforms(v8);
      else if (v16)
         v32->import_uniforms(v16);

      const bool allow_spilling = generate_all || v == NULL;
      if (!v32->run_cs(allow_spilling)) {
         compiler->shader_perf_log(params->log_data,
                                   "SIMD32 shader failed to compile: %s",
                                   v32->fail_msg);
         if (!v) {
            assert(v8 == NULL);
            assert(v16 == NULL);
            params->error_str = ralloc_asprintf(
               mem_ctx, "Not enough threads for SIMD16 and "
               "couldn't generate SIMD32: %s", v32->fail_msg);
            delete v32;
            return NULL;
         }
      } else {
         v = v32;
         prog_data->prog_mask |= 1 << 2;
         if (v32->spilled_any_registers)
            prog_data->prog_spilled |= 1 << 2;
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   if (unlikely(!v) && (INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32))) {
      params->error_str =
         ralloc_strdup(mem_ctx,
                       "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
      return NULL;
   }

   assert(v);

   const unsigned *ret = NULL;

   fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
                  v->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                   nir->info.label ?
                                   nir->info.label : "unnamed",
                                   nir->info.name);
      g.enable_debug(name);
   }

   struct brw_compile_stats *stats = params->stats;
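   /* With a variable workgroup size, every compiled SIMD variant is emitted
    * into the same program and prog_offset[] records where each one starts;
    * the stats pointer is advanced so that each variant reports its own
    * compile statistics.
    */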
   if (generate_all) {
      if (prog_data->prog_mask & (1 << 0)) {
         assert(v8);
         prog_data->prog_offset[0] =
            g.generate_code(v8->cfg, 8, v8->shader_stats,
                            v8->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }

      if (prog_data->prog_mask & (1 << 1)) {
         assert(v16);
         prog_data->prog_offset[1] =
            g.generate_code(v16->cfg, 16, v16->shader_stats,
                            v16->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }

      if (prog_data->prog_mask & (1 << 2)) {
         assert(v32);
         prog_data->prog_offset[2] =
            g.generate_code(v32->cfg, 32, v32->shader_stats,
                            v32->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }
   } else {
      /* Only one dispatch width will be valid, and will be at offset 0,
       * which is already the default value of prog_offset_* fields.
       */
      prog_data->prog_mask = 1 << (v->dispatch_width / 16);
      g.generate_code(v->cfg, v->dispatch_width, v->shader_stats,
                      v->performance_analysis.require(), stats);
   }

   g.add_const_data(nir->constant_data, nir->constant_data_size);

   ret = g.get_assembly();

   delete v8;
   delete v16;
   delete v32;

   return ret;
}

static unsigned
brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
                                const struct brw_cs_prog_data *cs_prog_data,
                                unsigned group_size)
{
   const unsigned mask = cs_prog_data->prog_mask;
   assert(mask != 0);

   static const unsigned simd8 = 1 << 0;
   static const unsigned simd16 = 1 << 1;
   static const unsigned simd32 = 1 << 2;

   if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32))
      return 32;

   /* Limit max_threads to 64 for the GPGPU_WALKER command */
   const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads);

   if ((mask & simd8) && group_size <= 8 * max_threads) {
      /* Prefer SIMD16 if it can be done without spilling.  Matches the
       * logic in brw_compile_cs.
       */
      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
         return 16;
      return 8;
   }

   if ((mask & simd16) && group_size <= 16 * max_threads)
      return 16;

   assert(mask & simd32);
   assert(group_size <= 32 * max_threads);
   return 32;
}

struct brw_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct brw_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
                            prog_data->local_size;

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size =
      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

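   /* right_mask enables the execution channels that are live in the last
    * thread: if group_size is not a multiple of simd_size, only the low
    * 'remainder' channels of the final thread execute; otherwise all
    * simd_size channels are enabled.
    */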
   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}

static uint8_t
compile_single_bs(const struct brw_compiler *compiler, void *log_data,
                  void *mem_ctx,
                  const struct brw_bs_prog_key *key,
                  struct brw_bs_prog_data *prog_data,
                  nir_shader *shader,
                  fs_generator *g,
                  struct brw_compile_stats *stats,
                  int *prog_offset,
                  char **error_str)
{
   const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;

   prog_data->base.stage = shader->info.stage;
   prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
                                    shader->scratch_size);

   const unsigned max_dispatch_width = 16;
   brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true);
   brw_postprocess_nir(shader, compiler, true, debug_enabled,
                       key->base.robust_buffer_access);

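   /* Bindless (ray-tracing) shaders are compiled at SIMD8 and SIMD16 only.
    * Try SIMD8 first; if it succeeds without spilling, also try SIMD16 and
    * keep the widest variant that compiled successfully.
    */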
   fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;
   bool has_spilled = false;

   uint8_t simd_size = 0;
   if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
      v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                          &prog_data->base, shader,
                          8, -1 /* shader time */, debug_enabled);
      const bool allow_spilling = true;
      if (!v8->run_bs(allow_spilling)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
         delete v8;
         return 0;
      } else {
         v = v8;
         simd_size = 8;
         if (v8->spilled_any_registers)
            has_spilled = true;
      }
   }

   if (!has_spilled && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
      v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base, shader,
                           16, -1 /* shader time */, debug_enabled);
      const bool allow_spilling = (v == NULL);
      if (!v16->run_bs(allow_spilling)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16->fail_msg);
         if (v == NULL) {
            assert(v8 == NULL);
            if (error_str) {
               *error_str = ralloc_asprintf(
                  mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s",
                  v16->fail_msg);
            }
            delete v16;
            return 0;
         }
      } else {
         v = v16;
         simd_size = 16;
         if (v16->spilled_any_registers)
            has_spilled = true;
      }
   }

   if (unlikely(v == NULL)) {
      assert(INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16));
      if (error_str) {
         *error_str = ralloc_strdup(mem_ctx,
                                    "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
      }
      return 0;
   }

   assert(v);

   int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,
                                 v->performance_analysis.require(), stats);
   if (prog_offset)
      *prog_offset = offset;
   else
      assert(offset == 0);

   delete v8;
   delete v16;

   return simd_size;
}

uint64_t
brw_bsr(const struct intel_device_info *devinfo,
        uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
{
   assert(offset % 64 == 0);
   assert(simd_size == 8 || simd_size == 16);
   assert(local_arg_offset % 8 == 0);

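   /* offset is 64-byte aligned, so its low six bits are free: bit 4 encodes
    * SIMD16 (vs. SIMD8) and bits 2:0 hold the local argument offset in
    * 8-byte units.
    */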
   return offset |
          SET_BITS(simd_size > 8, 4, 4) |
          SET_BITS(local_arg_offset / 8, 2, 0);
}

const unsigned *
brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_bs_prog_key *key,
               struct brw_bs_prog_data *prog_data,
               nir_shader *shader,
               unsigned num_resume_shaders,
               struct nir_shader **resume_shaders,
               struct brw_compile_stats *stats,
               char **error_str)
{
   const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;

   prog_data->base.stage = shader->info.stage;
   prog_data->max_stack_size = 0;

   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                  false, shader->info.stage);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",
                                   shader->info.label ?
                                      shader->info.label : "unnamed",
                                   gl_shader_stage_name(shader->info.stage),
                                   shader->info.name);
      g.enable_debug(name);
   }

   prog_data->simd_size =
      compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
                        shader, &g, stats, NULL, error_str);
   if (prog_data->simd_size == 0)
      return NULL;

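   /* Each resume shader is compiled into the same assembly blob; its offset
    * and SIMD size are packed by brw_bsr() into an entry of the resume
    * shader binding table, which is handed to the generator via
    * g.add_resume_sbt() below.
    */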
   uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders);
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      if (INTEL_DEBUG & DEBUG_RT) {
         char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s",
                                      shader->info.label ?
                                         shader->info.label : "unnamed",
                                      gl_shader_stage_name(shader->info.stage),
                                      i, shader->info.name);
         g.enable_debug(name);
      }

      /* TODO: Figure out shader stats etc. for resume shaders */
      int offset = 0;
      uint8_t simd_size =
         compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
                           resume_shaders[i], &g, NULL, &offset, error_str);
      if (simd_size == 0)
         return NULL;

      assert(offset > 0);
      resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
   }

   /* There is only one constant data section, so make sure all the resume
    * shaders use the same one.
    */
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      assert(resume_shaders[i]->constant_data_size ==
             shader->constant_data_size);
      assert(memcmp(resume_shaders[i]->constant_data,
                    shader->constant_data,
                    shader->constant_data_size) == 0);
   }

   g.add_const_data(shader->constant_data, shader->constant_data_size);
   g.add_resume_sbt(num_resume_shaders, resume_sbt);

   return g.get_assembly();
}

/**
 * Test the dispatch mask packing assumptions of
 * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
 * executed with an unexpected dispatch mask.
 */
static UNUSED void
brw_fs_test_dispatch_packing(const fs_builder &bld)
{
   const gl_shader_stage stage = bld.shader->stage;

   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
                                     bld.shader->stage_prog_data)) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
                           brw_dmask_reg());

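      /* tmp = mask & (mask + 1), which is zero exactly when the mask has the
       * form 2^n-1, i.e. a contiguous run of enabled channels starting at
       * channel 0.
       */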
      ubld.ADD(tmp, mask, brw_imm_ud(1));
      ubld.AND(tmp, mask, tmp);

      /* This will loop forever if the dispatch mask doesn't have the expected
       * form '2^n-1', in which case tmp will be non-zero.
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}

unsigned
fs_visitor::workgroup_size() const
{
   assert(stage == MESA_SHADER_COMPUTE);
   const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}