CoCalc -- brw_fs_lower

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_lower_regioning.cpp
⁴⁵⁵⁰ views
1
/*
2
 * Copyright © 2018 Intel Corporation
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 */
23

24
#include "brw_fs.h"
25
#include "brw_cfg.h"
26
#include "brw_fs_builder.h"
27

28
using namespace brw;
29

30
namespace {
31
   /* From the SKL PRM Vol 2a, "Move":
32
    *
33
    * "A mov with the same source and destination type, no source modifier,
34
    *  and no saturation is a raw move. A packed byte destination region (B
35
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
36
    *  using raw move."
37
    */
38
   bool
39
   is_byte_raw_mov(const fs_inst *inst)
40
   {
41
      return type_sz(inst->dst.type) == 1 &&
42
             inst->opcode == BRW_OPCODE_MOV &&
43
             inst->src[0].type == inst->dst.type &&
44
             !inst->saturate &&
45
             !inst->src[0].negate &&
46
             !inst->src[0].abs;
47
   }
48

49
   /*
50
    * Return an acceptable byte stride for the destination of an instruction
51
    * that requires it to have some particular alignment.
52
    */
53
   unsigned
54
   required_dst_byte_stride(const fs_inst *inst)
55
   {
56
      if (inst->dst.is_accumulator()) {
57
         /* If the destination is an accumulator, insist that we leave the
58
          * stride alone.  We cannot "fix" accumulator destinations by writing
59
          * to a temporary and emitting a MOV into the original destination.
60
          * For multiply instructions (our one use of the accumulator), the
61
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
62
          * would emit only writes 33 bits and leaves the top 33 bits
63
          * undefined.
64
          *
65
          * It's safe to just require the original stride here because the
66
          * lowering pass will detect the mismatch in has_invalid_src_region
67
          * and fix the sources of the multiply instead of the destination.
68
          */
69
         return inst->dst.stride * type_sz(inst->dst.type);
70
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
71
          !is_byte_raw_mov(inst)) {
72
         return get_exec_type_size(inst);
73
      } else {
74
         /* Calculate the maximum byte stride and the minimum/maximum type
75
          * size across all source and destination operands we are required to
76
          * lower.
77
          */
78
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
79
         unsigned min_size = type_sz(inst->dst.type);
80
         unsigned max_size = type_sz(inst->dst.type);
81

82
         for (unsigned i = 0; i < inst->sources; i++) {
83
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
84
               const unsigned size = type_sz(inst->src[i].type);
85
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
86
               min_size = MIN2(min_size, size);
87
               max_size = MAX2(max_size, size);
88
            }
89
         }
90

91
         /* All operands involved in lowering need to fit in the calculated
92
          * stride.
93
          */
94
         assert(max_size <= 4 * min_size);
95

96
         /* Attempt to use the largest byte stride among all present operands,
97
          * but never exceed a stride of 4 since that would lead to illegal
98
          * destination regions during lowering.
99
          */
100
         return MIN2(max_stride, 4 * min_size);
101
      }
102
   }
103

104
   /*
105
    * Return an acceptable byte sub-register offset for the destination of an
106
    * instruction that requires it to be aligned to the sub-register offset of
107
    * the sources.
108
    */
109
   unsigned
110
   required_dst_byte_offset(const fs_inst *inst)
111
   {
112
      for (unsigned i = 0; i < inst->sources; i++) {
113
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
114
            if (reg_offset(inst->src[i]) % REG_SIZE !=
115
                reg_offset(inst->dst) % REG_SIZE)
116
               return 0;
117
      }
118

119
      return reg_offset(inst->dst) % REG_SIZE;
120
   }
121

122
   /*
123
    * Return whether the instruction has an unsupported channel bit layout
124
    * specified for the i-th source region.
125
    */
126
   bool
127
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
128
                          unsigned i)
129
   {
130
      if (is_unordered(inst) || inst->is_control_source(i))
131
         return false;
132

133
      /* Empirical testing shows that Broadwell has a bug affecting half-float
134
       * MAD instructions when any of its sources has a non-zero offset, such
135
       * as:
136
       *
137
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
138
       *
139
       * We used to generate code like this for SIMD8 executions where we
140
       * used to pack components Y and W of a vector at offset 16B of a SIMD
141
       * register. The problem doesn't occur if the stride of the source is 0.
142
       */
143
      if (devinfo->ver == 8 &&
144
          inst->opcode == BRW_OPCODE_MAD &&
145
          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
146
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
147
          inst->src[i].stride != 0) {
148
         return true;
149
      }
150

151
      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
152
      const unsigned src_byte_stride = inst->src[i].stride *
153
         type_sz(inst->src[i].type);
154
      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
155
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
156

157
      return has_dst_aligned_region_restriction(devinfo, inst) &&
158
             !is_uniform(inst->src[i]) &&
159
             (src_byte_stride != dst_byte_stride ||
160
              src_byte_offset != dst_byte_offset);
161
   }
162

163
   /*
164
    * Return whether the instruction has an unsupported channel bit layout
165
    * specified for the destination region.
166
    */
167
   bool
168
   has_invalid_dst_region(const intel_device_info *devinfo,
169
                          const fs_inst *inst)
170
   {
171
      if (is_unordered(inst)) {
172
         return false;
173
      } else {
174
         const brw_reg_type exec_type = get_exec_type(inst);
175
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
176
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
177
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
178
            type_sz(inst->dst.type) < type_sz(exec_type);
179

180
         return (has_dst_aligned_region_restriction(devinfo, inst) &&
181
                 (required_dst_byte_stride(inst) != dst_byte_stride ||
182
                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
183
                (is_narrowing_conversion &&
184
                 required_dst_byte_stride(inst) != dst_byte_stride);
185
      }
186
   }
187

188
   /**
189
    * Return a non-zero value if the execution type of the instruction is
190
    * unsupported.  The destination and sources matching the returned mask
191
    * will be bit-cast to an integer type of appropriate size, lowering any
192
    * source or destination modifiers into separate MOV instructions.
193
    */
194
   unsigned
195
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
196
   {
197
      switch (inst->opcode) {
198
      case SHADER_OPCODE_SHUFFLE:
199
      case SHADER_OPCODE_QUAD_SWIZZLE:
200
         return has_dst_aligned_region_restriction(devinfo, inst) ?
201
                0x1 : 0;
202

203
      case SHADER_OPCODE_BROADCAST:
204
      case SHADER_OPCODE_MOV_INDIRECT:
205
         return (((devinfo->verx10 == 70) ||
206
                  devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
207
                  devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
208
                (devinfo->verx10 >= 125 &&
209
                 brw_reg_type_is_floating_point(inst->src[0].type)) ?
210
                0x1 : 0;
211

212
      default:
213
         return 0;
214
      }
215
   }
216

217
   /*
218
    * Return whether the instruction has unsupported source modifiers
219
    * specified for the i-th source region.
220
    */
221
   bool
222
   has_invalid_src_modifiers(const intel_device_info *devinfo,
223
                             const fs_inst *inst, unsigned i)
224
   {
225
      return (!inst->can_do_source_mods(devinfo) &&
226
              (inst->src[i].negate || inst->src[i].abs)) ||
227
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
228
              (inst->src[i].negate || inst->src[i].abs ||
229
               inst->src[i].type != get_exec_type(inst)));
230
   }
231

232
   /*
233
    * Return whether the instruction has an unsupported type conversion
234
    * specified for the destination.
235
    */
236
   bool
237
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
238
   {
239
      switch (inst->opcode) {
240
      case BRW_OPCODE_MOV:
241
         return false;
242
      case BRW_OPCODE_SEL:
243
         return inst->dst.type != get_exec_type(inst);
244
      default:
245
         /* FIXME: We assume the opcodes not explicitly mentioned before just
246
          * work fine with arbitrary conversions, unless they need to be
247
          * bit-cast.
248
          */
249
         return has_invalid_exec_type(devinfo, inst) &&
250
                inst->dst.type != get_exec_type(inst);
251
      }
252
   }
253

254
   /**
255
    * Return whether the instruction has unsupported destination modifiers.
256
    */
257
   bool
258
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
259
   {
260
      return (has_invalid_exec_type(devinfo, inst) &&
261
              (inst->saturate || inst->conditional_mod)) ||
262
             has_invalid_conversion(devinfo, inst);
263
   }
264

265
   /**
266
    * Return whether the instruction has non-standard semantics for the
267
    * conditional mod which don't cause the flag register to be updated with
268
    * the comparison result.
269
    */
270
   bool
271
   has_inconsistent_cmod(const fs_inst *inst)
272
   {
273
      return inst->opcode == BRW_OPCODE_SEL ||
274
             inst->opcode == BRW_OPCODE_CSEL ||
275
             inst->opcode == BRW_OPCODE_IF ||
276
             inst->opcode == BRW_OPCODE_WHILE;
277
   }
278

279
   bool
280
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
281
}
282

283
namespace brw {
284
   /**
285
    * Remove any modifiers from the \p i-th source region of the instruction,
286
    * including negate, abs and any implicit type conversion to the execution
287
    * type.  Instead any source modifiers will be implemented as a separate
288
    * MOV instruction prior to the original instruction.
289
    */
290
   bool
291
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
292
   {
293
      assert(inst->components_read(i) == 1);
294
      assert(v->devinfo->has_integer_dword_mul ||
295
             inst->opcode != BRW_OPCODE_MUL ||
296
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
297
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
298
             type_sz(inst->src[i].type) == get_exec_type_size(inst));
299

300
      const fs_builder ibld(v, block, inst);
301
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
302

303
      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
304
      inst->src[i] = tmp;
305

306
      return true;
307
   }
308
}
309

310
namespace {
311
   /**
312
    * Remove any modifiers from the destination region of the instruction,
313
    * including saturate, conditional mod and any implicit type conversion
314
    * from the execution type.  Instead any destination modifiers will be
315
    * implemented as a separate MOV instruction after the original
316
    * instruction.
317
    */
318
   bool
319
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
320
   {
321
      const fs_builder ibld(v, block, inst);
322
      const brw_reg_type type = get_exec_type(inst);
323
      /* Not strictly necessary, but if possible use a temporary with the same
324
       * channel alignment as the current destination in order to avoid
325
       * violating the restrictions enforced later on by lower_src_region()
326
       * and lower_dst_region(), which would introduce additional copy
327
       * instructions into the program unnecessarily.
328
       */
329
      const unsigned stride =
330
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
331
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
332
      fs_reg tmp = ibld.vgrf(type, stride);
333
      ibld.UNDEF(tmp);
334
      tmp = horiz_stride(tmp, stride);
335

336
      /* Emit a MOV taking care of all the destination modifiers. */
337
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
338
      mov->saturate = inst->saturate;
339
      if (!has_inconsistent_cmod(inst))
340
         mov->conditional_mod = inst->conditional_mod;
341
      if (inst->opcode != BRW_OPCODE_SEL) {
342
         mov->predicate = inst->predicate;
343
         mov->predicate_inverse = inst->predicate_inverse;
344
      }
345
      mov->flag_subreg = inst->flag_subreg;
346
      lower_instruction(v, block, mov);
347

348
      /* Point the original instruction at the temporary, and clean up any
349
       * destination modifiers.
350
       */
351
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
352
      inst->dst = tmp;
353
      inst->size_written = inst->dst.component_size(inst->exec_size);
354
      inst->saturate = false;
355
      if (!has_inconsistent_cmod(inst))
356
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
357

358
      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
359
      return true;
360
   }
361

362
   /**
363
    * Remove any non-trivial shuffling of data from the \p i-th source region
364
    * of the instruction.  Instead implement the region as a series of integer
365
    * copies into a temporary with the same channel layout as the destination.
366
    */
367
   bool
368
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
369
   {
370
      assert(inst->components_read(i) == 1);
371
      const fs_builder ibld(v, block, inst);
372
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
373
                              type_sz(inst->src[i].type);
374
      assert(stride > 0);
375
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
376
      ibld.UNDEF(tmp);
377
      tmp = horiz_stride(tmp, stride);
378

379
      /* Emit a series of 32-bit integer copies with any source modifiers
380
       * cleaned up (because their semantics are dependent on the type).
381
       */
382
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
383
                                                 false);
384
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
385
      fs_reg raw_src = inst->src[i];
386
      raw_src.negate = false;
387
      raw_src.abs = false;
388

389
      for (unsigned j = 0; j < n; j++)
390
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
391

392
      /* Point the original instruction at the temporary, making sure to keep
393
       * any source modifiers in the instruction.
394
       */
395
      fs_reg lower_src = tmp;
396
      lower_src.negate = inst->src[i].negate;
397
      lower_src.abs = inst->src[i].abs;
398
      inst->src[i] = lower_src;
399

400
      return true;
401
   }
402

403
   /**
404
    * Remove any non-trivial shuffling of data from the destination region of
405
    * the instruction.  Instead implement the region as a series of integer
406
    * copies from a temporary with a channel layout compatible with the
407
    * sources.
408
    */
409
   bool
410
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
411
   {
412
      /* We cannot replace the result of an integer multiply which writes the
413
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
414
       * value whereas the MOV will act on only 32 or 33 bits of the
415
       * accumulator.
416
       */
417
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
418
             brw_reg_type_is_floating_point(inst->dst.type));
419

420
      const fs_builder ibld(v, block, inst);
421
      const unsigned stride = required_dst_byte_stride(inst) /
422
                              type_sz(inst->dst.type);
423
      assert(stride > 0);
424
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
425
      ibld.UNDEF(tmp);
426
      tmp = horiz_stride(tmp, stride);
427

428
      /* Emit a series of 32-bit integer copies from the temporary into the
429
       * original destination.
430
       */
431
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
432
                                                 false);
433
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
434

435
      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
436
         /* Note that in general we cannot simply predicate the copies on the
437
          * same flag register as the original instruction, since it may have
438
          * been overwritten by the instruction itself.  Instead initialize
439
          * the temporary with the previous contents of the destination
440
          * register.
441
          */
442
         for (unsigned j = 0; j < n; j++)
443
            ibld.MOV(subscript(tmp, raw_type, j),
444
                     subscript(inst->dst, raw_type, j));
445
      }
446

447
      for (unsigned j = 0; j < n; j++)
448
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
449
                                        subscript(tmp, raw_type, j));
450

451
      /* Point the original instruction at the temporary, making sure to keep
452
       * any destination modifiers in the instruction.
453
       */
454
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
455
      inst->dst = tmp;
456
      inst->size_written = inst->dst.component_size(inst->exec_size);
457

458
      return true;
459
   }
460

461
   /**
462
    * Bit-cast sources and destination of the instruction to an appropriate
463
    * integer type, to be used in cases where the instruction doesn't support
464
    * some other execution type.
465
    */
466
   bool
467
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
468
   {
469
      assert(inst->dst.type == get_exec_type(inst));
470
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
471
      const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false);
472

473
      for (unsigned i = 0; i < inst->sources; i++) {
474
         if (mask & (1u << i)) {
475
            assert(inst->src[i].type == inst->dst.type);
476
            inst->src[i].type = raw_type;
477
         }
478
      }
479

480
      inst->dst.type = raw_type;
481

482
      return true;
483
   }
484

485
   /**
486
    * Legalize the source and destination regioning controls of the specified
487
    * instruction.
488
    */
489
   bool
490
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
491
   {
492
      const intel_device_info *devinfo = v->devinfo;
493
      bool progress = false;
494

495
      if (has_invalid_dst_modifiers(devinfo, inst))
496
         progress |= lower_dst_modifiers(v, block, inst);
497

498
      if (has_invalid_dst_region(devinfo, inst))
499
         progress |= lower_dst_region(v, block, inst);
500

501
      for (unsigned i = 0; i < inst->sources; i++) {
502
         if (has_invalid_src_modifiers(devinfo, inst, i))
503
            progress |= lower_src_modifiers(v, block, inst, i);
504

505
         if (has_invalid_src_region(devinfo, inst, i))
506
            progress |= lower_src_region(v, block, inst, i);
507
      }
508

509
      if (has_invalid_exec_type(devinfo, inst))
510
         progress |= lower_exec_type(v, block, inst);
511

512
      return progress;
513
   }
514
}
515

516
bool
517
fs_visitor::lower_regioning()
518
{
519
   bool progress = false;
520

521
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
522
      progress |= lower_instruction(this, block, inst);
523

524
   if (progress)
525
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
526

527
   return progress;
528
}
529

530
Product

Resources

Company