GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_lower_regioning.cpp
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    * and no saturation is a raw move. A packed byte destination region (B
    * or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    * using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

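   /* Illustrative example (not part of the quoted PRM text): a packed
    * byte copy such as
    *
    *    mov(16) g10<1>UB g11<16,16,1>UB
    *
    * is a raw move under this definition, while the same MOV with
    * saturate, a negate/abs modifier or a type conversion is not, and so
    * may not use a packed byte destination.
    */
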
   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone. We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

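   /* Worked example (illustrative): for a narrowing conversion such as
    * mov(8) dst:W src:D the execution type is D (4 bytes), which is wider
    * than the 2-byte W destination, so the second case above requires a
    * destination byte stride of 4, i.e. a horizontal stride of 2 in units
    * of W.
    */
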
   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % REG_SIZE !=
                reg_offset(inst->dst) % REG_SIZE)
               return 0;
      }

      return reg_offset(inst->dst) % REG_SIZE;
   }

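   /* In other words (illustrative): if every non-uniform, non-control
    * source starts at the same sub-register byte as the destination, that
    * shared offset is acceptable; any mismatch forces a register-aligned
    * (offset 0) destination instead.
    */
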
   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_unordered(inst) || inst->is_control_source(i))
         return false;

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions where we
       * used to pack components Y and W of a vector at offset 16B of a SIMD
       * register. The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == BRW_OPCODE_MAD &&
          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
      const unsigned src_byte_stride = inst->src[i].stride *
                                       type_sz(inst->src[i].type);
      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (src_byte_stride != dst_byte_stride ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_unordered(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != dst_byte_stride ||
                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != dst_byte_stride);
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported. The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
      case SHADER_OPCODE_QUAD_SWIZZLE:
         return has_dst_aligned_region_restriction(devinfo, inst) ?
                0x1 : 0;

      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         return (((devinfo->verx10 == 70) ||
                  devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
                  devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
                (devinfo->verx10 >= 125 &&
                 brw_reg_type_is_floating_point(inst->src[0].type)) ?
                0x1 : 0;

      default:
         return 0;
      }
   }

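   /* Note on the mask (illustrative): a return value of 0x1 means only
    * src[0] requires the bit-cast; lower_exec_type() below retypes every
    * source whose bit is set in the mask, plus the destination.
    */
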
   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

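   /* For instance (illustrative): a SEL.L or SEL.GE used to implement MIN
    * or MAX uses its conditional mod to choose between the two sources
    * rather than writing the comparison result to the flag register.
    */
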
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type. Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
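
   /* Example transformation (illustrative): if MUL cannot take source
    * modifiers, mul(8) dst:D -src0:D src1:D becomes mov(8) tmp:D -src0:D
    * followed by mul(8) dst:D tmp:D src1:D, with the MOV itself lowered
    * recursively if needed.
    */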
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type. Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }
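
   /* Example transformation (illustrative): add.sat(8) dst:F src0:F src1:F
    * becomes add(8) tmp:F src0:F src1:F followed by mov.sat(8) dst:F tmp:F,
    * with the saturate (and, when its semantics are consistent, the
    * conditional mod) carried by the trailing MOV.
    */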

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction. Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }
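
   /* Illustration: for a DF source the raw_type computed above is UD and
    * n == 2, so each channel is copied as two 32-bit halves via
    * subscript(), preserving the bit pattern independently of the source
    * type's semantics.
    */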

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction. Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself. Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Bit-cast sources and destination of the instruction to an appropriate
    * integer type, to be used in cases where the instruction doesn't support
    * some other execution type.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (mask & (1u << i)) {
            assert(inst->src[i].type == inst->dst.type);
            inst->src[i].type = raw_type;
         }
      }

      inst->dst.type = raw_type;

      return true;
   }
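
   /* E.g. (illustrative): a SHADER_OPCODE_SHUFFLE on HF operands would be
    * retyped here to operate on UW; the shuffle only moves bits around, so
    * the bit-cast preserves its semantics.
    */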

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

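/**
 * Legalize the source and destination regioning controls of every
 * instruction in the program.
 */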
bool
fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}