GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs.cpp
1
/*
2
* Copyright © 2010 Intel Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
/** @file brw_fs.cpp
25
*
26
* This file drives the GLSL IR -> LIR translation, contains the
27
* optimizations on the LIR, and drives the generation of native code
28
* from the LIR.
29
*/
30
31
#include "main/macros.h"
32
#include "brw_eu.h"
33
#include "brw_fs.h"
34
#include "brw_fs_live_variables.h"
35
#include "brw_nir.h"
36
#include "brw_vec4_gs_visitor.h"
37
#include "brw_cfg.h"
38
#include "brw_dead_control_flow.h"
39
#include "dev/intel_debug.h"
40
#include "compiler/glsl_types.h"
41
#include "compiler/nir/nir_builder.h"
42
#include "program/prog_parameter.h"
43
#include "util/u_math.h"
44
45
using namespace brw;
46
47
static unsigned get_lowered_simd_width(const struct intel_device_info *devinfo,
48
const fs_inst *inst);
49
50
void
51
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
52
const fs_reg *src, unsigned sources)
53
{
54
memset((void*)this, 0, sizeof(*this));
55
56
this->src = new fs_reg[MAX2(sources, 3)];
57
for (unsigned i = 0; i < sources; i++)
58
this->src[i] = src[i];
59
60
this->opcode = opcode;
61
this->dst = dst;
62
this->sources = sources;
63
this->exec_size = exec_size;
64
this->base_mrf = -1;
65
66
assert(dst.file != IMM && dst.file != UNIFORM);
67
68
assert(this->exec_size != 0);
69
70
this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72
/* This will be the case for almost all instructions. */
73
switch (dst.file) {
74
case VGRF:
75
case ARF:
76
case FIXED_GRF:
77
case MRF:
78
case ATTR:
79
this->size_written = dst.component_size(exec_size);
80
break;
81
case BAD_FILE:
82
this->size_written = 0;
83
break;
84
case IMM:
85
case UNIFORM:
86
unreachable("Invalid destination register file");
87
}
88
89
this->writes_accumulator = false;
90
}
91
92
fs_inst::fs_inst()
93
{
94
init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95
}
96
97
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98
{
99
init(opcode, exec_size, reg_undef, NULL, 0);
100
}
101
102
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
103
{
104
init(opcode, exec_size, dst, NULL, 0);
105
}
106
107
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
108
const fs_reg &src0)
109
{
110
const fs_reg src[1] = { src0 };
111
init(opcode, exec_size, dst, src, 1);
112
}
113
114
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
115
const fs_reg &src0, const fs_reg &src1)
116
{
117
const fs_reg src[2] = { src0, src1 };
118
init(opcode, exec_size, dst, src, 2);
119
}
120
121
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
122
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
123
{
124
const fs_reg src[3] = { src0, src1, src2 };
125
init(opcode, exec_size, dst, src, 3);
126
}
127
128
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
129
const fs_reg src[], unsigned sources)
130
{
131
init(opcode, exec_width, dst, src, sources);
132
}
133
134
fs_inst::fs_inst(const fs_inst &that)
135
{
136
memcpy((void*)this, &that, sizeof(that));
137
138
this->src = new fs_reg[MAX2(that.sources, 3)];
139
140
for (unsigned i = 0; i < that.sources; i++)
141
this->src[i] = that.src[i];
142
}
143
144
fs_inst::~fs_inst()
145
{
146
delete[] this->src;
147
}
148
149
void
150
fs_inst::resize_sources(uint8_t num_sources)
151
{
152
if (this->sources != num_sources) {
153
fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
154
155
for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
156
src[i] = this->src[i];
157
158
delete[] this->src;
159
this->src = src;
160
this->sources = num_sources;
161
}
162
}
163
164
void
165
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
166
const fs_reg &dst,
167
const fs_reg &surf_index,
168
const fs_reg &varying_offset,
169
uint32_t const_offset,
170
uint8_t alignment)
171
{
172
/* We have our constant surface use a pitch of 4 bytes, so our index can
173
* be any component of a vector, and then we load 4 contiguous
174
* components starting from that.
175
*
176
* We break down the const_offset to a portion added to the variable offset
177
* and a portion done using fs_reg::offset, which means that if you have
178
* GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
179
* we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
180
* later notice that those loads are all the same and eliminate the
181
* redundant ones.
182
*/
183
fs_reg vec4_offset = vgrf(glsl_type::uint_type);
184
bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
185
186
/* The pull load message will load a vec4 (16 bytes). If we are loading
187
* a double this means we are only loading 2 elements worth of data.
188
* We also want to use a 32-bit data type for the dst of the load operation
189
* so other parts of the driver don't get confused about the size of the
190
* result.
191
*/
192
fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
193
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
194
vec4_result, surf_index, vec4_offset,
195
brw_imm_ud(alignment));
196
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
197
198
shuffle_from_32bit_read(bld, dst, vec4_result,
199
(const_offset & 0xf) / type_sz(dst.type), 1);
200
}
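/* As a concrete illustration of the split above (just the arithmetic, no
 * additional behaviour): with const_offset == 20 and a float destination,
 * vec4_offset becomes varying_offset + 16 and the shuffle_from_32bit_read()
 * call pulls component (20 & 0xf) / 4 == 1 out of the loaded vec4.
 */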
201
202
/**
203
* A helper for MOV generation for fixing up broken hardware SEND dependency
204
* handling.
205
*/
206
void
207
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
208
{
209
/* The caller always wants uncompressed to emit the minimal extra
210
* dependencies, and to avoid having to deal with aligning its regs to 2.
211
*/
212
const fs_builder ubld = bld.annotate("send dependency resolve")
213
.quarter(0);
214
215
ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
216
}
217
218
bool
219
fs_inst::is_send_from_grf() const
220
{
221
switch (opcode) {
222
case SHADER_OPCODE_SEND:
223
case SHADER_OPCODE_SHADER_TIME_ADD:
224
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
225
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
226
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
227
case SHADER_OPCODE_URB_WRITE_SIMD8:
228
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
229
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
230
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
231
case SHADER_OPCODE_URB_READ_SIMD8:
232
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
233
case SHADER_OPCODE_INTERLOCK:
234
case SHADER_OPCODE_MEMORY_FENCE:
235
case SHADER_OPCODE_BARRIER:
236
return true;
237
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238
return src[1].file == VGRF;
239
case FS_OPCODE_FB_WRITE:
240
case FS_OPCODE_FB_READ:
241
return src[0].file == VGRF;
242
default:
243
if (is_tex())
244
return src[0].file == VGRF;
245
246
return false;
247
}
248
}
249
250
bool
251
fs_inst::is_control_source(unsigned arg) const
252
{
253
switch (opcode) {
254
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
255
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
256
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
257
return arg == 0;
258
259
case SHADER_OPCODE_BROADCAST:
260
case SHADER_OPCODE_SHUFFLE:
261
case SHADER_OPCODE_QUAD_SWIZZLE:
262
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
263
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
264
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
265
case SHADER_OPCODE_GET_BUFFER_SIZE:
266
return arg == 1;
267
268
case SHADER_OPCODE_MOV_INDIRECT:
269
case SHADER_OPCODE_CLUSTER_BROADCAST:
270
case SHADER_OPCODE_TEX:
271
case FS_OPCODE_TXB:
272
case SHADER_OPCODE_TXD:
273
case SHADER_OPCODE_TXF:
274
case SHADER_OPCODE_TXF_LZ:
275
case SHADER_OPCODE_TXF_CMS:
276
case SHADER_OPCODE_TXF_CMS_W:
277
case SHADER_OPCODE_TXF_UMS:
278
case SHADER_OPCODE_TXF_MCS:
279
case SHADER_OPCODE_TXL:
280
case SHADER_OPCODE_TXL_LZ:
281
case SHADER_OPCODE_TXS:
282
case SHADER_OPCODE_LOD:
283
case SHADER_OPCODE_TG4:
284
case SHADER_OPCODE_TG4_OFFSET:
285
case SHADER_OPCODE_SAMPLEINFO:
286
return arg == 1 || arg == 2;
287
288
case SHADER_OPCODE_SEND:
289
return arg == 0 || arg == 1;
290
291
default:
292
return false;
293
}
294
}
295
296
bool
297
fs_inst::is_payload(unsigned arg) const
298
{
299
switch (opcode) {
300
case FS_OPCODE_FB_WRITE:
301
case FS_OPCODE_FB_READ:
302
case SHADER_OPCODE_URB_WRITE_SIMD8:
303
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
304
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
305
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
306
case SHADER_OPCODE_URB_READ_SIMD8:
307
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
308
case VEC4_OPCODE_UNTYPED_ATOMIC:
309
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
310
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
311
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
312
case SHADER_OPCODE_SHADER_TIME_ADD:
313
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
314
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
315
case SHADER_OPCODE_INTERLOCK:
316
case SHADER_OPCODE_MEMORY_FENCE:
317
case SHADER_OPCODE_BARRIER:
318
return arg == 0;
319
320
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
321
return arg == 1;
322
323
case SHADER_OPCODE_SEND:
324
return arg == 2 || arg == 3;
325
326
default:
327
if (is_tex())
328
return arg == 0;
329
else
330
return false;
331
}
332
}
333
334
/**
335
* Returns true if this instruction's sources and destinations cannot
336
* safely be the same register.
337
*
338
* In most cases, a register can be written over safely by the same
339
* instruction that is its last use. For a single instruction, the
340
* sources are dereferenced before writing of the destination starts
341
* (naturally).
342
*
343
* However, there are a few cases where this can be problematic:
344
*
345
* - Virtual opcodes that translate to multiple instructions in the
346
* code generator: if src == dst and one instruction writes the
347
* destination before a later instruction reads the source, then
348
* src will have been clobbered.
349
*
350
* - SIMD16 compressed instructions with certain regioning (see below).
351
*
352
* The register allocator uses this information to set up conflicts between
353
* GRF sources and the destination.
354
*/
355
bool
356
fs_inst::has_source_and_destination_hazard() const
357
{
358
switch (opcode) {
359
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
360
/* Multiple partial writes to the destination */
361
return true;
362
case SHADER_OPCODE_SHUFFLE:
363
/* This instruction returns an arbitrary channel from the source and
364
* gets split into smaller instructions in the generator. It's possible
365
* that one of the instructions will read from a channel corresponding
366
* to an earlier instruction.
367
*/
368
case SHADER_OPCODE_SEL_EXEC:
369
/* This is implemented as
370
*
371
* mov(16) g4<1>D 0D { align1 WE_all 1H };
372
* mov(16) g4<1>D g5<8,8,1>D { align1 1H }
373
*
374
* Because the source is only read in the second instruction, the first
375
* may stomp all over it.
376
*/
377
return true;
378
case SHADER_OPCODE_QUAD_SWIZZLE:
379
switch (src[1].ud) {
380
case BRW_SWIZZLE_XXXX:
381
case BRW_SWIZZLE_YYYY:
382
case BRW_SWIZZLE_ZZZZ:
383
case BRW_SWIZZLE_WWWW:
384
case BRW_SWIZZLE_XXZZ:
385
case BRW_SWIZZLE_YYWW:
386
case BRW_SWIZZLE_XYXY:
387
case BRW_SWIZZLE_ZWZW:
388
/* These can be implemented as a single Align1 region on all
389
* platforms, so there's never a hazard between source and
390
* destination. C.f. fs_generator::generate_quad_swizzle().
391
*/
392
return false;
393
default:
394
return !is_uniform(src[0]);
395
}
396
default:
397
/* The SIMD16 compressed instruction
398
*
399
* add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
400
*
401
* is actually decoded in hardware as:
402
*
403
* add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
404
* add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
405
*
406
* Which is safe. However, if we have uniform accesses
407
* happening, we get into trouble:
408
*
409
* add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
410
* add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
411
*
412
* Now our destination for the first instruction overwrote the
413
* second instruction's src0, and we get garbage for those 8
414
* pixels. There's a similar issue for the pre-gfx6
415
* pixel_x/pixel_y, which are registers of 16-bit values and thus
416
* would get stomped by the first decode as well.
417
*/
418
if (exec_size == 16) {
419
for (int i = 0; i < sources; i++) {
420
if (src[i].file == VGRF && (src[i].stride == 0 ||
421
src[i].type == BRW_REGISTER_TYPE_UW ||
422
src[i].type == BRW_REGISTER_TYPE_W ||
423
src[i].type == BRW_REGISTER_TYPE_UB ||
424
src[i].type == BRW_REGISTER_TYPE_B)) {
425
return true;
426
}
427
}
428
}
429
return false;
430
}
431
}
432
433
bool
434
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
435
{
436
if (devinfo->ver == 6 && is_math())
437
return false;
438
439
if (is_send_from_grf())
440
return false;
441
442
/* From Wa_1604601757:
443
*
444
* "When multiplying a DW and any lower precision integer, source modifier
445
* is not supported."
446
*/
447
if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
448
opcode == BRW_OPCODE_MAD)) {
449
const brw_reg_type exec_type = get_exec_type(this);
450
const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
451
MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
452
MIN2(type_sz(src[0].type), type_sz(src[1].type));
453
454
if (brw_reg_type_is_integer(exec_type) &&
455
type_sz(exec_type) >= 4 &&
456
type_sz(exec_type) != min_type_sz)
457
return false;
458
}
459
460
if (!backend_instruction::can_do_source_mods())
461
return false;
462
463
return true;
464
}
465
466
bool
467
fs_inst::can_do_cmod()
468
{
469
if (!backend_instruction::can_do_cmod())
470
return false;
471
472
/* The accumulator result appears to get used for the conditional modifier
473
* generation. When negating a UD value, there is a 33rd bit generated for
474
* the sign in the accumulator value, so now you can't check, for example,
475
* equality with a 32-bit value. See piglit fs-op-neg-uvec4.
476
*/
477
for (unsigned i = 0; i < sources; i++) {
478
if (type_is_unsigned_int(src[i].type) && src[i].negate)
479
return false;
480
}
481
482
return true;
483
}
484
485
bool
486
fs_inst::can_change_types() const
487
{
488
return dst.type == src[0].type &&
489
!src[0].abs && !src[0].negate && !saturate &&
490
(opcode == BRW_OPCODE_MOV ||
491
(opcode == BRW_OPCODE_SEL &&
492
dst.type == src[1].type &&
493
predicate != BRW_PREDICATE_NONE &&
494
!src[1].abs && !src[1].negate));
495
}
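/* Rationale for the checks above: saturation and source modifiers are
 * evaluated in the instruction's type, so a saturating MOV or one with
 * abs/negate on its source cannot simply be reinterpreted in another type
 * without changing its result.
 */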
496
497
void
498
fs_reg::init()
499
{
500
memset((void*)this, 0, sizeof(*this));
501
type = BRW_REGISTER_TYPE_UD;
502
stride = 1;
503
}
504
505
/** Generic unset register constructor. */
506
fs_reg::fs_reg()
507
{
508
init();
509
this->file = BAD_FILE;
510
}
511
512
fs_reg::fs_reg(struct ::brw_reg reg) :
513
backend_reg(reg)
514
{
515
this->offset = 0;
516
this->stride = 1;
517
if (this->file == IMM &&
518
(this->type != BRW_REGISTER_TYPE_V &&
519
this->type != BRW_REGISTER_TYPE_UV &&
520
this->type != BRW_REGISTER_TYPE_VF)) {
521
this->stride = 0;
522
}
523
}
524
525
bool
526
fs_reg::equals(const fs_reg &r) const
527
{
528
return (this->backend_reg::equals(r) &&
529
stride == r.stride);
530
}
531
532
bool
533
fs_reg::negative_equals(const fs_reg &r) const
534
{
535
return (this->backend_reg::negative_equals(r) &&
536
stride == r.stride);
537
}
538
539
bool
540
fs_reg::is_contiguous() const
541
{
542
switch (file) {
543
case ARF:
544
case FIXED_GRF:
545
return hstride == BRW_HORIZONTAL_STRIDE_1 &&
546
vstride == width + hstride;
547
case MRF:
548
case VGRF:
549
case ATTR:
550
return stride == 1;
551
case UNIFORM:
552
case IMM:
553
case BAD_FILE:
554
return true;
555
}
556
557
unreachable("Invalid register file");
558
}
559
560
unsigned
561
fs_reg::component_size(unsigned width) const
562
{
563
const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
564
hstride == 0 ? 0 :
565
1 << (hstride - 1));
566
return MAX2(width * stride, 1) * type_sz(type);
567
}
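/* For example, a VGRF of BRW_REGISTER_TYPE_F with stride 1 spans
 * 8 * 1 * 4 = 32 bytes per component at SIMD8, i.e. one full GRF, while a
 * stride-0 scalar of the same type is clamped by the MAX2() to 4 bytes.
 */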
568
569
/**
570
* Create a MOV to read the timestamp register.
571
*/
572
fs_reg
573
fs_visitor::get_timestamp(const fs_builder &bld)
574
{
575
assert(devinfo->ver >= 7);
576
577
fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
578
BRW_ARF_TIMESTAMP,
579
0),
580
BRW_REGISTER_TYPE_UD));
581
582
fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
583
584
/* We want to read the 3 fields we care about even if it's not enabled in
585
* the dispatch.
586
*/
587
bld.group(4, 0).exec_all().MOV(dst, ts);
588
589
return dst;
590
}
591
592
void
593
fs_visitor::emit_shader_time_begin()
594
{
595
/* We want only the low 32 bits of the timestamp. Since it's running
596
* at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
597
* which is plenty of time for our purposes. It is identical across the
598
* EUs, but since it's tracking GPU core speed it will increment at a
599
* varying rate as render P-states change.
600
*/
601
shader_start_time = component(
602
get_timestamp(bld.annotate("shader time start")), 0);
603
}
604
605
void
606
fs_visitor::emit_shader_time_end()
607
{
608
/* Insert our code just before the final SEND with EOT. */
609
exec_node *end = this->instructions.get_tail();
610
assert(end && ((fs_inst *) end)->eot);
611
const fs_builder ibld = bld.annotate("shader time end")
612
.exec_all().at(NULL, end);
613
const fs_reg timestamp = get_timestamp(ibld);
614
615
/* We only use the low 32 bits of the timestamp - see
616
* emit_shader_time_begin().
617
*
618
* We could also check if render P-states have changed (or anything
619
* else that might disrupt timing) by setting smear to 2 and checking if
620
* that field is != 0.
621
*/
622
const fs_reg shader_end_time = component(timestamp, 0);
623
624
/* Check that there weren't any timestamp reset events (assuming these
625
* were the only two timestamp reads that happened).
626
*/
627
const fs_reg reset = component(timestamp, 2);
628
set_condmod(BRW_CONDITIONAL_Z,
629
ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
630
ibld.IF(BRW_PREDICATE_NORMAL);
631
632
fs_reg start = shader_start_time;
633
start.negate = true;
634
const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
635
BRW_REGISTER_TYPE_UD),
636
0);
637
const fs_builder cbld = ibld.group(1, 0);
638
cbld.group(1, 0).ADD(diff, start, shader_end_time);
639
640
/* If there were no instructions between the two timestamp gets, the diff
641
* is 2 cycles. Remove that overhead, so I can forget about that when
642
* trying to determine the time taken for single instructions.
643
*/
644
cbld.ADD(diff, diff, brw_imm_ud(-2u));
645
SHADER_TIME_ADD(cbld, 0, diff);
646
SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
647
ibld.emit(BRW_OPCODE_ELSE);
648
SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
649
ibld.emit(BRW_OPCODE_ENDIF);
650
}
651
652
void
653
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
654
int shader_time_subindex,
655
fs_reg value)
656
{
657
int index = shader_time_index * 3 + shader_time_subindex;
658
struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
659
660
fs_reg payload;
661
if (dispatch_width == 8)
662
payload = vgrf(glsl_type::uvec2_type);
663
else
664
payload = vgrf(glsl_type::uint_type);
665
666
bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
667
}
668
669
void
670
fs_visitor::vfail(const char *format, va_list va)
671
{
672
char *msg;
673
674
if (failed)
675
return;
676
677
failed = true;
678
679
msg = ralloc_vasprintf(mem_ctx, format, va);
680
msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
681
dispatch_width, stage_abbrev, msg);
682
683
this->fail_msg = msg;
684
685
if (unlikely(debug_enabled)) {
686
fprintf(stderr, "%s", msg);
687
}
688
}
689
690
void
691
fs_visitor::fail(const char *format, ...)
692
{
693
va_list va;
694
695
va_start(va, format);
696
vfail(format, va);
697
va_end(va);
698
}
699
700
/**
701
* Mark this program as impossible to compile with dispatch width greater
702
* than n.
703
*
704
* During the SIMD8 compile (which happens first), we can detect and flag
705
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
706
* SIMD16+ compile altogether.
707
*
708
* During a compile of dispatch width greater than n (if one happens anyway),
709
* this just calls fail().
710
*/
711
void
712
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
713
{
714
if (dispatch_width > n) {
715
fail("%s", msg);
716
} else {
717
max_dispatch_width = MIN2(max_dispatch_width, n);
718
compiler->shader_perf_log(log_data,
719
"Shader dispatch width limited to SIMD%d: %s",
720
n, msg);
721
}
722
}
723
724
/**
725
* Returns true if the instruction has a flag that means it won't
726
* update an entire destination register.
727
*
728
* For example, dead code elimination and live variable analysis want to know
729
* when a write to a variable screens off any preceding values that were in
730
* it.
731
*/
732
bool
733
fs_inst::is_partial_write() const
734
{
735
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
736
(this->exec_size * type_sz(this->dst.type)) < 32 ||
737
!this->dst.is_contiguous() ||
738
this->dst.offset % REG_SIZE != 0);
739
}
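/* For instance, an unpredicated SIMD8 MOV to a packed, register-aligned
 * VGRF of a 16-bit type writes only 8 * 2 = 16 of the 32 bytes checked
 * above, so it counts as a partial write, while the same MOV with a
 * 32-bit destination type covers the whole register.
 */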
740
741
unsigned
742
fs_inst::components_read(unsigned i) const
743
{
744
/* Return zero if the source is not present. */
745
if (src[i].file == BAD_FILE)
746
return 0;
747
748
switch (opcode) {
749
case FS_OPCODE_LINTERP:
750
if (i == 0)
751
return 2;
752
else
753
return 1;
754
755
case FS_OPCODE_PIXEL_X:
756
case FS_OPCODE_PIXEL_Y:
757
assert(i < 2);
758
if (i == 0)
759
return 2;
760
else
761
return 1;
762
763
case FS_OPCODE_FB_WRITE_LOGICAL:
764
assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
765
/* First/second FB write color. */
766
if (i < 2)
767
return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
768
else
769
return 1;
770
771
case SHADER_OPCODE_TEX_LOGICAL:
772
case SHADER_OPCODE_TXD_LOGICAL:
773
case SHADER_OPCODE_TXF_LOGICAL:
774
case SHADER_OPCODE_TXL_LOGICAL:
775
case SHADER_OPCODE_TXS_LOGICAL:
776
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
777
case FS_OPCODE_TXB_LOGICAL:
778
case SHADER_OPCODE_TXF_CMS_LOGICAL:
779
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
780
case SHADER_OPCODE_TXF_UMS_LOGICAL:
781
case SHADER_OPCODE_TXF_MCS_LOGICAL:
782
case SHADER_OPCODE_LOD_LOGICAL:
783
case SHADER_OPCODE_TG4_LOGICAL:
784
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
785
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
786
assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
787
src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
788
/* Texture coordinates. */
789
if (i == TEX_LOGICAL_SRC_COORDINATE)
790
return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
791
/* Texture derivatives. */
792
else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
793
opcode == SHADER_OPCODE_TXD_LOGICAL)
794
return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
795
/* Texture offset. */
796
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
797
return 2;
798
/* MCS */
799
else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
800
return 2;
801
else
802
return 1;
803
804
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
805
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
806
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
807
/* Surface coordinates. */
808
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
809
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
810
/* Surface operation source (ignored for reads). */
811
else if (i == SURFACE_LOGICAL_SRC_DATA)
812
return 0;
813
else
814
return 1;
815
816
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
817
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
818
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
819
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
820
/* Surface coordinates. */
821
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
822
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
823
/* Surface operation source. */
824
else if (i == SURFACE_LOGICAL_SRC_DATA)
825
return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
826
else
827
return 1;
828
829
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
830
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
831
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
832
assert(src[2].file == IMM);
833
return 1;
834
835
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
836
assert(src[2].file == IMM);
837
if (i == 1) { /* data to write */
838
const unsigned comps = src[2].ud / exec_size;
839
assert(comps > 0);
840
return comps;
841
} else {
842
return 1;
843
}
844
845
case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
846
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
847
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
848
return 1;
849
850
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
851
assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
852
if (i == SURFACE_LOGICAL_SRC_DATA) {
853
const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
854
assert(comps > 0);
855
return comps;
856
} else {
857
return 1;
858
}
859
860
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
861
assert(src[2].file == IMM);
862
return i == 1 ? src[2].ud : 1;
863
864
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
865
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
866
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
867
assert(src[2].file == IMM);
868
if (i == 1) {
869
/* Data source */
870
const unsigned op = src[2].ud;
871
switch (op) {
872
case BRW_AOP_INC:
873
case BRW_AOP_DEC:
874
case BRW_AOP_PREDEC:
875
return 0;
876
case BRW_AOP_CMPWR:
877
return 2;
878
default:
879
return 1;
880
}
881
} else {
882
return 1;
883
}
884
885
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
886
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
887
assert(src[2].file == IMM);
888
if (i == 1) {
889
/* Data source */
890
const unsigned op = src[2].ud;
891
return op == BRW_AOP_FCMPWR ? 2 : 1;
892
} else {
893
return 1;
894
}
895
896
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
897
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
898
/* Scattered logical opcodes use the following params:
899
* src[0] Surface coordinates
900
* src[1] Surface operation source (ignored for reads)
901
* src[2] Surface
902
* src[3] IMM with always 1 dimension.
903
* src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
904
*/
905
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
906
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
907
return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
908
909
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
910
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
911
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
912
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
913
return 1;
914
915
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
916
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
917
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
918
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
919
const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
920
/* Surface coordinates. */
921
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
922
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
923
/* Surface operation source. */
924
else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
925
return 2;
926
else if (i == SURFACE_LOGICAL_SRC_DATA &&
927
(op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
928
return 0;
929
else
930
return 1;
931
}
932
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
933
return (i == 0 ? 2 : 1);
934
935
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
936
assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
937
src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
938
const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
939
/* Surface coordinates. */
940
if (i == SURFACE_LOGICAL_SRC_ADDRESS)
941
return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
942
/* Surface operation source. */
943
else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
944
return 2;
945
else
946
return 1;
947
}
948
949
default:
950
return 1;
951
}
952
}
953
954
unsigned
955
fs_inst::size_read(int arg) const
956
{
957
switch (opcode) {
958
case SHADER_OPCODE_SEND:
959
if (arg == 2) {
960
return mlen * REG_SIZE;
961
} else if (arg == 3) {
962
return ex_mlen * REG_SIZE;
963
}
964
break;
965
966
case FS_OPCODE_FB_WRITE:
967
case FS_OPCODE_REP_FB_WRITE:
968
if (arg == 0) {
969
if (base_mrf >= 0)
970
return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
971
else
972
return mlen * REG_SIZE;
973
}
974
break;
975
976
case FS_OPCODE_FB_READ:
977
case SHADER_OPCODE_URB_WRITE_SIMD8:
978
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
979
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
980
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
981
case SHADER_OPCODE_URB_READ_SIMD8:
982
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
983
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
984
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
985
if (arg == 0)
986
return mlen * REG_SIZE;
987
break;
988
989
case FS_OPCODE_SET_SAMPLE_ID:
990
if (arg == 1)
991
return 1;
992
break;
993
994
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
995
/* The payload is actually stored in src1 */
996
if (arg == 1)
997
return mlen * REG_SIZE;
998
break;
999
1000
case FS_OPCODE_LINTERP:
1001
if (arg == 1)
1002
return 16;
1003
break;
1004
1005
case SHADER_OPCODE_LOAD_PAYLOAD:
1006
if (arg < this->header_size)
1007
return REG_SIZE;
1008
break;
1009
1010
case CS_OPCODE_CS_TERMINATE:
1011
case SHADER_OPCODE_BARRIER:
1012
return REG_SIZE;
1013
1014
case SHADER_OPCODE_MOV_INDIRECT:
1015
if (arg == 0) {
1016
assert(src[2].file == IMM);
1017
return src[2].ud;
1018
}
1019
break;
1020
1021
default:
1022
if (is_tex() && arg == 0 && src[0].file == VGRF)
1023
return mlen * REG_SIZE;
1024
break;
1025
}
1026
1027
switch (src[arg].file) {
1028
case UNIFORM:
1029
case IMM:
1030
return components_read(arg) * type_sz(src[arg].type);
1031
case BAD_FILE:
1032
case ARF:
1033
case FIXED_GRF:
1034
case VGRF:
1035
case ATTR:
1036
return components_read(arg) * src[arg].component_size(exec_size);
1037
case MRF:
1038
unreachable("MRF registers are not allowed as sources");
1039
}
1040
return 0;
1041
}
1042
1043
namespace {
1044
unsigned
1045
predicate_width(brw_predicate predicate)
1046
{
1047
switch (predicate) {
1048
case BRW_PREDICATE_NONE: return 1;
1049
case BRW_PREDICATE_NORMAL: return 1;
1050
case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
1051
case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
1052
case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
1053
case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
1054
case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
1055
case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
1056
case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
1057
case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
1058
case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
1059
case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
1060
default: unreachable("Unsupported predicate");
1061
}
1062
}
1063
1064
/* Return the subset of flag registers that an instruction could
1065
* potentially read or write based on the execution controls and flag
1066
* subregister number of the instruction.
1067
*/
1068
unsigned
1069
flag_mask(const fs_inst *inst, unsigned width)
1070
{
1071
assert(util_is_power_of_two_nonzero(width));
1072
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
1073
~(width - 1);
1074
const unsigned end = start + ALIGN(inst->exec_size, width);
1075
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1076
}
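/* For example, a SIMD16 instruction with flag_subreg == 1, group == 0 and
 * width 1 yields start == 16 and end == 32, i.e. bytes 2 and 3 of the
 * flag ARF (f0.1), so the returned mask is 0xc.
 */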
1077
1078
unsigned
1079
bit_mask(unsigned n)
1080
{
1081
return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1082
}
1083
1084
unsigned
1085
flag_mask(const fs_reg &r, unsigned sz)
1086
{
1087
if (r.file == ARF) {
1088
const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
1089
const unsigned end = start + sz;
1090
return bit_mask(end) & ~bit_mask(start);
1091
} else {
1092
return 0;
1093
}
1094
}
1095
}
1096
1097
unsigned
1098
fs_inst::flags_read(const intel_device_info *devinfo) const
1099
{
1100
if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
1101
predicate == BRW_PREDICATE_ALIGN1_ALLV) {
1102
/* The vertical predication modes combine corresponding bits from
1103
* f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
1104
*/
1105
const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
1106
return flag_mask(this, 1) << shift | flag_mask(this, 1);
1107
} else if (predicate) {
1108
return flag_mask(this, predicate_width(predicate));
1109
} else {
1110
unsigned mask = 0;
1111
for (int i = 0; i < sources; i++) {
1112
mask |= flag_mask(src[i], size_read(i));
1113
}
1114
return mask;
1115
}
1116
}
1117
1118
unsigned
1119
fs_inst::flags_written(const intel_device_info *devinfo) const
1120
{
1121
/* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
1122
* using a separate cmpn and sel instruction. This lowering occurs in
1123
* fs_visitor::lower_minmax which is called very, very late.
1124
*/
1125
if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
1126
opcode != BRW_OPCODE_CSEL &&
1127
opcode != BRW_OPCODE_IF &&
1128
opcode != BRW_OPCODE_WHILE)) ||
1129
opcode == FS_OPCODE_FB_WRITE) {
1130
return flag_mask(this, 1);
1131
} else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1132
opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
1133
return flag_mask(this, 32);
1134
} else {
1135
return flag_mask(dst, size_written);
1136
}
1137
}
1138
1139
/**
1140
* Returns how many MRFs an FS opcode will write over.
1141
*
1142
* Note that this is not the 0 or 1 implied writes in an actual gen
1143
* instruction -- the FS opcodes often generate MOVs in addition.
1144
*/
1145
unsigned
1146
fs_inst::implied_mrf_writes() const
1147
{
1148
if (mlen == 0)
1149
return 0;
1150
1151
if (base_mrf == -1)
1152
return 0;
1153
1154
switch (opcode) {
1155
case SHADER_OPCODE_RCP:
1156
case SHADER_OPCODE_RSQ:
1157
case SHADER_OPCODE_SQRT:
1158
case SHADER_OPCODE_EXP2:
1159
case SHADER_OPCODE_LOG2:
1160
case SHADER_OPCODE_SIN:
1161
case SHADER_OPCODE_COS:
1162
return 1 * exec_size / 8;
1163
case SHADER_OPCODE_POW:
1164
case SHADER_OPCODE_INT_QUOTIENT:
1165
case SHADER_OPCODE_INT_REMAINDER:
1166
return 2 * exec_size / 8;
1167
case SHADER_OPCODE_TEX:
1168
case FS_OPCODE_TXB:
1169
case SHADER_OPCODE_TXD:
1170
case SHADER_OPCODE_TXF:
1171
case SHADER_OPCODE_TXF_CMS:
1172
case SHADER_OPCODE_TXF_MCS:
1173
case SHADER_OPCODE_TG4:
1174
case SHADER_OPCODE_TG4_OFFSET:
1175
case SHADER_OPCODE_TXL:
1176
case SHADER_OPCODE_TXS:
1177
case SHADER_OPCODE_LOD:
1178
case SHADER_OPCODE_SAMPLEINFO:
1179
return 1;
1180
case FS_OPCODE_FB_WRITE:
1181
case FS_OPCODE_REP_FB_WRITE:
1182
return src[0].file == BAD_FILE ? 0 : 2;
1183
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1184
case SHADER_OPCODE_GFX4_SCRATCH_READ:
1185
return 1;
1186
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1187
return mlen;
1188
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1189
return mlen;
1190
default:
1191
unreachable("not reached");
1192
}
1193
}
1194
1195
fs_reg
1196
fs_visitor::vgrf(const glsl_type *const type)
1197
{
1198
int reg_width = dispatch_width / 8;
1199
return fs_reg(VGRF,
1200
alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1201
brw_type_for_base_type(type));
1202
}
1203
1204
fs_reg::fs_reg(enum brw_reg_file file, int nr)
1205
{
1206
init();
1207
this->file = file;
1208
this->nr = nr;
1209
this->type = BRW_REGISTER_TYPE_F;
1210
this->stride = (file == UNIFORM ? 0 : 1);
1211
}
1212
1213
fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
1214
{
1215
init();
1216
this->file = file;
1217
this->nr = nr;
1218
this->type = type;
1219
this->stride = (file == UNIFORM ? 0 : 1);
1220
}
1221
1222
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1223
* This brings in those uniform definitions
1224
*/
1225
void
1226
fs_visitor::import_uniforms(fs_visitor *v)
1227
{
1228
this->push_constant_loc = v->push_constant_loc;
1229
this->pull_constant_loc = v->pull_constant_loc;
1230
this->uniforms = v->uniforms;
1231
this->subgroup_id = v->subgroup_id;
1232
for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
1233
this->group_size[i] = v->group_size[i];
1234
}
1235
1236
void
1237
fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
1238
{
1239
assert(stage == MESA_SHADER_FRAGMENT);
1240
1241
/* gl_FragCoord.x */
1242
bld.MOV(wpos, this->pixel_x);
1243
wpos = offset(wpos, bld, 1);
1244
1245
/* gl_FragCoord.y */
1246
bld.MOV(wpos, this->pixel_y);
1247
wpos = offset(wpos, bld, 1);
1248
1249
/* gl_FragCoord.z */
1250
if (devinfo->ver >= 6) {
1251
bld.MOV(wpos, this->pixel_z);
1252
} else {
1253
bld.emit(FS_OPCODE_LINTERP, wpos,
1254
this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
1255
component(interp_reg(VARYING_SLOT_POS, 2), 0));
1256
}
1257
wpos = offset(wpos, bld, 1);
1258
1259
/* gl_FragCoord.w: Already set up in emit_interpolation */
1260
bld.MOV(wpos, this->wpos_w);
1261
}
1262
1263
enum brw_barycentric_mode
1264
brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
1265
{
1266
/* Barycentric modes don't make sense for flat inputs. */
1267
assert(mode != INTERP_MODE_FLAT);
1268
1269
unsigned bary;
1270
switch (op) {
1271
case nir_intrinsic_load_barycentric_pixel:
1272
case nir_intrinsic_load_barycentric_at_offset:
1273
bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1274
break;
1275
case nir_intrinsic_load_barycentric_centroid:
1276
bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1277
break;
1278
case nir_intrinsic_load_barycentric_sample:
1279
case nir_intrinsic_load_barycentric_at_sample:
1280
bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1281
break;
1282
default:
1283
unreachable("invalid intrinsic");
1284
}
1285
1286
if (mode == INTERP_MODE_NOPERSPECTIVE)
1287
bary += 3;
1288
1289
return (enum brw_barycentric_mode) bary;
1290
}
1291
1292
/**
1293
* Turn one of the two CENTROID barycentric modes into PIXEL mode.
1294
*/
1295
static enum brw_barycentric_mode
1296
centroid_to_pixel(enum brw_barycentric_mode bary)
1297
{
1298
assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1299
bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1300
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1301
}
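/* Both helpers above rely on the layout of enum brw_barycentric_mode: the
 * NONPERSPECTIVE modes sit three entries after their PERSPECTIVE
 * counterparts (hence "bary += 3"), and each CENTROID mode immediately
 * follows its PIXEL mode (hence the "- 1" here).
 */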
1302
1303
fs_reg *
1304
fs_visitor::emit_frontfacing_interpolation()
1305
{
1306
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1307
1308
if (devinfo->ver >= 12) {
1309
fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
1310
1311
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
1312
bld.ASR(tmp, g1, brw_imm_d(15));
1313
bld.NOT(*reg, tmp);
1314
} else if (devinfo->ver >= 6) {
1315
/* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1316
* a boolean result from this (~0/true or 0/false).
1317
*
1318
* We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1319
* this task in only one instruction:
1320
* - a negation source modifier will flip the bit; and
1321
* - a W -> D type conversion will sign extend the bit into the high
1322
* word of the destination.
1323
*
1324
* An ASR 15 fills the low word of the destination.
1325
*/
1326
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1327
g0.negate = true;
1328
1329
bld.ASR(*reg, g0, brw_imm_d(15));
1330
} else {
1331
/* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1332
* a boolean result from this (1/true or 0/false).
1333
*
1334
* Like in the above case, since the bit is the MSB of g1.6:UD we can use
1335
* the negation source modifier to flip it. Unfortunately the SHR
1336
* instruction only operates on UD (or D with an abs source modifier)
1337
* sources without negation.
1338
*
1339
* Instead, use ASR (which will give ~0/true or 0/false).
1340
*/
1341
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1342
g1_6.negate = true;
1343
1344
bld.ASR(*reg, g1_6, brw_imm_d(31));
1345
}
1346
1347
return reg;
1348
}
1349
1350
void
1351
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1352
{
1353
assert(stage == MESA_SHADER_FRAGMENT);
1354
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1355
assert(dst.type == BRW_REGISTER_TYPE_F);
1356
1357
if (wm_prog_data->persample_dispatch) {
1358
/* Convert int_sample_pos to floating point */
1359
bld.MOV(dst, int_sample_pos);
1360
/* Scale to the range [0, 1] */
1361
bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1362
}
1363
else {
1364
/* From ARB_sample_shading specification:
1365
* "When rendering to a non-multisample buffer, or if multisample
1366
* rasterization is disabled, gl_SamplePosition will always be
1367
* (0.5, 0.5).
1368
*/
1369
bld.MOV(dst, brw_imm_f(0.5f));
1370
}
1371
}
1372
1373
fs_reg *
1374
fs_visitor::emit_samplepos_setup()
1375
{
1376
assert(devinfo->ver >= 6);
1377
1378
const fs_builder abld = bld.annotate("compute sample position");
1379
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1380
fs_reg pos = *reg;
1381
fs_reg int_sample_x = vgrf(glsl_type::int_type);
1382
fs_reg int_sample_y = vgrf(glsl_type::int_type);
1383
1384
/* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1385
* mode will be enabled.
1386
*
1387
* From the Ivy Bridge PRM, volume 2 part 1, page 344:
1388
* R31.1:0 Position Offset X/Y for Slot[3:0]
1389
* R31.3:2 Position Offset X/Y for Slot[7:4]
1390
* .....
1391
*
1392
* The X, Y sample positions come in as bytes in thread payload. So, read
1393
* the positions using vstride=16, width=8, hstride=2.
1394
*/
1395
const fs_reg sample_pos_reg =
1396
fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
1397
1398
/* Compute gl_SamplePosition.x */
1399
abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
1400
compute_sample_position(offset(pos, abld, 0), int_sample_x);
1401
1402
/* Compute gl_SamplePosition.y */
1403
abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
1404
compute_sample_position(offset(pos, abld, 1), int_sample_y);
1405
return reg;
1406
}
1407
1408
fs_reg *
1409
fs_visitor::emit_sampleid_setup()
1410
{
1411
assert(stage == MESA_SHADER_FRAGMENT);
1412
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1413
assert(devinfo->ver >= 6);
1414
1415
const fs_builder abld = bld.annotate("compute sample id");
1416
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
1417
1418
if (!key->multisample_fbo) {
1419
/* As per GL_ARB_sample_shading specification:
1420
* "When rendering to a non-multisample buffer, or if multisample
1421
* rasterization is disabled, gl_SampleID will always be zero."
1422
*/
1423
abld.MOV(*reg, brw_imm_d(0));
1424
} else if (devinfo->ver >= 8) {
1425
/* Sample ID comes in as 4-bit numbers in g1.0:
1426
*
1427
* 15:12 Slot 3 SampleID (only used in SIMD16)
1428
* 11:8 Slot 2 SampleID (only used in SIMD16)
1429
* 7:4 Slot 1 SampleID
1430
* 3:0 Slot 0 SampleID
1431
*
1432
* Each slot corresponds to four channels, so we want to replicate each
1433
* half-byte value to 4 channels in a row:
1434
*
1435
* dst+0: .7 .6 .5 .4 .3 .2 .1 .0
1436
* 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
1437
*
1438
* dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
1439
* 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
1440
*
1441
* First, we read g1.0 with a <1,8,0>UB region, causing the first 8
1442
* channels to read the first byte (7:0), and the second group of 8
1443
* channels to read the second byte (15:8). Then, we shift right by
1444
* a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
1445
* values into place. Finally, we AND with 0xf to keep the low nibble.
1446
*
1447
* shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
1448
* and(16) dst<1>D tmp<8,8,1>W 0xf:W
1449
*
1450
* TODO: These payload bits exist on Gfx7 too, but they appear to always
1451
* be zero, so this code fails to work. We should find out why.
1452
*/
1453
const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
1454
1455
for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
1456
const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
1457
hbld.SHR(offset(tmp, hbld, i),
1458
stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
1459
1, 8, 0),
1460
brw_imm_v(0x44440000));
1461
}
1462
1463
abld.AND(*reg, tmp, brw_imm_w(0xf));
1464
} else {
1465
const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
1466
const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
1467
1468
/* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1469
* 8x multisampling, subspan 0 will represent sample N (where N
1470
* is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1471
* 7. We can find the value of N by looking at R0.0 bits 7:6
1472
* ("Starting Sample Pair Index (SSPI)") and multiplying by two
1473
* (since samples are always delivered in pairs). That is, we
1474
* compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1475
* we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1476
* case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1477
* 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1478
* populating a temporary variable with the sequence (0, 1, 2, 3),
1479
* and then reading from it using vstride=1, width=4, hstride=0.
1480
* These computations hold good for 4x multisampling as well.
1481
*
1482
* For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1483
* the first four slots are sample 0 of subspan 0; the next four
1484
* are sample 1 of subspan 0; the third group is sample 0 of
1485
* subspan 1, and finally sample 1 of subspan 1.
1486
*/
1487
1488
/* SKL+ has an extra bit for the Starting Sample Pair Index to
1489
* accommodate 16x MSAA.
1490
*/
1491
abld.exec_all().group(1, 0)
1492
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1493
brw_imm_ud(0xc0));
1494
abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1495
1496
/* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we
1497
* can assume 4x MSAA. Disallow it on IVB+
1498
*
1499
* FINISHME: One day, we could come up with a way to do this that
1500
* actually works on gfx7.
1501
*/
1502
if (devinfo->ver >= 7)
1503
limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
1504
abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));
1505
1506
/* This special instruction takes care of setting vstride=1,
1507
* width=4, hstride=0 of t2 during an ADD instruction.
1508
*/
1509
abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1510
}
1511
1512
return reg;
1513
}
1514
1515
fs_reg *
1516
fs_visitor::emit_samplemaskin_setup()
1517
{
1518
assert(stage == MESA_SHADER_FRAGMENT);
1519
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1520
assert(devinfo->ver >= 6);
1521
1522
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1523
1524
/* The HW doesn't provide us with expected values. */
1525
assert(!wm_prog_data->per_coarse_pixel_dispatch);
1526
1527
fs_reg coverage_mask =
1528
fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
1529
1530
if (wm_prog_data->persample_dispatch) {
1531
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
1532
* and a mask representing which sample is being processed by the
1533
* current shader invocation.
1534
*
1535
* From the OES_sample_variables specification:
1536
* "When per-sample shading is active due to the use of a fragment input
1537
* qualified by "sample" or due to the use of the gl_SampleID or
1538
* gl_SamplePosition variables, only the bit for the current sample is
1539
* set in gl_SampleMaskIn."
1540
*/
1541
const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
1542
1543
if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
1544
nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
1545
1546
fs_reg one = vgrf(glsl_type::int_type);
1547
fs_reg enabled_mask = vgrf(glsl_type::int_type);
1548
abld.MOV(one, brw_imm_d(1));
1549
abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
1550
abld.AND(*reg, enabled_mask, coverage_mask);
1551
} else {
1552
/* In per-pixel mode, the coverage mask is sufficient. */
1553
*reg = coverage_mask;
1554
}
1555
return reg;
1556
}
1557
1558
fs_reg *
1559
fs_visitor::emit_shading_rate_setup()
1560
{
1561
assert(devinfo->ver >= 11);
1562
1563
const fs_builder abld = bld.annotate("compute fragment shading rate");
1564
1565
fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD));
1566
1567
struct brw_wm_prog_data *wm_prog_data =
1568
brw_wm_prog_data(bld.shader->stage_prog_data);
1569
1570
/* Coarse pixel shading size fields overlap with other fields of not in
1571
* coarse pixel dispatch mode, so report 0 when that's not the case.
1572
*/
1573
if (wm_prog_data->per_coarse_pixel_dispatch) {
1574
/* The shading rates provided in the shader are the actual 2D shading
1575
* rate while the SPIR-V built-in is the enum value that has the shading
1576
* rate encoded as a bitfield. Fortunately, the bitfield value is just
1577
* the shading rate divided by two and shifted.
1578
*/
1579
1580
/* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
1581
fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
1582
/* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
1583
fs_reg actual_y = byte_offset(actual_x, 1);
1584
1585
fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
1586
fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
1587
1588
abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
1589
abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
1590
abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
1591
abld.OR(*reg, int_rate_x, int_rate_y);
1592
} else {
1593
abld.MOV(*reg, brw_imm_ud(0));
1594
}
1595
1596
return reg;
1597
}
1598
1599
fs_reg
1600
fs_visitor::resolve_source_modifiers(const fs_reg &src)
1601
{
1602
if (!src.abs && !src.negate)
1603
return src;
1604
1605
fs_reg temp = bld.vgrf(src.type);
1606
bld.MOV(temp, src);
1607
1608
return temp;
1609
}
1610
1611
void
1612
fs_visitor::emit_gs_thread_end()
1613
{
1614
assert(stage == MESA_SHADER_GEOMETRY);
1615
1616
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1617
1618
if (gs_compile->control_data_header_size_bits > 0) {
1619
emit_gs_control_data_bits(this->final_gs_vertex_count);
1620
}
1621
1622
const fs_builder abld = bld.annotate("thread end");
1623
fs_inst *inst;
1624
1625
if (gs_prog_data->static_vertex_count != -1) {
1626
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1627
if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1628
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1629
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1630
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1631
prev->eot = true;
1632
1633
/* Delete now dead instructions. */
1634
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1635
if (dead == prev)
1636
break;
1637
dead->remove();
1638
}
1639
return;
1640
} else if (prev->is_control_flow() || prev->has_side_effects()) {
1641
break;
1642
}
1643
}
1644
fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1645
abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1646
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1647
inst->mlen = 1;
1648
} else {
1649
fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1650
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1651
sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1652
sources[1] = this->final_gs_vertex_count;
1653
abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1654
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1655
inst->mlen = 2;
1656
}
1657
inst->eot = true;
1658
inst->offset = 0;
1659
}
1660
1661
void
1662
fs_visitor::assign_curb_setup()
1663
{
1664
unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1665
1666
unsigned ubo_push_length = 0;
1667
unsigned ubo_push_start[4];
1668
for (int i = 0; i < 4; i++) {
1669
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1670
ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1671
}
1672
1673
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1674
1675
uint64_t used = 0;
1676
1677
if (stage == MESA_SHADER_COMPUTE &&
1678
brw_cs_prog_data(prog_data)->uses_inline_data) {
1679
/* With COMPUTE_WALKER, we can push up to one register worth of data via
1680
* the inline data parameter in the COMPUTE_WALKER command itself.
1681
*
1682
* TODO: Support inline data and push at the same time.
1683
*/
1684
assert(devinfo->verx10 >= 125);
1685
assert(uniform_push_length <= 1);
1686
} else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) {
1687
fs_builder ubld = bld.exec_all().group(8, 0).at(
1688
cfg->first_block(), cfg->first_block()->start());
1689
1690
/* The base address for our push data is passed in as R0.0[31:6]. We
1691
* have to mask off the bottom 6 bits.
1692
*/
1693
fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1694
ubld.group(1, 0).AND(base_addr,
1695
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
1696
brw_imm_ud(INTEL_MASK(31, 6)));
1697
1698
fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1699
ubld.MOV(header0, brw_imm_ud(0));
1700
ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));
1701
1702
/* On Gfx12-HP we load constants at the start of the program using A32
1703
* stateless messages.
1704
*/
1705
for (unsigned i = 0; i < uniform_push_length;) {
1706
/* Limit ourselves to the HW limit of 8 Owords (8 * 16 bytes = 128 bytes
1707
* or 4 registers).
1708
*/
1709
unsigned num_regs = MIN2(uniform_push_length - i, 4);
1710
assert(num_regs > 0);
1711
num_regs = 1 << util_logbase2(num_regs);
1712
1713
fs_reg header;
1714
if (i == 0) {
1715
header = header0;
1716
} else {
1717
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1718
ubld.MOV(header, brw_imm_ud(0));
1719
ubld.group(1, 0).ADD(component(header, 2),
1720
component(header0, 2),
1721
brw_imm_ud(i * 2));
1722
}
1723
1724
fs_reg srcs[4] = {
1725
brw_imm_ud(0), /* desc */
1726
brw_imm_ud(0), /* ex_desc */
1727
header, /* payload */
1728
fs_reg(), /* payload2 */
1729
};
1730
1731
fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
1732
BRW_REGISTER_TYPE_UD);
1733
1734
/* This instruction has to be run SIMD16 if we're filling more than a
1735
* single register.
1736
*/
1737
unsigned send_width = MIN2(16, num_regs * 8);
1738
1739
fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
1740
dest, srcs, 4);
1741
send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1742
send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
1743
GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1744
BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
1745
send->header_size = 1;
1746
send->mlen = 1;
1747
send->size_written = num_regs * REG_SIZE;
1748
send->send_is_volatile = true;
1749
1750
i += num_regs;
1751
}
1752
1753
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1754
}
1755
1756
/* Map the offsets in the UNIFORM file to fixed HW regs. */
1757
foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758
for (unsigned int i = 0; i < inst->sources; i++) {
1759
if (inst->src[i].file == UNIFORM) {
1760
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1761
int constant_nr;
1762
if (inst->src[i].nr >= UBO_START) {
1763
/* constant_nr is in 32-bit units, the rest are in bytes */
1764
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1765
inst->src[i].offset / 4;
1766
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1767
constant_nr = push_constant_loc[uniform_nr];
1768
} else {
1769
/* Section 5.11 of the OpenGL 4.1 spec says:
1770
* "Out-of-bounds reads return undefined values, which include
1771
* values from other variables of the active program or zero."
1772
* Just return the first push constant.
1773
*/
1774
constant_nr = 0;
1775
}
1776
1777
assert(constant_nr / 8 < 64);
1778
used |= BITFIELD64_BIT(constant_nr / 8);
1779
1780
struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1781
constant_nr / 8,
1782
constant_nr % 8);
1783
brw_reg.abs = inst->src[i].abs;
1784
brw_reg.negate = inst->src[i].negate;
1785
1786
assert(inst->src[i].stride == 0);
1787
inst->src[i] = byte_offset(
1788
retype(brw_reg, inst->src[i].type),
1789
inst->src[i].offset % 4);
1790
}
1791
}
1792
}
1793
1794
uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1795
if (want_zero) {
1796
assert(!compiler->compact_params);
1797
fs_builder ubld = bld.exec_all().group(8, 0).at(
1798
cfg->first_block(), cfg->first_block()->start());
1799
1800
/* push_reg_mask_param is in 32-bit units */
1801
unsigned mask_param = stage_prog_data->push_reg_mask_param;
1802
struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
1803
mask_param % 8);
1804
1805
fs_reg b32;
1806
for (unsigned i = 0; i < 64; i++) {
1807
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1808
fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
1809
ubld.SHL(horiz_offset(shifted, 8),
1810
byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
1811
brw_imm_v(0x01234567));
1812
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1813
1814
fs_builder ubld16 = ubld.group(16, 0);
1815
b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
1816
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1817
}
1818
1819
if (want_zero & BITFIELD64_BIT(i)) {
1820
assert(i < prog_data->curb_read_length);
1821
struct brw_reg push_reg =
1822
retype(brw_vec8_grf(payload.num_regs + i, 0),
1823
BRW_REGISTER_TYPE_D);
1824
1825
ubld.AND(push_reg, push_reg, component(b32, i % 16));
1826
}
1827
}
1828
1829
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1830
}
1831
1832
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1833
this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1834
}

/*
 * Build up an array of indices into the urb_setup array that
 * references the active entries of the urb_setup array.
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}
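
/* As a quick illustration with made-up values: if only
 * urb_setup[VARYING_SLOT_POS] == 0 and urb_setup[VARYING_SLOT_TEX0] == 1 are
 * active (every other entry is -1), the loop above yields
 *
 *    urb_setup_attribs       == { VARYING_SLOT_POS, VARYING_SLOT_TEX0 }
 *    urb_setup_attribs_count == 2
 *
 * so later uploads only walk those two entries instead of all
 * VARYING_SLOT_MAX slots.
 */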

static void
calculate_urb_setup(const struct intel_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir)
{
   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->ver >= 6) {
      if (util_bitcount64(nir->info.inputs_read &
                          BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */

         /* Re-compute the VUE map here in the case that the one coming from
          * geometry has more than one position slot (used for Primitive
          * Replication).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader, 1);

         int first_slot =
            brw_compute_first_urb_slot_required(nir->info.inputs_read,
                                                &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
   prog_data->inputs = nir->info.inputs_read;

   brw_compute_urb_setup_index(prog_data);
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR regs in the FS are in units of logical scalar inputs each
             * of which consumes half of a GRF register.
             */
            assert(inst->src[i].offset < REG_SIZE / 2);
            const unsigned grf = urb_start + inst->src[i].nr / 2;
            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
                                    inst->src[i].offset;
            const unsigned width = inst->src[i].stride == 0 ?
                                   1 : MIN2(inst->exec_size, 8);
            struct brw_reg reg = stride(
               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                           offset),
               width * inst->src[i].stride,
               width, inst->src[i].stride);
            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}

void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].nr +
                   inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               type_sz(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}

void
fs_visitor::assign_vs_urb_setup()
{
   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

   assert(stage == MESA_SHADER_VERTEX);

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tcs_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tes_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_gs_urb_setup()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf +=
      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* Rewrite all ATTR file references to GRFs. */
      convert_attr_sources_to_hw_regs(inst);
   }
}
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   /* Compact the register file so we eliminate dead vgrfs.  This
    * only defines split points for live registers, so if we have
    * too large dead registers they will hit assertions later.
    */
   compact_virtual_grfs();

   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         /* UNDEF instructions are currently only used to undef entire
          * registers.  We need this invariant later when we split them.
          */
         assert(inst->dst.file == VGRF);
         assert(inst->dst.offset == 0);
         assert(inst->size_written == alloc.sizes[inst->dst.nr] * REG_SIZE);
         continue;
      }

      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int *new_virtual_grf = new int[reg_count];
   int *new_reg_offset = new int[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         const fs_builder ibld(this, block, inst);
         assert(inst->size_written % REG_SIZE == 0);
         unsigned reg_offset = 0;
         while (reg_offset < inst->size_written / REG_SIZE) {
            reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
            ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
            reg_offset += alloc.sizes[new_virtual_grf[reg]];
         }
         inst->remove(block);
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         inst->dst.nr = new_virtual_grf[reg];
         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                            inst->dst.offset % REG_SIZE;
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);

   delete[] split_points;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int *remap_table = new int[this->alloc.count];
   memset(remap_table, -1, this->alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

static int
get_subgroup_id_param_index(const intel_device_info *devinfo,
                            const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Struct for handling complex alignments.
 *
 * A complex alignment is stored as multiplier and an offset.  A value is
 * considered to be aligned if it is {offset} larger than a multiple of {mul}.
 * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
 * following:
 *
 *  N  | cplx_align_apply({8, 2}, N)
 * ----+-----------------------------
 *  4  | 6
 *  6  | 6
 *  8  | 14
 *  10 | 14
 *  12 | 14
 *  14 | 14
 *  16 | 22
 */
struct cplx_align {
   unsigned mul:4;
   unsigned offset:4;
};

#define CPLX_ALIGN_MAX_MUL 8

static void
cplx_align_assert_sane(struct cplx_align a)
{
   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
   assert(a.offset < a.mul);
}

/**
 * Combines two alignments to produce a least multiple of sorts.
 *
 * The returned alignment is the smallest (in terms of multiplier) such that
 * anything aligned to both a and b will be aligned to the new alignment.
 * This function will assert-fail if a and b are not compatible, i.e. if the
 * offset parameters are such that no common alignment is possible.
 */
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
   cplx_align_assert_sane(a);
   cplx_align_assert_sane(b);

   /* Assert that the alignments agree. */
   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));

   return a.mul > b.mul ? a : b;
}
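
/* A short worked example for cplx_align_combine() above, using made-up
 * alignments: combining {4, 0} with {8, 4} passes the compatibility assert
 * because (0 & 7) == (4 & 3) == 0, and returns {8, 4}; anything that is both
 * a multiple of 4 and 4 more than a multiple of 8 is exactly the set of
 * values that are 4 more than a multiple of 8.
 */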

/**
 * Apply a complex alignment
 *
 * This function will return the smallest number greater than or equal to
 * offset that is aligned to align.
 */
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
   return ALIGN(offset - align.offset, align.mul) + align.offset;
}

#define UNIFORM_SLOT_SIZE 4

struct uniform_slot_info {
   /** True if the given uniform slot is live */
   unsigned is_live:1;

   /** True if this slot and the next slot must remain contiguous */
   unsigned contiguous:1;

   struct cplx_align align;
};

static void
mark_uniform_slots_read(struct uniform_slot_info *slots,
                        unsigned num_slots, unsigned alignment)
{
   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
   assert(alignment <= CPLX_ALIGN_MAX_MUL);

   /* We can't align a slot to anything less than the slot size */
   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);

   struct cplx_align align = {alignment, 0};
   cplx_align_assert_sane(align);

   for (unsigned i = 0; i < num_slots; i++) {
      slots[i].is_live = true;
      if (i < num_slots - 1)
         slots[i].contiguous = true;

      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
      if (slots[i].align.mul == 0) {
         slots[i].align = align;
      } else {
         slots[i].align = cplx_align_combine(slots[i].align, align);
      }
   }
}
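
/* A concrete (hypothetical) call of the helper above, assuming the slots
 * array starts out zeroed: mark_uniform_slots_read(&slots[u], 2, 8) marks
 * slots u and u + 1 live, sets slots[u].contiguous, and stores an alignment
 * of {8, 0} for slot u and {8, 4} for slot u + 1, since the second slot
 * begins UNIFORM_SLOT_SIZE bytes into the 8-byte-aligned pair.
 */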

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile gets to decide on locations. */
   if (push_constant_loc) {
      assert(pull_constant_loc);
      return;
   }

   if (compiler->compact_params) {
      struct uniform_slot_info slots[uniforms + 1];
      memset(slots, 0, sizeof(slots));

      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
         for (int i = 0 ; i < inst->sources; i++) {
            if (inst->src[i].file != UNIFORM)
               continue;

            /* NIR tightly packs things so the uniform number might not be
             * aligned (if we have a double right after a float, for
             * instance).  This is fine because the process of re-arranging
             * them will ensure that things are properly aligned.  The offset
             * into that uniform, however, must be aligned.
             *
             * In Vulkan, we have explicit offsets but everything is crammed
             * into a single "variable" so inst->src[i].nr will always be 0.
             * Everything will be properly aligned relative to that one base.
             */
            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);

            unsigned u = inst->src[i].nr +
                         inst->src[i].offset / UNIFORM_SLOT_SIZE;

            if (u >= uniforms)
               continue;

            unsigned slots_read;
            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
            } else {
               unsigned bytes_read = inst->components_read(i) *
                                     type_sz(inst->src[i].type);
               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
            }

            assert(u + slots_read <= uniforms);
            mark_uniform_slots_read(&slots[u], slots_read,
                                    type_sz(inst->src[i].type));
         }
      }

      int subgroup_id_index = get_subgroup_id_param_index(devinfo,
                                                          stage_prog_data);

      /* Only allow 16 registers (128 uniform components) as push constants.
       *
       * Just demote the end of the list.  We could probably do better
       * here, demoting things that are rarely used in the program first.
       *
       * If changing this value, note the limitation about total_regs in
       * brw_curbe.c.
       */
      unsigned int max_push_components = 16 * 8;
      if (subgroup_id_index >= 0)
         max_push_components--; /* Save a slot for the thread ID */

      /* We push small arrays, but no bigger than 16 floats.  This is big
       * enough for a vec4 but hopefully not large enough to push out other
       * stuff.  We should probably use a better heuristic at some point.
       */
      const unsigned int max_chunk_size = 16;

      unsigned int num_push_constants = 0;
      unsigned int num_pull_constants = 0;

      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

      /* Default to -1 meaning no location */
      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));

      int chunk_start = -1;
      struct cplx_align align;
      for (unsigned u = 0; u < uniforms; u++) {
         if (!slots[u].is_live) {
            assert(chunk_start == -1);
            continue;
         }

         /* Skip subgroup_id_index to put it in the last push register. */
         if (subgroup_id_index == (int)u)
            continue;

         if (chunk_start == -1) {
            chunk_start = u;
            align = slots[u].align;
         } else {
            /* Offset into the chunk */
            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;

            /* Shift the slot alignment down by the chunk offset so it is
             * comparable with the base chunk alignment.
             */
            struct cplx_align slot_align = slots[u].align;
            slot_align.offset =
               (slot_align.offset - chunk_offset) & (align.mul - 1);

            align = cplx_align_combine(align, slot_align);
         }

         /* Sanity check the alignment */
         cplx_align_assert_sane(align);

         if (slots[u].contiguous)
            continue;

         /* Adjust the alignment to be in terms of slots, not bytes */
         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
         align.mul /= UNIFORM_SLOT_SIZE;
         align.offset /= UNIFORM_SLOT_SIZE;

         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
         unsigned chunk_size = u - chunk_start + 1;
         if ((!compiler->supports_pull_constants && u < UBO_START) ||
             (chunk_size < max_chunk_size &&
              push_start_align + chunk_size <= max_push_components)) {
            /* Align up the number of push constants */
            num_push_constants = push_start_align;
            for (unsigned i = 0; i < chunk_size; i++)
               push_constant_loc[chunk_start + i] = num_push_constants++;
         } else {
            /* We need to pull this one */
            num_pull_constants = cplx_align_apply(align, num_pull_constants);
            for (unsigned i = 0; i < chunk_size; i++)
               pull_constant_loc[chunk_start + i] = num_pull_constants++;
         }

         /* Reset the chunk and start again */
         chunk_start = -1;
      }

      /* Add the CS local thread ID uniform at the end of the push constants */
      if (subgroup_id_index >= 0)
         push_constant_loc[subgroup_id_index] = num_push_constants++;

      /* As the uniforms are going to be reordered, stash the old array and
       * create two new arrays for push/pull params.
       */
      uint32_t *param = stage_prog_data->param;
      stage_prog_data->nr_params = num_push_constants;
      if (num_push_constants) {
         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
                                                num_push_constants);
      } else {
         stage_prog_data->param = NULL;
      }
      assert(stage_prog_data->nr_pull_params == 0);
      assert(stage_prog_data->pull_param == NULL);
      if (num_pull_constants > 0) {
         stage_prog_data->nr_pull_params = num_pull_constants;
         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
                                                     num_pull_constants);
      }

      /* Up until now, the param[] array has been indexed by reg + offset
       * of UNIFORM registers.  Move pull constants into pull_param[] and
       * condense param[] to only contain the uniforms we chose to push.
       *
       * NOTE: Because we are condensing the params[] array, we know that
       * push_constant_loc[i] <= i and we can do it in one smooth loop without
       * having to make a copy.
       */
      for (unsigned int i = 0; i < uniforms; i++) {
         uint32_t value = param[i];
         if (pull_constant_loc[i] != -1) {
            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
         } else if (push_constant_loc[i] != -1) {
            stage_prog_data->param[push_constant_loc[i]] = value;
         }
      }
      ralloc_free(param);
   } else {
      /* If we don't want to compact anything, just set up dummy push/pull
       * arrays.  All the rest of the compiler cares about are these arrays.
       */
      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

      for (unsigned u = 0; u < uniforms; u++)
         push_constant_loc[u] = u;

      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
   }

   /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
    */
   /* For gen4/5:
    * Only allow 16 registers (128 uniform components) as push constants.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c/crocus_state.c
    */
   const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
   for (int i = 0; i < 4; i++) {
      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      if (push_length + range->length > max_push_length)
         range->length = max_push_length - push_length;

      push_length += range->length;
   }
   assert(push_length <= max_push_length);
}

bool
fs_visitor::get_pull_locs(const fs_reg &src,
                          unsigned *out_surf_index,
                          unsigned *out_pull_index)
{
   assert(src.file == UNIFORM);

   if (src.nr >= UBO_START) {
      const struct brw_ubo_range *range =
         &prog_data->ubo_ranges[src.nr - UBO_START];

      /* If this access is in our (reduced) range, use the push data. */
      if (src.offset / 32 < range->length)
         return false;

      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
      *out_pull_index = (32 * range->start + src.offset) / 4;

      prog_data->has_ubo_pull = true;
      return true;
   }

   const unsigned location = src.nr + src.offset / 4;

   if (location < uniforms && pull_constant_loc[location] != -1) {
      /* A regular uniform push constant */
      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
      *out_pull_index = pull_constant_loc[location];

      prog_data->has_ubo_pull = true;
      return true;
   }

   return false;
}

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::lower_constant_loads()
{
   unsigned index, pull_index;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(this, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         const unsigned base = pull_index * 4;

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {

         if (!get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                    brw_imm_ud(index),
                                    inst->src[1],
                                    pull_index * 4, 4);
         inst->remove(block);
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
2774
2775
bool
2776
fs_visitor::opt_algebraic()
2777
{
2778
bool progress = false;
2779
2780
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2781
switch (inst->opcode) {
2782
case BRW_OPCODE_MOV:
2783
if (!devinfo->has_64bit_float &&
2784
!devinfo->has_64bit_int &&
2785
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
2786
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2787
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2788
assert(inst->dst.type == inst->src[0].type);
2789
assert(!inst->saturate);
2790
assert(!inst->src[0].abs);
2791
assert(!inst->src[0].negate);
2792
const brw::fs_builder ibld(this, block, inst);
2793
2794
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2795
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
2796
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2797
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));
2798
2799
inst->remove(block);
2800
progress = true;
2801
}
2802
2803
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2804
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2805
inst->dst.is_null() &&
2806
(inst->src[0].abs || inst->src[0].negate)) {
2807
inst->src[0].abs = false;
2808
inst->src[0].negate = false;
2809
progress = true;
2810
break;
2811
}
2812
2813
if (inst->src[0].file != IMM)
2814
break;
2815
2816
if (inst->saturate) {
2817
/* Full mixed-type saturates don't happen. However, we can end up
2818
* with things like:
2819
*
2820
* mov.sat(8) g21<1>DF -1F
2821
*
2822
* Other mixed-size-but-same-base-type cases may also be possible.
2823
*/
2824
if (inst->dst.type != inst->src[0].type &&
2825
inst->dst.type != BRW_REGISTER_TYPE_DF &&
2826
inst->src[0].type != BRW_REGISTER_TYPE_F)
2827
assert(!"unimplemented: saturate mixed types");
2828
2829
if (brw_saturate_immediate(inst->src[0].type,
2830
&inst->src[0].as_brw_reg())) {
2831
inst->saturate = false;
2832
progress = true;
2833
}
2834
}
2835
break;
2836
2837
case BRW_OPCODE_MUL:
2838
if (inst->src[1].file != IMM)
2839
continue;
2840
2841
/* a * 1.0 = a */
2842
if (inst->src[1].is_one()) {
2843
inst->opcode = BRW_OPCODE_MOV;
2844
inst->src[1] = reg_undef;
2845
progress = true;
2846
break;
2847
}
2848
2849
/* a * -1.0 = -a */
2850
if (inst->src[1].is_negative_one()) {
2851
inst->opcode = BRW_OPCODE_MOV;
2852
inst->src[0].negate = !inst->src[0].negate;
2853
inst->src[1] = reg_undef;
2854
progress = true;
2855
break;
2856
}
2857
2858
break;
2859
case BRW_OPCODE_ADD:
2860
if (inst->src[1].file != IMM)
2861
continue;
2862
2863
if (brw_reg_type_is_integer(inst->src[1].type) &&
2864
inst->src[1].is_zero()) {
2865
inst->opcode = BRW_OPCODE_MOV;
2866
inst->src[1] = reg_undef;
2867
progress = true;
2868
break;
2869
}
2870
2871
if (inst->src[0].file == IMM) {
2872
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2873
inst->opcode = BRW_OPCODE_MOV;
2874
inst->src[0].f += inst->src[1].f;
2875
inst->src[1] = reg_undef;
2876
progress = true;
2877
break;
2878
}
2879
break;
2880
case BRW_OPCODE_OR:
2881
if (inst->src[0].equals(inst->src[1]) ||
2882
inst->src[1].is_zero()) {
2883
/* On Gfx8+, the OR instruction can have a source modifier that
2884
* performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
2885
* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2886
*/
2887
if (inst->src[0].negate) {
2888
inst->opcode = BRW_OPCODE_NOT;
2889
inst->src[0].negate = false;
2890
} else {
2891
inst->opcode = BRW_OPCODE_MOV;
2892
}
2893
inst->src[1] = reg_undef;
2894
progress = true;
2895
break;
2896
}
2897
break;
2898
case BRW_OPCODE_CMP:
2899
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2900
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2901
inst->src[1].is_zero() &&
2902
(inst->src[0].abs || inst->src[0].negate)) {
2903
inst->src[0].abs = false;
2904
inst->src[0].negate = false;
2905
progress = true;
2906
break;
2907
}
2908
break;
2909
case BRW_OPCODE_SEL:
2910
if (!devinfo->has_64bit_float &&
2911
!devinfo->has_64bit_int &&
2912
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
2913
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2914
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2915
assert(inst->dst.type == inst->src[0].type);
2916
assert(!inst->saturate);
2917
assert(!inst->src[0].abs && !inst->src[0].negate);
2918
assert(!inst->src[1].abs && !inst->src[1].negate);
2919
const brw::fs_builder ibld(this, block, inst);
2920
2921
set_predicate(inst->predicate,
2922
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2923
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
2924
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
2925
set_predicate(inst->predicate,
2926
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2927
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
2928
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));
2929
2930
inst->remove(block);
2931
progress = true;
2932
}
2933
if (inst->src[0].equals(inst->src[1])) {
2934
inst->opcode = BRW_OPCODE_MOV;
2935
inst->src[1] = reg_undef;
2936
inst->predicate = BRW_PREDICATE_NONE;
2937
inst->predicate_inverse = false;
2938
progress = true;
2939
} else if (inst->saturate && inst->src[1].file == IMM) {
2940
switch (inst->conditional_mod) {
2941
case BRW_CONDITIONAL_LE:
2942
case BRW_CONDITIONAL_L:
2943
switch (inst->src[1].type) {
2944
case BRW_REGISTER_TYPE_F:
2945
if (inst->src[1].f >= 1.0f) {
2946
inst->opcode = BRW_OPCODE_MOV;
2947
inst->src[1] = reg_undef;
2948
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2949
progress = true;
2950
}
2951
break;
2952
default:
2953
break;
2954
}
2955
break;
2956
case BRW_CONDITIONAL_GE:
2957
case BRW_CONDITIONAL_G:
2958
switch (inst->src[1].type) {
2959
case BRW_REGISTER_TYPE_F:
2960
if (inst->src[1].f <= 0.0f) {
2961
inst->opcode = BRW_OPCODE_MOV;
2962
inst->src[1] = reg_undef;
2963
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2964
progress = true;
2965
}
2966
break;
2967
default:
2968
break;
2969
}
2970
default:
2971
break;
2972
}
2973
}
2974
break;
2975
case BRW_OPCODE_MAD:
2976
if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
2977
inst->src[1].type != BRW_REGISTER_TYPE_F ||
2978
inst->src[2].type != BRW_REGISTER_TYPE_F)
2979
break;
2980
if (inst->src[1].is_one()) {
2981
inst->opcode = BRW_OPCODE_ADD;
2982
inst->src[1] = inst->src[2];
2983
inst->src[2] = reg_undef;
2984
progress = true;
2985
} else if (inst->src[2].is_one()) {
2986
inst->opcode = BRW_OPCODE_ADD;
2987
inst->src[2] = reg_undef;
2988
progress = true;
2989
}
2990
break;
2991
case SHADER_OPCODE_BROADCAST:
2992
if (is_uniform(inst->src[0])) {
2993
inst->opcode = BRW_OPCODE_MOV;
2994
inst->sources = 1;
2995
inst->force_writemask_all = true;
2996
progress = true;
2997
} else if (inst->src[1].file == IMM) {
2998
inst->opcode = BRW_OPCODE_MOV;
2999
/* It's possible that the selected component will be too large and
3000
* overflow the register. This can happen if someone does a
3001
* readInvocation() from GLSL or SPIR-V and provides an OOB
3002
* invocationIndex. If this happens and we some how manage
3003
* to constant fold it in and get here, then component() may cause
3004
* us to start reading outside of the VGRF which will lead to an
3005
* assert later. Instead, just let it wrap around if it goes over
3006
* exec_size.
3007
*/
3008
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
3009
inst->src[0] = component(inst->src[0], comp);
3010
inst->sources = 1;
3011
inst->force_writemask_all = true;
3012
progress = true;
3013
}
3014
break;
3015
3016
case SHADER_OPCODE_SHUFFLE:
3017
if (is_uniform(inst->src[0])) {
3018
inst->opcode = BRW_OPCODE_MOV;
3019
inst->sources = 1;
3020
progress = true;
3021
} else if (inst->src[1].file == IMM) {
3022
inst->opcode = BRW_OPCODE_MOV;
3023
inst->src[0] = component(inst->src[0],
3024
inst->src[1].ud);
3025
inst->sources = 1;
3026
progress = true;
3027
}
3028
break;
3029
3030
default:
3031
break;
3032
}
3033
3034
/* Swap if src[0] is immediate. */
3035
if (progress && inst->is_commutative()) {
3036
if (inst->src[0].file == IMM) {
3037
fs_reg tmp = inst->src[1];
3038
inst->src[1] = inst->src[0];
3039
inst->src[0] = tmp;
3040
}
3041
}
3042
}
3043
3044
if (progress)
3045
invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
3046
DEPENDENCY_INSTRUCTION_DETAIL);
3047
3048
return progress;
3049
}
3050
3051
/**
3052
* Optimize sample messages that have constant zero values for the trailing
3053
* texture coordinates. We can just reduce the message length for these
3054
* instructions instead of reserving a register for it. Trailing parameters
3055
* that aren't sent default to zero anyway. This will cause the dead code
3056
* eliminator to remove the MOV instruction that would otherwise be emitted to
3057
* set up the zero value.
3058
*/
3059
bool
3060
fs_visitor::opt_zero_samples()
3061
{
3062
/* Gfx4 infers the texturing opcode based on the message length so we can't
3063
* change it. Gfx12.5 has restrictions on the number of coordinate
3064
* parameters that have to be provided for some texture types
3065
* (Wa_14013363432).
3066
*/
3067
if (devinfo->ver < 5 || devinfo->verx10 == 125)
3068
return false;
3069
3070
bool progress = false;
3071
3072
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3073
if (!inst->is_tex())
3074
continue;
3075
3076
fs_inst *load_payload = (fs_inst *) inst->prev;
3077
3078
if (load_payload->is_head_sentinel() ||
3079
load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3080
continue;
3081
3082
/* We don't want to remove the message header or the first parameter.
3083
* Removing the first parameter is not allowed, see the Haswell PRM
3084
* volume 7, page 149:
3085
*
3086
* "Parameter 0 is required except for the sampleinfo message, which
3087
* has no parameter 0"
3088
*/
3089
while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
3090
load_payload->src[(inst->mlen - inst->header_size) /
3091
(inst->exec_size / 8) +
3092
inst->header_size - 1].is_zero()) {
3093
inst->mlen -= inst->exec_size / 8;
3094
progress = true;
3095
}
3096
}
3097
3098
if (progress)
3099
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3100
3101
return progress;
3102
}
3103
3104
bool
3105
fs_visitor::opt_register_renaming()
3106
{
3107
bool progress = false;
3108
int depth = 0;
3109
3110
unsigned remap[alloc.count];
3111
memset(remap, ~0u, sizeof(unsigned) * alloc.count);
3112
3113
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3114
if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
3115
depth++;
3116
} else if (inst->opcode == BRW_OPCODE_ENDIF ||
3117
inst->opcode == BRW_OPCODE_WHILE) {
3118
depth--;
3119
}
3120
3121
/* Rewrite instruction sources. */
3122
for (int i = 0; i < inst->sources; i++) {
3123
if (inst->src[i].file == VGRF &&
3124
remap[inst->src[i].nr] != ~0u &&
3125
remap[inst->src[i].nr] != inst->src[i].nr) {
3126
inst->src[i].nr = remap[inst->src[i].nr];
3127
progress = true;
3128
}
3129
}
3130
3131
const unsigned dst = inst->dst.nr;
3132
3133
if (depth == 0 &&
3134
inst->dst.file == VGRF &&
3135
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
3136
!inst->is_partial_write()) {
3137
if (remap[dst] == ~0u) {
3138
remap[dst] = dst;
3139
} else {
3140
remap[dst] = alloc.allocate(regs_written(inst));
3141
inst->dst.nr = remap[dst];
3142
progress = true;
3143
}
3144
} else if (inst->dst.file == VGRF &&
3145
remap[dst] != ~0u &&
3146
remap[dst] != dst) {
3147
inst->dst.nr = remap[dst];
3148
progress = true;
3149
}
3150
}
3151
3152
if (progress) {
3153
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
3154
DEPENDENCY_VARIABLES);
3155
3156
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
3157
if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
3158
delta_xy[i].nr = remap[delta_xy[i].nr];
3159
}
3160
}
3161
}
3162
3163
return progress;
3164
}
3165
3166
/**
3167
* Remove redundant or useless halts.
3168
*
3169
* For example, we can eliminate halts in the following sequence:
3170
*
3171
* halt (redundant with the next halt)
3172
* halt (useless; jumps to the next instruction)
3173
* halt-target
3174
*/
3175
bool
3176
fs_visitor::opt_redundant_halt()
3177
{
3178
bool progress = false;
3179
3180
unsigned halt_count = 0;
3181
fs_inst *halt_target = NULL;
3182
bblock_t *halt_target_block = NULL;
3183
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3184
if (inst->opcode == BRW_OPCODE_HALT)
3185
halt_count++;
3186
3187
if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
3188
halt_target = inst;
3189
halt_target_block = block;
3190
break;
3191
}
3192
}
3193
3194
if (!halt_target) {
3195
assert(halt_count == 0);
3196
return false;
3197
}
3198
3199
/* Delete any HALTs immediately before the halt target. */
3200
for (fs_inst *prev = (fs_inst *) halt_target->prev;
3201
!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
3202
prev = (fs_inst *) halt_target->prev) {
3203
prev->remove(halt_target_block);
3204
halt_count--;
3205
progress = true;
3206
}
3207
3208
if (halt_count == 0) {
3209
halt_target->remove(halt_target_block);
3210
progress = true;
3211
}
3212
3213
if (progress)
3214
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3215
3216
return progress;
3217
}
3218
3219
/**
3220
* Compute a bitmask with GRF granularity with a bit set for each GRF starting
3221
* from \p r.offset which overlaps the region starting at \p s.offset and
3222
* spanning \p ds bytes.
3223
*/
3224
static inline unsigned
3225
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
3226
{
3227
const int rel_offset = reg_offset(s) - reg_offset(r);
3228
const int shift = rel_offset / REG_SIZE;
3229
const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3230
assert(reg_space(r) == reg_space(s) &&
3231
shift >= 0 && shift < int(8 * sizeof(unsigned)));
3232
return ((1 << n) - 1) << shift;
3233
}
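
/* A concrete (hypothetical) case for mask_relative_to() above, with
 * REG_SIZE == 32: if s starts exactly one GRF after r and the region spans
 * ds == 40 bytes, then rel_offset == 32, shift == 1 and
 * n == DIV_ROUND_UP(0 + 40, 32) == 2, so the returned mask is 0b110, i.e.
 * the second and third GRFs counted from r.offset.
 */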
3234
3235
bool
3236
fs_visitor::compute_to_mrf()
3237
{
3238
bool progress = false;
3239
int next_ip = 0;
3240
3241
/* No MRFs on Gen >= 7. */
3242
if (devinfo->ver >= 7)
3243
return false;
3244
3245
const fs_live_variables &live = live_analysis.require();
3246
3247
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3248
int ip = next_ip;
3249
next_ip++;
3250
3251
if (inst->opcode != BRW_OPCODE_MOV ||
3252
inst->is_partial_write() ||
3253
inst->dst.file != MRF || inst->src[0].file != VGRF ||
3254
inst->dst.type != inst->src[0].type ||
3255
inst->src[0].abs || inst->src[0].negate ||
3256
!inst->src[0].is_contiguous() ||
3257
inst->src[0].offset % REG_SIZE != 0)
3258
continue;
3259
3260
/* Can't compute-to-MRF this GRF if someone else was going to
3261
* read it later.
3262
*/
3263
if (live.vgrf_end[inst->src[0].nr] > ip)
3264
continue;
3265
3266
/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
3267
* things that computed the value of all GRFs of the source region. The
3268
* regs_left bitset keeps track of the registers we haven't yet found a
3269
* generating instruction for.
3270
*/
3271
unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
3272
3273
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3274
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3275
inst->src[0], inst->size_read(0))) {
3276
/* Found the last thing to write our reg we want to turn
3277
* into a compute-to-MRF.
3278
*/
3279
3280
/* If this one instruction didn't populate all the
3281
* channels, bail. We might be able to rewrite everything
3282
* that writes that reg, but it would require smarter
3283
* tracking.
3284
*/
3285
if (scan_inst->is_partial_write())
3286
break;
3287
3288
/* Handling things not fully contained in the source of the copy
3289
* would need us to understand coalescing out more than one MOV at
3290
* a time.
3291
*/
3292
if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
3293
inst->src[0], inst->size_read(0)))
3294
break;
3295
3296
/* SEND instructions can't have MRF as a destination. */
3297
if (scan_inst->mlen)
3298
break;
3299
3300
if (devinfo->ver == 6) {
3301
/* gfx6 math instructions must have the destination be
3302
* GRF, so no compute-to-MRF for them.
3303
*/
3304
if (scan_inst->is_math()) {
3305
break;
3306
}
3307
}
3308
3309
/* Clear the bits for any registers this instruction overwrites. */
3310
regs_left &= ~mask_relative_to(
3311
inst->src[0], scan_inst->dst, scan_inst->size_written);
3312
if (!regs_left)
3313
break;
3314
}
3315
3316
/* We don't handle control flow here. Most computation of
3317
* values that end up in MRFs are shortly before the MRF
3318
* write anyway.
3319
*/
3320
if (block->start() == scan_inst)
3321
break;
3322
3323
/* You can't read from an MRF, so if someone else reads our
3324
* MRF's source GRF that we wanted to rewrite, that stops us.
3325
*/
3326
bool interfered = false;
3327
for (int i = 0; i < scan_inst->sources; i++) {
3328
if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
3329
inst->src[0], inst->size_read(0))) {
3330
interfered = true;
3331
}
3332
}
3333
if (interfered)
3334
break;
3335
3336
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3337
inst->dst, inst->size_written)) {
3338
/* If somebody else writes our MRF here, we can't
3339
* compute-to-MRF before that.
3340
*/
3341
break;
3342
}
3343
3344
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
3345
regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
3346
inst->dst, inst->size_written)) {
3347
/* Found a SEND instruction, which means that there are
3348
* live values in MRFs from base_mrf to base_mrf +
3349
* scan_inst->mlen - 1. Don't go pushing our MRF write up
3350
* above it.
3351
*/
3352
break;
3353
}
3354
}
3355
3356
if (regs_left)
3357
continue;
3358
3359
/* Found all generating instructions of our MRF's source value, so it
3360
* should be safe to rewrite them to point to the MRF directly.
3361
*/
3362
regs_left = (1 << regs_read(inst, 0)) - 1;
3363
3364
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3365
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3366
inst->src[0], inst->size_read(0))) {
3367
/* Clear the bits for any registers this instruction overwrites. */
3368
regs_left &= ~mask_relative_to(
3369
inst->src[0], scan_inst->dst, scan_inst->size_written);
3370
3371
const unsigned rel_offset = reg_offset(scan_inst->dst) -
3372
reg_offset(inst->src[0]);
3373
3374
if (inst->dst.nr & BRW_MRF_COMPR4) {
3375
/* Apply the same address transformation done by the hardware
3376
* for COMPR4 MRF writes.
3377
*/
3378
assert(rel_offset < 2 * REG_SIZE);
3379
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
3380
3381
/* Clear the COMPR4 bit if the generating instruction is not
3382
* compressed.
3383
*/
3384
if (scan_inst->size_written < 2 * REG_SIZE)
3385
scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
3386
3387
} else {
3388
/* Calculate the MRF number the result of this instruction is
3389
* ultimately written to.
3390
*/
3391
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
3392
}
3393
3394
scan_inst->dst.file = MRF;
3395
scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
3396
scan_inst->saturate |= inst->saturate;
3397
if (!regs_left)
3398
break;
3399
}
3400
}
3401
3402
assert(!regs_left);
3403
inst->remove(block);
3404
progress = true;
3405
}
3406
3407
if (progress)
3408
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3409
3410
return progress;
3411
}
3412
3413
/**
3414
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3415
* flow. We could probably do better here with some form of divergence
3416
* analysis.
3417
*/
3418
bool
3419
fs_visitor::eliminate_find_live_channel()
3420
{
3421
bool progress = false;
3422
unsigned depth = 0;
3423
3424
if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
3425
/* The optimization below assumes that channel zero is live on thread
3426
* dispatch, which may not be the case if the fixed function dispatches
3427
* threads sparsely.
3428
*/
3429
return false;
3430
}
3431
3432
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3433
switch (inst->opcode) {
3434
case BRW_OPCODE_IF:
3435
case BRW_OPCODE_DO:
3436
depth++;
3437
break;
3438
3439
case BRW_OPCODE_ENDIF:
3440
case BRW_OPCODE_WHILE:
3441
depth--;
3442
break;
3443
3444
case BRW_OPCODE_HALT:
3445
/* This can potentially make control flow non-uniform until the end
3446
* of the program.
3447
*/
3448
return progress;
3449
3450
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3451
if (depth == 0) {
3452
inst->opcode = BRW_OPCODE_MOV;
3453
inst->src[0] = brw_imm_ud(0u);
3454
inst->sources = 1;
3455
inst->force_writemask_all = true;
3456
progress = true;
3457
}
3458
break;
3459
3460
default:
3461
break;
3462
}
3463
}
3464
3465
if (progress)
3466
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3467
3468
return progress;
3469
}
3470
3471
/**
3472
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3473
* instructions to FS_OPCODE_REP_FB_WRITE.
3474
*/
3475
void
3476
fs_visitor::emit_repclear_shader()
3477
{
3478
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3479
int base_mrf = 0;
3480
int color_mrf = base_mrf + 2;
3481
fs_inst *mov;
3482
3483
if (uniforms > 0) {
3484
mov = bld.exec_all().group(4, 0)
3485
.MOV(brw_message_reg(color_mrf),
3486
fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
3487
} else {
3488
struct brw_reg reg =
3489
brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
3490
BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
3491
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
3492
3493
mov = bld.exec_all().group(4, 0)
3494
.MOV(brw_uvec_mrf(4, color_mrf, 0), fs_reg(reg));
3495
}
3496
3497
fs_inst *write = NULL;
3498
if (key->nr_color_regions == 1) {
3499
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3500
write->saturate = key->clamp_fragment_color;
3501
write->base_mrf = color_mrf;
3502
write->target = 0;
3503
write->header_size = 0;
3504
write->mlen = 1;
3505
} else {
3506
assume(key->nr_color_regions > 0);
3507
3508
struct brw_reg header =
3509
retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
3510
bld.exec_all().group(16, 0)
3511
.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3512
3513
for (int i = 0; i < key->nr_color_regions; ++i) {
3514
if (i > 0) {
3515
bld.exec_all().group(1, 0)
3516
.MOV(component(header, 2), brw_imm_ud(i));
3517
}
3518
3519
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3520
write->saturate = key->clamp_fragment_color;
3521
write->base_mrf = base_mrf;
3522
write->target = i;
3523
write->header_size = 2;
3524
write->mlen = 3;
3525
}
3526
}
3527
write->eot = true;
3528
write->last_rt = true;
3529
3530
calculate_cfg();
3531
3532
assign_constant_locations();
3533
assign_curb_setup();
3534
3535
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3536
if (uniforms > 0) {
3537
assert(mov->src[0].file == FIXED_GRF);
3538
mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
3539
}
3540
3541
lower_scoreboard();
3542
}
3543
3544
/**
3545
* Walks through basic blocks, looking for repeated MRF writes and
3546
* removing the later ones.
3547
*/
3548
bool
3549
fs_visitor::remove_duplicate_mrf_writes()
3550
{
3551
fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];
3552
bool progress = false;
3553
3554
/* Need to update the MRF tracking for compressed instructions. */
3555
if (dispatch_width >= 16)
3556
return false;
3557
3558
memset(last_mrf_move, 0, sizeof(last_mrf_move));
3559
3560
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3561
if (inst->is_control_flow()) {
3562
memset(last_mrf_move, 0, sizeof(last_mrf_move));
3563
}
3564
3565
if (inst->opcode == BRW_OPCODE_MOV &&
3566
inst->dst.file == MRF) {
3567
fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3568
if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
3569
inst->dst.equals(prev_inst->dst) &&
3570
inst->src[0].equals(prev_inst->src[0]) &&
3571
inst->saturate == prev_inst->saturate &&
3572
inst->predicate == prev_inst->predicate &&
3573
inst->conditional_mod == prev_inst->conditional_mod &&
3574
inst->exec_size == prev_inst->exec_size) {
3575
inst->remove(block);
3576
progress = true;
3577
continue;
3578
}
3579
}
3580
3581
/* Clear out the last-write records for MRFs that were overwritten. */
3582
if (inst->dst.file == MRF) {
3583
last_mrf_move[inst->dst.nr] = NULL;
3584
}
3585
3586
if (inst->mlen > 0 && inst->base_mrf != -1) {
3587
/* Found a SEND instruction, which will include two or fewer
3588
* implied MRF writes. We could do better here.
3589
*/
3590
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3591
last_mrf_move[inst->base_mrf + i] = NULL;
3592
}
3593
}
3594
3595
/* Clear out any MRF move records whose sources got overwritten. */
3596
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3597
if (last_mrf_move[i] &&
3598
regions_overlap(inst->dst, inst->size_written,
3599
last_mrf_move[i]->src[0],
3600
last_mrf_move[i]->size_read(0))) {
3601
last_mrf_move[i] = NULL;
3602
}
3603
}
3604
3605
if (inst->opcode == BRW_OPCODE_MOV &&
3606
inst->dst.file == MRF &&
3607
inst->src[0].file != ARF &&
3608
!inst->is_partial_write()) {
3609
last_mrf_move[inst->dst.nr] = inst;
3610
}
3611
}
3612
3613
if (progress)
3614
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3615
3616
return progress;
3617
}
3618
3619
/**
* In the IR a rounding mode can be specified for every conversion, but in
* the hardware it is a piece of global state, so once it has been set we
* don't need to set it again for subsequent conversions that use the same
* mode.
*
* This pass removes the redundant mode changes, which is particularly
* useful for vector/matrix conversions, where setting the mode once is
* enough for the whole vector/matrix.
*/
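/* For example, if the base mode derived from float_controls is RTZ, a
* block containing
*
*    SHADER_OPCODE_RND_MODE RTNE   <- kept, the mode actually changes
*    ... conversions ...
*    SHADER_OPCODE_RND_MODE RTNE   <- removed, RTNE is already set
*
* only keeps the first mode change.
*/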
3627
bool
3628
fs_visitor::remove_extra_rounding_modes()
3629
{
3630
bool progress = false;
3631
unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3632
3633
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
3634
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3635
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3636
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3637
execution_mode)
3638
base_mode = BRW_RND_MODE_RTNE;
3639
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3640
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3641
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3642
execution_mode)
3643
base_mode = BRW_RND_MODE_RTZ;
3644
3645
foreach_block (block, cfg) {
3646
brw_rnd_mode prev_mode = base_mode;
3647
3648
foreach_inst_in_block_safe (fs_inst, inst, block) {
3649
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
3650
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
3651
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
3652
if (mode == prev_mode) {
3653
inst->remove(block);
3654
progress = true;
3655
} else {
3656
prev_mode = mode;
3657
}
3658
}
3659
}
3660
}
3661
3662
if (progress)
3663
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3664
3665
return progress;
3666
}
3667
3668
static void
3669
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3670
{
3671
/* Clear the flag for registers that actually got read (as expected). */
3672
for (int i = 0; i < inst->sources; i++) {
3673
int grf;
3674
if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3675
grf = inst->src[i].nr;
3676
} else {
3677
continue;
3678
}
3679
3680
if (grf >= first_grf &&
3681
grf < first_grf + grf_len) {
3682
deps[grf - first_grf] = false;
3683
if (inst->exec_size == 16)
3684
deps[grf - first_grf + 1] = false;
3685
}
3686
}
3687
}
3688
3689
/**
3690
* Implements this workaround for the original 965:
3691
*
3692
* "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3693
* check for post destination dependencies on this instruction, software
3694
* must ensure that there is no destination hazard for the case of ‘write
3695
* followed by a posted write’ shown in the following example.
3696
*
3697
* 1. mov r3 0
3698
* 2. send r3.xy <rest of send instruction>
3699
* 3. mov r2 r3
3700
*
3701
* Due to no post-destination dependency check on the ‘send’, the above
3702
* code sequence could have two instructions (1 and 2) in flight at the
3703
* same time that both consider ‘r3’ as the target of their final writes.
3704
*/
3705
void
3706
fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
3707
fs_inst *inst)
3708
{
3709
int write_len = regs_written(inst);
3710
int first_write_grf = inst->dst.nr;
3711
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3712
assert(write_len < (int)sizeof(needs_dep) - 1);
3713
3714
memset(needs_dep, false, sizeof(needs_dep));
3715
memset(needs_dep, true, write_len);
3716
3717
clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3718
3719
/* Walk backwards looking for writes to registers we're writing which
3720
* aren't read since being written. If we hit the start of the program,
3721
* we assume that there are no outstanding dependencies on entry to the
3722
* program.
3723
*/
3724
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3725
/* If we hit control flow, assume that there *are* outstanding
3726
* dependencies, and force their cleanup before our instruction.
3727
*/
3728
if (block->start() == scan_inst && block->num != 0) {
3729
for (int i = 0; i < write_len; i++) {
3730
if (needs_dep[i])
3731
DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3732
first_write_grf + i);
3733
}
3734
return;
3735
}
3736
3737
/* We insert our reads as late as possible on the assumption that any
* instruction that might have left us an outstanding dependency (i.e.
* anything other than a MOV) has more latency than a MOV.
3740
*/
3741
if (scan_inst->dst.file == VGRF) {
3742
for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3743
int reg = scan_inst->dst.nr + i;
3744
3745
if (reg >= first_write_grf &&
3746
reg < first_write_grf + write_len &&
3747
needs_dep[reg - first_write_grf]) {
3748
DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3749
needs_dep[reg - first_write_grf] = false;
3750
if (scan_inst->exec_size == 16)
3751
needs_dep[reg - first_write_grf + 1] = false;
3752
}
3753
}
3754
}
3755
3756
/* Clear the flag for registers that actually got read (as expected). */
3757
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3758
3759
/* Continue the loop only if we haven't resolved all the dependencies */
3760
int i;
3761
for (i = 0; i < write_len; i++) {
3762
if (needs_dep[i])
3763
break;
3764
}
3765
if (i == write_len)
3766
return;
3767
}
3768
}
3769
3770
/**
3771
* Implements this workaround for the original 965:
3772
*
3773
* "[DevBW, DevCL] Errata: A destination register from a send can not be
3774
* used as a destination register until after it has been sourced by an
3775
* instruction with a different destination register.
3776
*/
3777
void
3778
fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3779
{
3780
int write_len = regs_written(inst);
3781
unsigned first_write_grf = inst->dst.nr;
3782
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3783
assert(write_len < (int)sizeof(needs_dep) - 1);
3784
3785
memset(needs_dep, false, sizeof(needs_dep));
3786
memset(needs_dep, true, write_len);
3787
/* Walk forwards looking for writes to registers we're writing which aren't
3788
* read before being written.
3789
*/
3790
foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
3791
/* If we hit control flow, force resolve all remaining dependencies. */
3792
if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3793
for (int i = 0; i < write_len; i++) {
3794
if (needs_dep[i])
3795
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3796
first_write_grf + i);
3797
}
3798
return;
3799
}
3800
3801
/* Clear the flag for registers that actually got read (as expected). */
3802
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3803
3804
/* We insert our reads as late as possible since they're reading the
3805
* result of a SEND, which has massive latency.
3806
*/
3807
if (scan_inst->dst.file == VGRF &&
3808
scan_inst->dst.nr >= first_write_grf &&
3809
scan_inst->dst.nr < first_write_grf + write_len &&
3810
needs_dep[scan_inst->dst.nr - first_write_grf]) {
3811
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3812
scan_inst->dst.nr);
3813
needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3814
}
3815
3816
/* Continue the loop only if we haven't resolved all the dependencies */
3817
int i;
3818
for (i = 0; i < write_len; i++) {
3819
if (needs_dep[i])
3820
break;
3821
}
3822
if (i == write_len)
3823
return;
3824
}
3825
}
3826
3827
void
3828
fs_visitor::insert_gfx4_send_dependency_workarounds()
3829
{
3830
if (devinfo->ver != 4 || devinfo->is_g4x)
3831
return;
3832
3833
bool progress = false;
3834
3835
foreach_block_and_inst(block, fs_inst, inst, cfg) {
3836
if (inst->mlen != 0 && inst->dst.file == VGRF) {
3837
insert_gfx4_pre_send_dependency_workarounds(block, inst);
3838
insert_gfx4_post_send_dependency_workarounds(block, inst);
3839
progress = true;
3840
}
3841
}
3842
3843
if (progress)
3844
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3845
}
3846
3847
/**
3848
* Turns the generic expression-style uniform pull constant load instruction
3849
* into a hardware-specific series of instructions for loading a pull
3850
* constant.
3851
*
3852
* The expression style allows the CSE pass before this to optimize out
3853
* repeated loads from the same offset, and gives the pre-register-allocation
3854
* scheduling full flexibility, while the conversion to native instructions
3855
* allows the post-register-allocation scheduler the best information
3856
* possible.
3857
*
3858
* Note that execution masking for setting up pull constant loads is special:
3859
* the channels that need to be written are unrelated to the current execution
3860
* mask, since a later instruction will use one of the result channels as a
3861
* source operand for all 8 or 16 of its channels.
3862
*/
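/* Accordingly, the payload setup emitted below uses exec_all() builders so
* that it runs regardless of the current execution mask.
*/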
3863
void
3864
fs_visitor::lower_uniform_pull_constant_loads()
3865
{
3866
foreach_block_and_inst (block, fs_inst, inst, cfg) {
3867
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3868
continue;
3869
3870
const fs_reg& surface = inst->src[0];
3871
const fs_reg& offset_B = inst->src[1];
3872
assert(offset_B.file == IMM);
3873
3874
if (devinfo->has_lsc) {
3875
const fs_builder ubld =
3876
fs_builder(this, block, inst).group(8, 0).exec_all();
3877
3878
const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3879
ubld.MOV(payload, offset_B);
3880
3881
inst->sfid = GFX12_SFID_UGM;
3882
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
3883
1 /* simd_size */,
3884
LSC_ADDR_SURFTYPE_BTI,
3885
LSC_ADDR_SIZE_A32,
3886
1 /* num_coordinates */,
3887
LSC_DATA_SIZE_D32,
3888
inst->size_written / 4,
3889
true /* transpose */,
3890
LSC_CACHE_LOAD_L1STATE_L3MOCS,
3891
true /* has_dest */);
3892
3893
fs_reg ex_desc;
3894
if (surface.file == IMM) {
3895
ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
3896
} else {
3897
/* We only need the first component for the payload so we can use
3898
* one of the other components for the extended descriptor
3899
*/
3900
ex_desc = component(payload, 1);
3901
ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
3902
}
3903
3904
/* Update the original instruction. */
3905
inst->opcode = SHADER_OPCODE_SEND;
3906
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
3907
inst->ex_mlen = 0;
3908
inst->header_size = 0;
3909
inst->send_has_side_effects = false;
3910
inst->send_is_volatile = true;
3911
inst->exec_size = 1;
3912
3913
/* Finally, the payload */
3914
inst->resize_sources(3);
3915
inst->src[0] = brw_imm_ud(0); /* desc */
3916
inst->src[1] = ex_desc;
3917
inst->src[2] = payload;
3918
3919
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3920
} else if (devinfo->ver >= 7) {
3921
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
3922
const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
3923
3924
ubld.group(8, 0).MOV(payload,
3925
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3926
ubld.group(1, 0).MOV(component(payload, 2),
3927
brw_imm_ud(offset_B.ud / 16));
3928
3929
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
3930
inst->src[1] = payload;
3931
inst->header_size = 1;
3932
inst->mlen = 1;
3933
3934
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3935
} else {
3936
/* Before register allocation, we didn't tell the scheduler about the
3937
* MRF we use. We know it's safe to use this MRF because nothing
3938
* else does except for register spill/unspill, which generates and
3939
* uses its MRF within a single IR instruction.
3940
*/
3941
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
3942
inst->mlen = 1;
3943
}
3944
}
3945
}
3946
3947
bool
3948
fs_visitor::lower_load_payload()
3949
{
3950
bool progress = false;
3951
3952
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3953
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3954
continue;
3955
3956
assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3957
assert(inst->saturate == false);
3958
fs_reg dst = inst->dst;
3959
3960
/* Get rid of COMPR4. We'll add it back in if we need it */
3961
if (dst.file == MRF)
3962
dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3963
3964
const fs_builder ibld(this, block, inst);
3965
const fs_builder ubld = ibld.exec_all();
3966
3967
for (uint8_t i = 0; i < inst->header_size;) {
3968
/* Number of header GRFs to initialize at once with a single MOV
3969
* instruction.
3970
*/
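/* n is 2 only when the following source register is exactly one GRF past
* the current one (and the stride is 1), so that both header registers
* can be copied with a single SIMD16 MOV.
*/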
3971
const unsigned n =
3972
(i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3973
inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3974
2 : 1;
3975
3976
if (inst->src[i].file != BAD_FILE)
3977
ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
3978
retype(inst->src[i], BRW_REGISTER_TYPE_UD));
3979
3980
dst = byte_offset(dst, n * REG_SIZE);
3981
i += n;
3982
}
3983
3984
if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3985
inst->exec_size > 8) {
3986
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
3987
* a straightforward copy. Instead, the result of the
3988
* LOAD_PAYLOAD is treated as interleaved and the first four
3989
* non-header sources are unpacked as:
3990
*
3991
* m + 0: r0
3992
* m + 1: g0
3993
* m + 2: b0
3994
* m + 3: a0
3995
* m + 4: r1
3996
* m + 5: g1
3997
* m + 6: b1
3998
* m + 7: a1
3999
*
4000
* This is used for gen <= 5 fb writes.
4001
*/
4002
assert(inst->exec_size == 16);
4003
assert(inst->header_size + 4 <= inst->sources);
4004
for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
4005
if (inst->src[i].file != BAD_FILE) {
4006
if (devinfo->has_compr4) {
4007
fs_reg compr4_dst = retype(dst, inst->src[i].type);
4008
compr4_dst.nr |= BRW_MRF_COMPR4;
4009
ibld.MOV(compr4_dst, inst->src[i]);
4010
} else {
4011
/* Platform doesn't have COMPR4. We have to fake it */
4012
fs_reg mov_dst = retype(dst, inst->src[i].type);
4013
ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
4014
mov_dst.nr += 4;
4015
ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
4016
}
4017
}
4018
4019
dst.nr++;
4020
}
4021
4022
/* The loop above only ever incremented us through the first set
4023
* of 4 registers. However, thanks to the magic of COMPR4, we
4024
* actually wrote to the first 8 registers, so we need to take
4025
* that into account now.
4026
*/
4027
dst.nr += 4;
4028
4029
/* The COMPR4 code took care of the first 4 sources. We'll let
4030
* the regular path handle any remaining sources. Yes, we are
4031
* modifying the instruction but we're about to delete it so
4032
* this really doesn't hurt anything.
4033
*/
4034
inst->header_size += 4;
4035
}
4036
4037
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
4038
if (inst->src[i].file != BAD_FILE) {
4039
dst.type = inst->src[i].type;
4040
ibld.MOV(dst, inst->src[i]);
4041
} else {
4042
dst.type = BRW_REGISTER_TYPE_UD;
4043
}
4044
dst = offset(dst, ibld, 1);
4045
}
4046
4047
inst->remove(block);
4048
progress = true;
4049
}
4050
4051
if (progress)
4052
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4053
4054
return progress;
4055
}
4056
4057
void
4058
fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
4059
{
4060
const fs_builder ibld(this, block, inst);
4061
4062
const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
4063
if (inst->src[1].file == IMM &&
4064
(( ud && inst->src[1].ud <= UINT16_MAX) ||
4065
(!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) {
4066
/* The MUL instruction doesn't treat its operands symmetrically: on
* Gen <= 6 only the low 16 bits of src0 are read, and on Gen >= 7 only
* the low 16 bits of src1 are used.
4069
*
4070
* If multiplying by an immediate value that fits in 16-bits, do a
4071
* single MUL instruction with that value in the proper location.
4072
*/
4073
if (devinfo->ver < 7) {
4074
fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
4075
ibld.MOV(imm, inst->src[1]);
4076
ibld.MUL(inst->dst, imm, inst->src[0]);
4077
} else {
4078
ibld.MUL(inst->dst, inst->src[0],
4079
ud ? brw_imm_uw(inst->src[1].ud)
4080
: brw_imm_w(inst->src[1].d));
4081
}
4082
} else {
4083
/* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
4084
* do 32-bit integer multiplication in one instruction, but instead
4085
* must do a sequence (which actually calculates a 64-bit result):
4086
*
4087
* mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
4088
* mach(8) null g3<8,8,1>D g4<8,8,1>D
4089
* mov(8) g2<1>D acc0<8,8,1>D
4090
*
4091
* But on Gen > 6, the ability to use the second accumulator register
4092
* (acc1) for non-float data types was removed, preventing a simple
4093
* implementation in SIMD16. A 16-channel result can be calculated by
4094
* executing the three instructions twice in SIMD8, once with quarter
4095
* control of 1Q for the first eight channels and again with 2Q for
4096
* the second eight channels.
4097
*
4098
* Which accumulator register is implicitly accessed (by AccWrEnable
4099
* for instance) is determined by the quarter control. Unfortunately
4100
* Ivybridge (and presumably Baytrail) has a hardware bug in which an
4101
* implicit accumulator access by an instruction with 2Q will access
4102
* acc1 regardless of whether the data type is usable in acc1.
4103
*
4104
* Specifically, the 2Q mach(8) writes acc1 which does not exist for
4105
* integer data types.
4106
*
4107
* Since we only want the low 32-bits of the result, we can do two
4108
* 32-bit x 16-bit multiplies (like the mul and mach are doing), and
4109
* adjust the high result and add them (like the mach is doing):
4110
*
4111
* mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
4112
* mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
4113
* shl(8) g9<1>D g8<8,8,1>D 16D
4114
* add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
4115
*
4116
* We avoid the shl instruction by realizing that we only want to add
4117
* the low 16-bits of the "high" result to the high 16-bits of the
4118
* "low" result and using proper regioning on the add:
4119
*
4120
* mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
4121
* mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
4122
* add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
4123
*
4124
* Since it does not use the (single) accumulator register, we can
4125
* schedule multi-component multiplications much better.
4126
*/
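/* Worked example of the scheme above, reduced mod 2^32: for
* src0 = 0x00012345 and src1 = 0x00010002,
*
*    low  = 0x12345 * 0x0002 = 0x0002468a
*    high = 0x12345 * 0x0001 = 0x00012345
*
* and adding the low 16 bits of "high" into the high 16 bits of "low"
* gives 0x2347468a, which is the low dword of the full product
* 0x12345 * 0x10002 = 0x12347468a.
*/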
4127
4128
bool needs_mov = false;
4129
fs_reg orig_dst = inst->dst;
4130
4131
/* Get a new VGRF for the "low" 32x16-bit multiplication result if
4132
* reusing the original destination is impossible due to hardware
4133
* restrictions, source/destination overlap, or it being the null
4134
* register.
4135
*/
4136
fs_reg low = inst->dst;
4137
if (orig_dst.is_null() || orig_dst.file == MRF ||
4138
regions_overlap(inst->dst, inst->size_written,
4139
inst->src[0], inst->size_read(0)) ||
4140
regions_overlap(inst->dst, inst->size_written,
4141
inst->src[1], inst->size_read(1)) ||
4142
inst->dst.stride >= 4) {
4143
needs_mov = true;
4144
low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
4145
inst->dst.type);
4146
}
4147
4148
/* Get a new VGRF but keep the same stride as inst->dst */
4149
fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
4150
high.stride = inst->dst.stride;
4151
high.offset = inst->dst.offset % REG_SIZE;
4152
4153
if (devinfo->ver >= 7) {
4154
/* From Wa_1604601757:
4155
*
4156
* "When multiplying a DW and any lower precision integer, source modifier
4157
* is not supported."
4158
*
4159
* An unsupported negate modifier on src[1] would ordinarily be
4160
* lowered by the subsequent lower_regioning pass. In this case that
4161
* pass would spawn another dword multiply. Instead, lower the
4162
* modifier first.
4163
*/
4164
const bool source_mods_unsupported = (devinfo->ver >= 12);
4165
4166
if (inst->src[1].abs || (inst->src[1].negate &&
4167
source_mods_unsupported))
4168
lower_src_modifiers(this, block, inst, 1);
4169
4170
if (inst->src[1].file == IMM) {
4171
ibld.MUL(low, inst->src[0],
4172
brw_imm_uw(inst->src[1].ud & 0xffff));
4173
ibld.MUL(high, inst->src[0],
4174
brw_imm_uw(inst->src[1].ud >> 16));
4175
} else {
4176
ibld.MUL(low, inst->src[0],
4177
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4178
ibld.MUL(high, inst->src[0],
4179
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
4180
}
4181
} else {
4182
if (inst->src[0].abs)
4183
lower_src_modifiers(this, block, inst, 0);
4184
4185
ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
4186
inst->src[1]);
4187
ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
4188
inst->src[1]);
4189
}
4190
4191
ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
4192
subscript(low, BRW_REGISTER_TYPE_UW, 1),
4193
subscript(high, BRW_REGISTER_TYPE_UW, 0));
4194
4195
if (needs_mov || inst->conditional_mod)
4196
set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
4197
}
4198
}
4199
4200
void
4201
fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
4202
{
4203
const fs_builder ibld(this, block, inst);
4204
4205
/* Considering two 64-bit integers ab and cd where each letter
* corresponds to 32 bits, we get a 128-bit result WXYZ. We only
* need to provide the YZ part of the result:
*
*           ab
*         x cd
*      -------
*           BD
*      +   AD
*      +   BC
*      +  AC
*      -------
*         WXYZ
*
* Only BD needs to be 64 bits. For AD and BC we only care about the
* lower 32 bits (since they are part of the upper 32 bits of our
* result). AC is not needed since it starts on the 65th bit of the
* result.
*/
unsigned int q_regs = regs_written(inst);
4216
unsigned int d_regs = (q_regs + 1) / 2;
4217
4218
fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
4219
fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4220
fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4221
4222
/* Here we need the full 64 bit result for 32b * 32b. */
4223
if (devinfo->has_integer_dword_mul) {
4224
ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4225
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4226
} else {
4227
fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4228
fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4229
fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
4230
4231
fs_inst *mul = ibld.MUL(acc,
4232
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4233
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4234
mul->writes_accumulator = true;
4235
4236
ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4237
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4238
ibld.MOV(bd_low, acc);
4239
4240
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
4241
ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
4242
}
4243
4244
ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
4245
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4246
ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4247
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
4248
4249
ibld.ADD(ad, ad, bc);
4250
ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
4251
subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
4252
4253
if (devinfo->has_64bit_int) {
4254
ibld.MOV(inst->dst, bd);
4255
} else {
4256
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
4257
subscript(bd, BRW_REGISTER_TYPE_UD, 0));
4258
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
4259
subscript(bd, BRW_REGISTER_TYPE_UD, 1));
4260
}
4261
}
4262
4263
void
4264
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
4265
{
4266
const fs_builder ibld(this, block, inst);
4267
4268
/* According to the BDW+ BSpec page for the "Multiply Accumulate
4269
* High" instruction:
4270
*
4271
* "An added preliminary mov is required for source modification on
4272
* src1:
4273
* mov (8) r3.0<1>:d -r3<8;8,1>:d
4274
* mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4275
* mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4276
*/
4277
if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
4278
lower_src_modifiers(this, block, inst, 1);
4279
4280
/* Should have been lowered to 8-wide. */
4281
assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
4282
const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type);
4283
fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4284
fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4285
4286
if (devinfo->ver >= 8) {
4287
/* Until Gfx8, integer multiplies read 32 bits from one source and
* 16 bits from the other, relying on the MACH instruction to
* generate the high bits of the result.
4290
*
4291
* On Gfx8, the multiply instruction does a full 32x32-bit
4292
* multiply, but in order to do a 64-bit multiply we can simulate
4293
* the previous behavior and then use a MACH instruction.
4294
*/
4295
assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
4296
mul->src[1].type == BRW_REGISTER_TYPE_UD);
4297
mul->src[1].type = BRW_REGISTER_TYPE_UW;
4298
mul->src[1].stride *= 2;
4299
4300
if (mul->src[1].file == IMM) {
4301
mul->src[1] = brw_imm_uw(mul->src[1].ud);
4302
}
4303
} else if (devinfo->verx10 == 70 &&
4304
inst->group > 0) {
4305
/* Among other things the quarter control bits influence which
4306
* accumulator register is used by the hardware for instructions
4307
* that access the accumulator implicitly (e.g. MACH). A
4308
* second-half instruction would normally map to acc1, which
4309
* doesn't exist on Gfx7 and up (the hardware does emulate it for
4310
* floating-point instructions *only* by taking advantage of the
4311
* extra precision of acc0 not normally used for floating point
4312
* arithmetic).
4313
*
4314
* HSW and up are careful enough not to try to access an
4315
* accumulator register that doesn't exist, but on earlier Gfx7
4316
* hardware we need to make sure that the quarter control bits are
4317
* zero to avoid non-deterministic behaviour and emit an extra MOV
4318
* to get the result masked correctly according to the current
4319
* channel enables.
4320
*/
4321
mach->group = 0;
4322
mach->force_writemask_all = true;
4323
mach->dst = ibld.vgrf(inst->dst.type);
4324
ibld.MOV(inst->dst, mach->dst);
4325
}
4326
}
4327
4328
bool
4329
fs_visitor::lower_integer_multiplication()
4330
{
4331
bool progress = false;
4332
4333
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4334
if (inst->opcode == BRW_OPCODE_MUL) {
4335
/* If the instruction is already in a form that does not need lowering,
4336
* return early.
4337
*/
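/* A multiply is natively supported when one of the sources is narrower
* than 32 bits; which source that has to be differs between generations
* (src0 on Gen <= 6, src1 on Gen >= 7), matching the checks below.
*/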
4338
if (devinfo->ver >= 7) {
4339
if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
4340
continue;
4341
} else {
4342
if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
4343
continue;
4344
}
4345
4346
if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
4347
inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
4348
(inst->src[0].type == BRW_REGISTER_TYPE_Q ||
4349
inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
4350
(inst->src[1].type == BRW_REGISTER_TYPE_Q ||
4351
inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
4352
lower_mul_qword_inst(inst, block);
4353
inst->remove(block);
4354
progress = true;
4355
} else if (!inst->dst.is_accumulator() &&
4356
(inst->dst.type == BRW_REGISTER_TYPE_D ||
4357
inst->dst.type == BRW_REGISTER_TYPE_UD) &&
4358
(!devinfo->has_integer_dword_mul ||
4359
devinfo->verx10 >= 125)) {
4360
lower_mul_dword_inst(inst, block);
4361
inst->remove(block);
4362
progress = true;
4363
}
4364
} else if (inst->opcode == SHADER_OPCODE_MULH) {
4365
lower_mulh_inst(inst, block);
4366
inst->remove(block);
4367
progress = true;
4368
}
4369
4370
}
4371
4372
if (progress)
4373
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4374
4375
return progress;
4376
}
4377
4378
bool
4379
fs_visitor::lower_minmax()
4380
{
4381
assert(devinfo->ver < 6);
4382
4383
bool progress = false;
4384
4385
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4386
const fs_builder ibld(this, block, inst);
4387
4388
if (inst->opcode == BRW_OPCODE_SEL &&
4389
inst->predicate == BRW_PREDICATE_NONE) {
4390
/* If src1 is an immediate value that is not NaN, then it can't be
4391
* NaN. In that case, emit CMP because it is much better for cmod
4392
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
4393
* support HF or DF, so it is not necessary to check for those.
4394
*/
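/* Either way the SEL with a conditional mod (the form MIN and MAX take in
* this backend) becomes a flag-writing CMP or CMPN followed by the same
* SEL predicated on that flag.
*/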
4395
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
4396
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4397
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4398
inst->conditional_mod);
4399
} else {
4400
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4401
inst->conditional_mod);
4402
}
4403
inst->predicate = BRW_PREDICATE_NORMAL;
4404
inst->conditional_mod = BRW_CONDITIONAL_NONE;
4405
4406
progress = true;
4407
}
4408
}
4409
4410
if (progress)
4411
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4412
4413
return progress;
4414
}
4415
4416
bool
4417
fs_visitor::lower_sub_sat()
4418
{
4419
bool progress = false;
4420
4421
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4422
const fs_builder ibld(this, block, inst);
4423
4424
if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
4425
inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4426
/* The fundamental problem is the hardware performs source negation
4427
* at the bit width of the source. If the source is 0x80000000D, the
4428
* negation is 0x80000000D. As a result, subtractSaturate(0,
4429
* 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
4430
* are at least three ways to resolve this:
4431
*
4432
* 1. Use the accumulator for the negated source. The accumulator is
4433
* 33 bits, so our source 0x80000000 is sign-extended to
* 0x180000000, whose negation is 0x080000000. This
4435
* doesn't help for 64-bit integers (which are already bigger than
4436
* 33 bits). There are also only 8 accumulators, so SIMD16 or
4437
* SIMD32 instructions would have to be split into multiple SIMD8
4438
* instructions.
4439
*
4440
* 2. Use slightly different math. For any n-bit value x, (x >> 1) can
* never be the most negative n-bit value, so negating it is always
* exact. We can use this fact to only do subtractions involving
* (x >> 1): subtractSaturate(a, b) ==
* subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4444
*
4445
* 3. For unsigned sources, it is sufficient to replace the
4446
* subtractSaturate with (a > b) ? a - b : 0.
4447
*
4448
* It may also be possible to use the SUBB instruction. This
4449
* implicitly writes the accumulator, so it could only be used in the
4450
* same situations as #1 above. It is further limited by only
4451
* allowing UD sources.
4452
*/
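/* Following approach #1 with subtractSaturate(0, 0x80000000) as signed
* integers: the accumulator holds src1 sign-extended to 33 bits,
* 0x180000000, whose 33-bit negation is +0x080000000, so the saturating
* ADD below yields the correct 0x7fffffff.
*/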
4453
if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
4454
inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
4455
fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);
4456
4457
ibld.MOV(acc, inst->src[1]);
4458
fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4459
add->saturate = true;
4460
add->src[0].negate = true;
4461
} else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4462
/* tmp = src1 >> 1;
4463
* dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4464
*/
4465
fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4466
fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4467
fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4468
fs_inst *add;
4469
4470
ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));
4471
4472
add = ibld.ADD(tmp2, inst->src[1], tmp1);
4473
add->src[1].negate = true;
4474
4475
add = ibld.ADD(tmp3, inst->src[0], tmp1);
4476
add->src[1].negate = true;
4477
add->saturate = true;
4478
4479
add = ibld.ADD(inst->dst, tmp3, tmp2);
4480
add->src[1].negate = true;
4481
add->saturate = true;
4482
} else {
4483
/* a > b ? a - b : 0 */
4484
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4485
BRW_CONDITIONAL_G);
4486
4487
fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4488
add->src[1].negate = !add->src[1].negate;
4489
4490
ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
4491
->predicate = BRW_PREDICATE_NORMAL;
4492
}
4493
4494
inst->remove(block);
4495
progress = true;
4496
}
4497
}
4498
4499
if (progress)
4500
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4501
4502
return progress;
4503
}
4504
4505
/**
4506
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
4507
* by discard. Due to the layout of the sample mask in the fragment shader
4508
* thread payload, \p bld is required to have a dispatch_width() not greater
4509
* than 16 for fragment shaders.
4510
*/
4511
static fs_reg
4512
sample_mask_reg(const fs_builder &bld)
4513
{
4514
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
4515
4516
if (v->stage != MESA_SHADER_FRAGMENT) {
4517
return brw_imm_ud(0xffffffff);
4518
} else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
4519
assert(bld.dispatch_width() <= 16);
4520
return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
4521
} else {
4522
assert(v->devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4523
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4524
BRW_REGISTER_TYPE_UW);
4525
}
4526
}
4527
4528
static void
4529
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
4530
fs_reg *dst, fs_reg color, unsigned components)
4531
{
4532
if (key->clamp_fragment_color) {
4533
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
4534
assert(color.type == BRW_REGISTER_TYPE_F);
4535
4536
for (unsigned i = 0; i < components; i++)
4537
set_saturate(true,
4538
bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
4539
4540
color = tmp;
4541
}
4542
4543
for (unsigned i = 0; i < components; i++)
4544
dst[i] = offset(color, bld, i);
4545
}
4546
4547
uint32_t
4548
brw_fb_write_msg_control(const fs_inst *inst,
4549
const struct brw_wm_prog_data *prog_data)
4550
{
4551
uint32_t mctl;
4552
4553
if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
4554
assert(inst->group == 0 && inst->exec_size == 16);
4555
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4556
} else if (prog_data->dual_src_blend) {
4557
assert(inst->exec_size == 8);
4558
4559
if (inst->group % 16 == 0)
4560
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4561
else if (inst->group % 16 == 8)
4562
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4563
else
4564
unreachable("Invalid dual-source FB write instruction group");
4565
} else {
4566
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4567
4568
if (inst->exec_size == 16)
4569
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4570
else if (inst->exec_size == 8)
4571
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4572
else
4573
unreachable("Invalid FB write execution size");
4574
}
4575
4576
return mctl;
4577
}
4578
4579
static void
4580
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
4581
const struct brw_wm_prog_data *prog_data,
4582
const brw_wm_prog_key *key,
4583
const fs_visitor::thread_payload &payload)
4584
{
4585
assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
4586
const intel_device_info *devinfo = bld.shader->devinfo;
4587
const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
4588
const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
4589
const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
4590
const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
4591
const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
4592
const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
4593
fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
4594
const unsigned components =
4595
inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
4596
4597
assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
4598
4599
/* We can potentially have a message length of up to 15, so we have to set
4600
* base_mrf to either 0 or 1 in order to fit in m0..m15.
4601
*/
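/* Payload assembled below, in order (optional parts may be skipped):
* header (0 or 2 registers), AA/dest-stencil, src0 alpha (one register
* per SIMD8 half), sample mask (oMask), color0 (4 components), color1
* (4 components), source depth, destination depth and source stencil.
*/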
4602
fs_reg sources[15];
4603
int header_size = 2, payload_header_size;
4604
unsigned length = 0;
4605
4606
if (devinfo->ver < 6) {
4607
/* TODO: Support SIMD32 on gfx4-5 */
4608
assert(bld.group() < 16);
4609
4610
/* For gfx4-5, we always have a header consisting of g0 and g1. We have
4611
* an implied MOV from g0,g1 to the start of the message. The MOV from
4612
* g0 is handled by the hardware and the MOV from g1 is provided by the
4613
* generator. This is required because, on gfx4-5, the generator may
4614
* generate two write messages with different message lengths in order
4615
* to handle AA data properly.
4616
*
4617
* Also, since the pixel mask goes in the g0 portion of the message and
4618
* since render target writes are the last thing in the shader, we write
4619
* the pixel mask directly into g0 and it will get copied as part of the
4620
* implied write.
4621
*/
4622
if (prog_data->uses_kill) {
4623
bld.exec_all().group(1, 0)
4624
.MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
4625
sample_mask_reg(bld));
4626
}
4627
4628
assert(length == 0);
4629
length = 2;
4630
} else if ((devinfo->verx10 <= 70 &&
4631
prog_data->uses_kill) ||
4632
(devinfo->ver < 11 &&
4633
(color1.file != BAD_FILE || key->nr_color_regions > 1))) {
4634
/* From the Sandy Bridge PRM, volume 4, page 198:
4635
*
4636
* "Dispatched Pixel Enables. One bit per pixel indicating
4637
* which pixels were originally enabled when the thread was
4638
* dispatched. This field is only required for the end-of-
4639
* thread message and on all dual-source messages."
4640
*/
4641
const fs_builder ubld = bld.exec_all().group(8, 0);
4642
4643
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4644
if (bld.group() < 16) {
4645
/* The header starts off as g0 and g1 for the first half */
4646
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4647
BRW_REGISTER_TYPE_UD));
4648
} else {
4649
/* The header starts off as g0 and g2 for the second half */
4650
assert(bld.group() < 32);
4651
const fs_reg header_sources[2] = {
4652
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4653
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
4654
};
4655
ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
4656
4657
/* Gfx12 will require additional fix-ups if we ever hit this path. */
4658
assert(devinfo->ver < 12);
4659
}
4660
4661
uint32_t g00_bits = 0;
4662
4663
/* Set "Source0 Alpha Present to RenderTarget" bit in message
4664
* header.
4665
*/
4666
if (src0_alpha.file != BAD_FILE)
4667
g00_bits |= 1 << 11;
4668
4669
/* Set computes stencil to render target */
4670
if (prog_data->computed_stencil)
4671
g00_bits |= 1 << 14;
4672
4673
if (g00_bits) {
4674
/* OR extra bits into g0.0 */
4675
ubld.group(1, 0).OR(component(header, 0),
4676
retype(brw_vec1_grf(0, 0),
4677
BRW_REGISTER_TYPE_UD),
4678
brw_imm_ud(g00_bits));
4679
}
4680
4681
/* Set the render target index for choosing BLEND_STATE. */
4682
if (inst->target > 0) {
4683
ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
4684
}
4685
4686
if (prog_data->uses_kill) {
4687
ubld.group(1, 0).MOV(retype(component(header, 15),
4688
BRW_REGISTER_TYPE_UW),
4689
sample_mask_reg(bld));
4690
}
4691
4692
assert(length == 0);
4693
sources[0] = header;
4694
sources[1] = horiz_offset(header, 8);
4695
length = 2;
4696
}
4697
assert(length == 0 || length == 2);
4698
header_size = length;
4699
4700
if (payload.aa_dest_stencil_reg[0]) {
4701
assert(inst->group < 16);
4702
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
4703
bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
4704
.MOV(sources[length],
4705
fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
4706
length++;
4707
}
4708
4709
if (src0_alpha.file != BAD_FILE) {
4710
for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
4711
const fs_builder &ubld = bld.exec_all().group(8, i)
4712
.annotate("FB write src0 alpha");
4713
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
4714
ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
4715
setup_color_payload(ubld, key, &sources[length], tmp, 1);
4716
length++;
4717
}
4718
}
4719
4720
if (sample_mask.file != BAD_FILE) {
4721
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
4722
BRW_REGISTER_TYPE_UD);
4723
4724
/* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
4725
* relevant. Since it's unsigned single words, one vgrf is always
4726
* 16-wide, but only the lower or higher 8 channels will be used by the
4727
* hardware when doing a SIMD8 write depending on whether we have
4728
* selected the subspans for the first or second half respectively.
4729
*/
4730
assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
4731
sample_mask.type = BRW_REGISTER_TYPE_UW;
4732
sample_mask.stride *= 2;
4733
4734
bld.exec_all().annotate("FB write oMask")
4735
.MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
4736
inst->group % 16),
4737
sample_mask);
4738
length++;
4739
}
4740
4741
payload_header_size = length;
4742
4743
setup_color_payload(bld, key, &sources[length], color0, components);
4744
length += 4;
4745
4746
if (color1.file != BAD_FILE) {
4747
setup_color_payload(bld, key, &sources[length], color1, components);
4748
length += 4;
4749
}
4750
4751
if (src_depth.file != BAD_FILE) {
4752
sources[length] = src_depth;
4753
length++;
4754
}
4755
4756
if (dst_depth.file != BAD_FILE) {
4757
sources[length] = dst_depth;
4758
length++;
4759
}
4760
4761
if (src_stencil.file != BAD_FILE) {
4762
assert(devinfo->ver >= 9);
4763
assert(bld.dispatch_width() == 8);
4764
4765
/* XXX: src_stencil is only available on gfx9+. dst_depth is never
4766
* available on gfx9+. As such it's impossible to have both enabled at the
4767
* same time and therefore length cannot overrun the array.
4768
*/
4769
assert(length < 15);
4770
4771
sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4772
bld.exec_all().annotate("FB write OS")
4773
.MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
4774
subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
4775
length++;
4776
}
4777
4778
fs_inst *load;
4779
if (devinfo->ver >= 7) {
4780
/* Send from the GRF */
4781
fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
4782
load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
4783
payload.nr = bld.shader->alloc.allocate(regs_written(load));
4784
load->dst = payload;
4785
4786
uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
4787
4788
inst->desc =
4789
(inst->group / 16) << 11 | /* rt slot group */
4790
brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
4791
prog_data->per_coarse_pixel_dispatch);
4792
4793
uint32_t ex_desc = 0;
4794
if (devinfo->ver >= 11) {
4795
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
4796
* in the extended message descriptor, in lieu of using a header.
4797
*/
4798
ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
4799
4800
if (key->nr_color_regions == 0)
4801
ex_desc |= 1 << 20; /* Null Render Target */
4802
}
4803
inst->ex_desc = ex_desc;
4804
4805
inst->opcode = SHADER_OPCODE_SEND;
4806
inst->resize_sources(3);
4807
inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
4808
inst->src[0] = brw_imm_ud(0);
4809
inst->src[1] = brw_imm_ud(0);
4810
inst->src[2] = payload;
4811
inst->mlen = regs_written(load);
4812
inst->ex_mlen = 0;
4813
inst->header_size = header_size;
4814
inst->check_tdr = true;
4815
inst->send_has_side_effects = true;
4816
} else {
4817
/* Send from the MRF */
4818
load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
4819
sources, length, payload_header_size);
4820
4821
/* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
4822
* will do this for us if we just give it a COMPR4 destination.
4823
*/
4824
if (devinfo->ver < 6 && bld.dispatch_width() == 16)
4825
load->dst.nr |= BRW_MRF_COMPR4;
4826
4827
if (devinfo->ver < 6) {
4828
/* Set up src[0] for the implied MOV from grf0-1 */
4829
inst->resize_sources(1);
4830
inst->src[0] = brw_vec8_grf(0, 0);
4831
} else {
4832
inst->resize_sources(0);
4833
}
4834
inst->base_mrf = 1;
4835
inst->opcode = FS_OPCODE_FB_WRITE;
4836
inst->mlen = regs_written(load);
4837
inst->header_size = header_size;
4838
}
4839
}
4840
4841
static void
4842
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
4843
{
4844
const intel_device_info *devinfo = bld.shader->devinfo;
4845
const fs_builder &ubld = bld.exec_all().group(8, 0);
4846
const unsigned length = 2;
4847
const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
4848
4849
if (bld.group() < 16) {
4850
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4851
BRW_REGISTER_TYPE_UD));
4852
} else {
4853
assert(bld.group() < 32);
4854
const fs_reg header_sources[] = {
4855
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4856
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
4857
};
4858
ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
4859
4860
if (devinfo->ver >= 12) {
4861
/* On Gfx12 the Viewport and Render Target Array Index fields (AKA
4862
* Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
4863
* target message header format was updated accordingly -- However
4864
* the updated format only works for the lower 16 channels in a
4865
* SIMD32 thread, since the higher 16 channels want the subspan data
4866
* from r2 instead of r1, so we need to copy over the contents of
4867
* r1.1 in order to fix things up.
4868
*/
4869
ubld.group(1, 0).MOV(component(header, 9),
4870
retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
4871
}
4872
}
4873
4874
inst->resize_sources(1);
4875
inst->src[0] = header;
4876
inst->opcode = FS_OPCODE_FB_READ;
4877
inst->mlen = length;
4878
inst->header_size = length;
4879
}
4880
4881
static void
4882
lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
4883
const fs_reg &coordinate,
4884
const fs_reg &shadow_c,
4885
const fs_reg &lod, const fs_reg &lod2,
4886
const fs_reg &surface,
4887
const fs_reg &sampler,
4888
unsigned coord_components,
4889
unsigned grad_components)
4890
{
4891
const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
4892
op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
4893
fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
4894
fs_reg msg_end = msg_begin;
4895
4896
/* g0 header. */
4897
msg_end = offset(msg_end, bld.group(8, 0), 1);
4898
4899
for (unsigned i = 0; i < coord_components; i++)
4900
bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
4901
offset(coordinate, bld, i));
4902
4903
msg_end = offset(msg_end, bld, coord_components);
4904
4905
/* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
4906
* require all three components to be present and zero if they are unused.
4907
*/
4908
if (coord_components > 0 &&
4909
(has_lod || shadow_c.file != BAD_FILE ||
4910
(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
4911
assert(coord_components <= 3);
4912
for (unsigned i = 0; i < 3 - coord_components; i++)
4913
bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
4914
4915
msg_end = offset(msg_end, bld, 3 - coord_components);
4916
}
4917
4918
if (op == SHADER_OPCODE_TXD) {
4919
/* TXD unsupported in SIMD16 mode. */
4920
assert(bld.dispatch_width() == 8);
4921
4922
/* the slots for u and v are always present, but r is optional */
4923
if (coord_components < 2)
4924
msg_end = offset(msg_end, bld, 2 - coord_components);
4925
4926
/* P = u, v, r
4927
* dPdx = dudx, dvdx, drdx
4928
* dPdy = dudy, dvdy, drdy
4929
*
4930
* 1-arg: Does not exist.
4931
*
4932
* 2-arg: dudx dvdx dudy dvdy
4933
* dPdx.x dPdx.y dPdy.x dPdy.y
4934
* m4 m5 m6 m7
4935
*
4936
* 3-arg: dudx dvdx drdx dudy dvdy drdy
4937
* dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
4938
* m5 m6 m7 m8 m9 m10
4939
*/
4940
for (unsigned i = 0; i < grad_components; i++)
4941
bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
4942
4943
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4944
4945
for (unsigned i = 0; i < grad_components; i++)
4946
bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
4947
4948
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4949
}
4950
4951
if (has_lod) {
4952
/* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
4953
* shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
4954
*/
4955
assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
4956
bld.dispatch_width() == 16);
4957
4958
const brw_reg_type type =
4959
(op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
4960
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
4961
bld.MOV(retype(msg_end, type), lod);
4962
msg_end = offset(msg_end, bld, 1);
4963
}
4964
4965
if (shadow_c.file != BAD_FILE) {
4966
if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
4967
/* There's no plain shadow compare message, so we use shadow
4968
* compare with a bias of 0.0.
4969
*/
4970
bld.MOV(msg_end, brw_imm_f(0.0f));
4971
msg_end = offset(msg_end, bld, 1);
4972
}
4973
4974
bld.MOV(msg_end, shadow_c);
4975
msg_end = offset(msg_end, bld, 1);
4976
}
4977
4978
inst->opcode = op;
4979
inst->src[0] = reg_undef;
4980
inst->src[1] = surface;
4981
inst->src[2] = sampler;
4982
inst->resize_sources(3);
4983
inst->base_mrf = msg_begin.nr;
4984
inst->mlen = msg_end.nr - msg_begin.nr;
4985
inst->header_size = 1;
4986
}
4987
4988
static void
4989
lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
4990
const fs_reg &coordinate,
4991
const fs_reg &shadow_c,
4992
const fs_reg &lod, const fs_reg &lod2,
4993
const fs_reg &sample_index,
4994
const fs_reg &surface,
4995
const fs_reg &sampler,
4996
unsigned coord_components,
4997
unsigned grad_components)
4998
{
4999
fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
5000
fs_reg msg_coords = message;
5001
unsigned header_size = 0;
5002
5003
if (inst->offset != 0) {
5004
/* The offsets set up by the visitor are in the m1 header, so we can't
5005
* go headerless.
5006
*/
5007
header_size = 1;
5008
message.nr--;
5009
}
5010
5011
for (unsigned i = 0; i < coord_components; i++)
5012
bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
5013
offset(coordinate, bld, i));
5014
5015
fs_reg msg_end = offset(msg_coords, bld, coord_components);
5016
fs_reg msg_lod = offset(msg_coords, bld, 4);
5017
5018
if (shadow_c.file != BAD_FILE) {
5019
fs_reg msg_shadow = msg_lod;
5020
bld.MOV(msg_shadow, shadow_c);
5021
msg_lod = offset(msg_shadow, bld, 1);
5022
msg_end = msg_lod;
5023
}
5024
5025
switch (op) {
5026
case SHADER_OPCODE_TXL:
5027
case FS_OPCODE_TXB:
5028
bld.MOV(msg_lod, lod);
5029
msg_end = offset(msg_lod, bld, 1);
5030
break;
5031
case SHADER_OPCODE_TXD:
5032
/**
5033
* P = u, v, r
5034
* dPdx = dudx, dvdx, drdx
5035
* dPdy = dudy, dvdy, drdy
5036
*
5037
* Load up these values:
5038
* - dudx dudy dvdx dvdy drdx drdy
5039
* - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
5040
*/
5041
msg_end = msg_lod;
5042
for (unsigned i = 0; i < grad_components; i++) {
5043
bld.MOV(msg_end, offset(lod, bld, i));
5044
msg_end = offset(msg_end, bld, 1);
5045
5046
bld.MOV(msg_end, offset(lod2, bld, i));
5047
msg_end = offset(msg_end, bld, 1);
5048
}
5049
break;
5050
case SHADER_OPCODE_TXS:
5051
msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
5052
bld.MOV(msg_lod, lod);
5053
msg_end = offset(msg_lod, bld, 1);
5054
break;
5055
case SHADER_OPCODE_TXF:
5056
msg_lod = offset(msg_coords, bld, 3);
5057
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
5058
msg_end = offset(msg_lod, bld, 1);
5059
break;
5060
case SHADER_OPCODE_TXF_CMS:
5061
msg_lod = offset(msg_coords, bld, 3);
5062
/* lod */
5063
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
5064
/* sample index */
5065
bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
5066
msg_end = offset(msg_lod, bld, 2);
5067
break;
5068
default:
5069
break;
5070
}
5071
5072
inst->opcode = op;
5073
inst->src[0] = reg_undef;
5074
inst->src[1] = surface;
5075
inst->src[2] = sampler;
5076
inst->resize_sources(3);
5077
inst->base_mrf = message.nr;
5078
inst->mlen = msg_end.nr - message.nr;
5079
inst->header_size = header_size;
5080
5081
/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5082
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5083
}
5084
5085
static bool
5086
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
5087
{
5088
if (devinfo->verx10 <= 70)
5089
return false;
5090
5091
return sampler.file != IMM || sampler.ud >= 16;
5092
}
5093
5094
static unsigned
5095
sampler_msg_type(const intel_device_info *devinfo,
5096
opcode opcode, bool shadow_compare)
5097
{
5098
assert(devinfo->ver >= 5);
5099
switch (opcode) {
5100
case SHADER_OPCODE_TEX:
5101
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
5102
GFX5_SAMPLER_MESSAGE_SAMPLE;
5103
case FS_OPCODE_TXB:
5104
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
5105
GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
5106
case SHADER_OPCODE_TXL:
5107
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
5108
GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
5109
case SHADER_OPCODE_TXL_LZ:
5110
return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
5111
GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
5112
case SHADER_OPCODE_TXS:
5113
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5114
return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
5115
case SHADER_OPCODE_TXD:
5116
assert(!shadow_compare || devinfo->verx10 >= 75);
5117
return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
5118
GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
5119
case SHADER_OPCODE_TXF:
5120
return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5121
case SHADER_OPCODE_TXF_LZ:
5122
assert(devinfo->ver >= 9);
5123
return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
5124
case SHADER_OPCODE_TXF_CMS_W:
5125
assert(devinfo->ver >= 9);
5126
return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
5127
case SHADER_OPCODE_TXF_CMS:
5128
return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
5129
GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5130
case SHADER_OPCODE_TXF_UMS:
5131
assert(devinfo->ver >= 7);
5132
return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
5133
case SHADER_OPCODE_TXF_MCS:
5134
assert(devinfo->ver >= 7);
5135
return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
5136
case SHADER_OPCODE_LOD:
5137
return GFX5_SAMPLER_MESSAGE_LOD;
5138
case SHADER_OPCODE_TG4:
5139
assert(devinfo->ver >= 7);
5140
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
5141
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
5142
break;
5143
case SHADER_OPCODE_TG4_OFFSET:
5144
assert(devinfo->ver >= 7);
5145
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
5146
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
5147
case SHADER_OPCODE_SAMPLEINFO:
5148
return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
5149
default:
5150
unreachable("not reached");
5151
}
5152
}
5153
5154
static void
5155
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
5156
const fs_reg &coordinate,
5157
const fs_reg &shadow_c,
5158
fs_reg lod, const fs_reg &lod2,
5159
const fs_reg &min_lod,
5160
const fs_reg &sample_index,
5161
const fs_reg &mcs,
5162
const fs_reg &surface,
5163
const fs_reg &sampler,
5164
const fs_reg &surface_handle,
5165
const fs_reg &sampler_handle,
5166
const fs_reg &tg4_offset,
5167
unsigned coord_components,
5168
unsigned grad_components)
5169
{
5170
const intel_device_info *devinfo = bld.shader->devinfo;
5171
const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
5172
unsigned reg_width = bld.dispatch_width() / 8;
5173
unsigned header_size = 0, length = 0;
5174
fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
5175
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
5176
sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
5177
5178
/* We must have exactly one of surface/sampler and surface/sampler_handle */
5179
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5180
assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
5181
5182
if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
5183
inst->offset != 0 || inst->eot ||
5184
op == SHADER_OPCODE_SAMPLEINFO ||
5185
sampler_handle.file != BAD_FILE ||
5186
is_high_sampler(devinfo, sampler)) {
5187
/* For general texture offsets (no txf workaround), we need a header to
5188
* put them in.
5189
*
5190
* TG4 needs to place its channel select in the header, for interaction
5191
* with ARB_texture_swizzle. The sampler index is only 4-bits, so for
5192
* larger sampler numbers we need to offset the Sampler State Pointer in
5193
* the header.
5194
*/
5195
fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
5196
header_size = 1;
5197
length++;
5198
5199
/* If we're requesting fewer than four channels worth of response,
5200
* and we have an explicit header, we need to set up the sampler
5201
* writemask. It's reversed from normal: 1 means "don't write".
5202
*/
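/* For example, a SIMD8 message whose destination covers only two registers
 * (regs_written == 2, reg_width == 1) yields mask == 0xc below, which tells
 * the sampler not to return the z and w channels.
 */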
5203
if (!inst->eot && regs_written(inst) != 4 * reg_width) {
5204
assert(regs_written(inst) % reg_width == 0);
5205
unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
5206
inst->offset |= mask << 12;
5207
}
5208
5209
/* Build the actual header */
5210
const fs_builder ubld = bld.exec_all().group(8, 0);
5211
const fs_builder ubld1 = ubld.group(1, 0);
5212
ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
5213
if (inst->offset) {
5214
ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
5215
} else if (bld.shader->stage != MESA_SHADER_VERTEX &&
5216
bld.shader->stage != MESA_SHADER_FRAGMENT) {
5217
/* The vertex and fragment stages have g0.2 set to 0, so
5218
* header0.2 is 0 when g0 is copied. Other stages may not, so we
5219
* must set it to 0 to avoid setting undesirable bits in the
5220
* message.
5221
*/
5222
ubld1.MOV(component(header, 2), brw_imm_ud(0));
5223
}
5224
5225
if (sampler_handle.file != BAD_FILE) {
5226
/* Bindless sampler handles aren't relative to the sampler state
5227
* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
5228
* Instead, it's an absolute pointer relative to dynamic state base
5229
* address.
5230
*
5231
* Sampler states are 16 bytes each and the pointer we give here has
5232
* to be 32-byte aligned. In order to avoid more indirect messages
5233
* than required, we assume that all bindless sampler states are
5234
* 32-byte aligned. This sacrifices a bit of general state base
5235
* address space but means we can do something more efficient in the
5236
* shader.
5237
*/
5238
ubld1.MOV(component(header, 3), sampler_handle);
5239
} else if (is_high_sampler(devinfo, sampler)) {
5240
fs_reg sampler_state_ptr =
5241
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
5242
5243
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
5244
* with the ones included in g0.3 bits 4:0. Mask them out.
5245
*/
5246
if (devinfo->ver >= 11) {
5247
sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5248
ubld1.AND(sampler_state_ptr,
5249
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5250
brw_imm_ud(INTEL_MASK(31, 5)));
5251
}
5252
5253
if (sampler.file == BRW_IMMEDIATE_VALUE) {
5254
assert(sampler.ud >= 16);
5255
const int sampler_state_size = 16; /* 16 bytes */
5256
5257
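/* Offset the sampler state pointer by whole groups of 16 samplers; e.g. an
 * immediate sampler index of 17 falls in group 1 and adds 16 * 16 = 256
 * bytes to the pointer taken from g0.3.
 */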
ubld1.ADD(component(header, 3), sampler_state_ptr,
5258
brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
5259
} else {
5260
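/* Same group offset for a dynamic sampler index: keep the group bits
 * (sampler & 0xf0) and multiply by the 16-byte sampler state size with the
 * shift below.
 */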
fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5261
ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
5262
ubld1.SHL(tmp, tmp, brw_imm_ud(4));
5263
ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
5264
}
5265
} else if (devinfo->ver >= 11) {
5266
/* Gfx11+ sampler message headers include bits in 4:0 which conflict
5267
* with the ones included in g0.3 bits 4:0. Mask them out.
5268
*/
5269
ubld1.AND(component(header, 3),
5270
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5271
brw_imm_ud(INTEL_MASK(31, 5)));
5272
}
5273
}
5274
5275
if (shadow_c.file != BAD_FILE) {
5276
bld.MOV(sources[length], shadow_c);
5277
length++;
5278
}
5279
5280
bool coordinate_done = false;
5281
5282
/* Set up the LOD info */
5283
switch (op) {
5284
case FS_OPCODE_TXB:
5285
case SHADER_OPCODE_TXL:
5286
if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
5287
op = SHADER_OPCODE_TXL_LZ;
5288
break;
5289
}
5290
bld.MOV(sources[length], lod);
5291
length++;
5292
break;
5293
case SHADER_OPCODE_TXD:
5294
/* TXD should have been lowered in SIMD16 mode. */
5295
assert(bld.dispatch_width() == 8);
5296
5297
/* Load dPdx and the coordinate together:
5298
* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
5299
*/
5300
for (unsigned i = 0; i < coord_components; i++) {
5301
bld.MOV(sources[length++], offset(coordinate, bld, i));
5302
5303
/* For cube map array, the coordinate is (u,v,r,ai) but there are
5304
* only derivatives for (u, v, r).
5305
*/
5306
if (i < grad_components) {
5307
bld.MOV(sources[length++], offset(lod, bld, i));
5308
bld.MOV(sources[length++], offset(lod2, bld, i));
5309
}
5310
}
5311
5312
coordinate_done = true;
5313
break;
5314
case SHADER_OPCODE_TXS:
5315
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
5316
length++;
5317
break;
5318
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5319
/* We need an LOD; just use 0 */
5320
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
5321
length++;
5322
break;
5323
case SHADER_OPCODE_TXF:
5324
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
5325
* On Gfx9 they are u, v, lod, r
5326
*/
5327
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
5328
5329
if (devinfo->ver >= 9) {
5330
if (coord_components >= 2) {
5331
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
5332
offset(coordinate, bld, 1));
5333
} else {
5334
sources[length] = brw_imm_d(0);
5335
}
5336
length++;
5337
}
5338
5339
if (devinfo->ver >= 9 && lod.is_zero()) {
5340
op = SHADER_OPCODE_TXF_LZ;
5341
} else {
5342
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
5343
length++;
5344
}
5345
5346
for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
5347
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5348
offset(coordinate, bld, i));
5349
5350
coordinate_done = true;
5351
break;
5352
5353
case SHADER_OPCODE_TXF_CMS:
5354
case SHADER_OPCODE_TXF_CMS_W:
5355
case SHADER_OPCODE_TXF_UMS:
5356
case SHADER_OPCODE_TXF_MCS:
5357
if (op == SHADER_OPCODE_TXF_UMS ||
5358
op == SHADER_OPCODE_TXF_CMS ||
5359
op == SHADER_OPCODE_TXF_CMS_W) {
5360
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
5361
length++;
5362
}
5363
5364
if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
5365
/* Data from the multisample control surface. */
5366
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
5367
length++;
5368
5369
/* On Gfx9+ we'll use ld2dms_w instead which has two registers for
5370
* the MCS data.
5371
*/
5372
if (op == SHADER_OPCODE_TXF_CMS_W) {
5373
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
5374
mcs.file == IMM ?
5375
mcs :
5376
offset(mcs, bld, 1));
5377
length++;
5378
}
5379
}
5380
5381
/* There is no offsetting for this message; just copy in the integer
5382
* texture coordinates.
5383
*/
5384
for (unsigned i = 0; i < coord_components; i++)
5385
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5386
offset(coordinate, bld, i));
5387
5388
coordinate_done = true;
5389
break;
5390
case SHADER_OPCODE_TG4_OFFSET:
5391
/* More crazy intermixing */
5392
for (unsigned i = 0; i < 2; i++) /* u, v */
5393
bld.MOV(sources[length++], offset(coordinate, bld, i));
5394
5395
for (unsigned i = 0; i < 2; i++) /* offu, offv */
5396
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5397
offset(tg4_offset, bld, i));
5398
5399
if (coord_components == 3) /* r if present */
5400
bld.MOV(sources[length++], offset(coordinate, bld, 2));
5401
5402
coordinate_done = true;
5403
break;
5404
default:
5405
break;
5406
}
5407
5408
/* Set up the coordinate (except for cases where it was done above) */
5409
if (!coordinate_done) {
5410
for (unsigned i = 0; i < coord_components; i++)
5411
bld.MOV(sources[length++], offset(coordinate, bld, i));
5412
}
5413
5414
if (min_lod.file != BAD_FILE) {
5415
/* Account for all of the missing coordinate sources */
5416
length += 4 - coord_components;
5417
if (op == SHADER_OPCODE_TXD)
5418
length += (3 - grad_components) * 2;
5419
5420
bld.MOV(sources[length++], min_lod);
5421
}
5422
5423
unsigned mlen;
5424
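/* Each payload slot is reg_width registers wide except the header, which
 * always occupies a single register, so subtract the extra register it
 * would otherwise be counted as in SIMD16.
 */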
if (reg_width == 2)
5425
mlen = length * reg_width - header_size;
5426
else
5427
mlen = length * reg_width;
5428
5429
const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
5430
BRW_REGISTER_TYPE_F);
5431
bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
5432
5433
/* Generate the SEND. */
5434
inst->opcode = SHADER_OPCODE_SEND;
5435
inst->mlen = mlen;
5436
inst->header_size = header_size;
5437
5438
const unsigned msg_type =
5439
sampler_msg_type(devinfo, op, inst->shadow_compare);
5440
const unsigned simd_mode =
5441
inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
5442
BRW_SAMPLER_SIMD_MODE_SIMD16;
5443
5444
uint32_t base_binding_table_index;
5445
switch (op) {
5446
case SHADER_OPCODE_TG4:
5447
case SHADER_OPCODE_TG4_OFFSET:
5448
base_binding_table_index = prog_data->binding_table.gather_texture_start;
5449
break;
5450
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5451
base_binding_table_index = prog_data->binding_table.image_start;
5452
break;
5453
default:
5454
base_binding_table_index = prog_data->binding_table.texture_start;
5455
break;
5456
}
5457
5458
inst->sfid = BRW_SFID_SAMPLER;
5459
if (surface.file == IMM &&
5460
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
5461
inst->desc = brw_sampler_desc(devinfo,
5462
surface.ud + base_binding_table_index,
5463
sampler.file == IMM ? sampler.ud % 16 : 0,
5464
msg_type,
5465
simd_mode,
5466
0 /* return_format unused on gfx7+ */);
5467
inst->src[0] = brw_imm_ud(0);
5468
inst->src[1] = brw_imm_ud(0);
5469
} else if (surface_handle.file != BAD_FILE) {
5470
/* Bindless surface */
5471
assert(devinfo->ver >= 9);
5472
inst->desc = brw_sampler_desc(devinfo,
5473
GFX9_BTI_BINDLESS,
5474
sampler.file == IMM ? sampler.ud % 16 : 0,
5475
msg_type,
5476
simd_mode,
5477
0 /* return_format unused on gfx7+ */);
5478
5479
/* For bindless samplers, the entire address is included in the message
5480
* header so we can leave the portion in the message descriptor 0.
5481
*/
5482
if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
5483
inst->src[0] = brw_imm_ud(0);
5484
} else {
5485
const fs_builder ubld = bld.group(1, 0).exec_all();
5486
fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5487
ubld.SHL(desc, sampler, brw_imm_ud(8));
5488
inst->src[0] = desc;
5489
}
5490
5491
/* We assume that the driver provided the handle in the top 20 bits so
5492
* we can use the surface handle directly as the extended descriptor.
5493
*/
5494
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5495
} else {
5496
/* Immediate portion of the descriptor */
5497
inst->desc = brw_sampler_desc(devinfo,
5498
0, /* surface */
5499
0, /* sampler */
5500
msg_type,
5501
simd_mode,
5502
0 /* return_format unused on gfx7+ */);
5503
const fs_builder ubld = bld.group(1, 0).exec_all();
5504
fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5505
if (surface.equals(sampler)) {
5506
/* This case is common in GL */
5507
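/* With surface == sampler == n, n * 0x101 == n | (n << 8), packing both
 * indices into the descriptor with a single instruction.
 */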
ubld.MUL(desc, surface, brw_imm_ud(0x101));
5508
} else {
5509
if (sampler_handle.file != BAD_FILE) {
5510
ubld.MOV(desc, surface);
5511
} else if (sampler.file == IMM) {
5512
ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
5513
} else {
5514
ubld.SHL(desc, sampler, brw_imm_ud(8));
5515
ubld.OR(desc, desc, surface);
5516
}
5517
}
5518
if (base_binding_table_index)
5519
ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
5520
ubld.AND(desc, desc, brw_imm_ud(0xfff));
5521
5522
inst->src[0] = component(desc, 0);
5523
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5524
}
5525
5526
inst->ex_desc = 0;
5527
5528
inst->src[2] = src_payload;
5529
inst->resize_sources(3);
5530
5531
if (inst->eot) {
5532
/* EOT sampler messages don't make sense to split because it would
5533
* involve ending half of the thread early.
5534
*/
5535
assert(inst->group == 0);
5536
/* We need to use SENDC for EOT sampler messages */
5537
inst->check_tdr = true;
5538
inst->send_has_side_effects = true;
5539
}
5540
5541
/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5542
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5543
}
5544
5545
static void
5546
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
5547
{
5548
const intel_device_info *devinfo = bld.shader->devinfo;
5549
const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
5550
const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
5551
const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
5552
const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
5553
const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
5554
const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
5555
const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
5556
const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
5557
const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
5558
const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
5559
const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
5560
const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
5561
assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
5562
const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
5563
assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
5564
const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
5565
5566
if (devinfo->ver >= 7) {
5567
lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
5568
shadow_c, lod, lod2, min_lod,
5569
sample_index,
5570
mcs, surface, sampler,
5571
surface_handle, sampler_handle,
5572
tg4_offset,
5573
coord_components, grad_components);
5574
} else if (devinfo->ver >= 5) {
5575
lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
5576
shadow_c, lod, lod2, sample_index,
5577
surface, sampler,
5578
coord_components, grad_components);
5579
} else {
5580
lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
5581
shadow_c, lod, lod2,
5582
surface, sampler,
5583
coord_components, grad_components);
5584
}
5585
}
5586
5587
/**
5588
* Predicate the specified instruction on the sample mask.
5589
*/
5590
static void
5591
emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
5592
{
5593
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
5594
bld.group() == inst->group &&
5595
bld.dispatch_width() == inst->exec_size);
5596
5597
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
5598
const fs_reg sample_mask = sample_mask_reg(bld);
5599
const unsigned subreg = sample_mask_flag_subreg(v);
5600
5601
if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
5602
assert(sample_mask.file == ARF &&
5603
sample_mask.nr == brw_flag_subreg(subreg).nr &&
5604
sample_mask.subnr == brw_flag_subreg(
5605
subreg + inst->group / 16).subnr);
5606
} else {
5607
bld.group(1, 0).exec_all()
5608
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
5609
}
5610
5611
if (inst->predicate) {
5612
assert(inst->predicate == BRW_PREDICATE_NORMAL);
5613
assert(!inst->predicate_inverse);
5614
assert(inst->flag_subreg == 0);
5615
/* Combine the sample mask with the existing predicate by using a
5616
* vertical predication mode.
5617
*/
5618
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
5619
} else {
5620
inst->flag_subreg = subreg;
5621
inst->predicate = BRW_PREDICATE_NORMAL;
5622
inst->predicate_inverse = false;
5623
}
5624
}
5625
5626
static void
5627
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
5628
const fs_reg &surface, const fs_reg &surface_handle)
5629
{
5630
const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
5631
5632
/* We must have exactly one of surface and surface_handle */
5633
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5634
5635
if (surface.file == IMM) {
5636
inst->desc = desc | (surface.ud & 0xff);
5637
inst->src[0] = brw_imm_ud(0);
5638
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5639
} else if (surface_handle.file != BAD_FILE) {
5640
/* Bindless surface */
5641
assert(devinfo->ver >= 9);
5642
inst->desc = desc | GFX9_BTI_BINDLESS;
5643
inst->src[0] = brw_imm_ud(0);
5644
5645
/* We assume that the driver provided the handle in the top 20 bits so
5646
* we can use the surface handle directly as the extended descriptor.
5647
*/
5648
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5649
} else {
5650
inst->desc = desc;
5651
const fs_builder ubld = bld.exec_all().group(1, 0);
5652
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5653
ubld.AND(tmp, surface, brw_imm_ud(0xff));
5654
inst->src[0] = component(tmp, 0);
5655
inst->src[1] = brw_imm_ud(0); /* ex_desc */
5656
}
5657
}
5658
5659
static void
5660
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5661
{
5662
const intel_device_info *devinfo = bld.shader->devinfo;
5663
5664
/* Get the logical send arguments. */
5665
const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5666
const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5667
const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5668
const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5669
const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5670
const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5671
const fs_reg &allow_sample_mask =
5672
inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
5673
assert(arg.file == IMM);
5674
assert(allow_sample_mask.file == IMM);
5675
5676
/* Calculate the total number of components of the payload. */
5677
const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5678
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5679
5680
const bool is_typed_access =
5681
inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
5682
inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
5683
inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
5684
5685
const bool is_surface_access = is_typed_access ||
5686
inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
5687
inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
5688
inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
5689
5690
const bool is_stateless =
5691
surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
5692
surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
5693
5694
const bool has_side_effects = inst->has_side_effects();
5695
5696
fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
5697
fs_reg(brw_imm_d(0xffff));
5698
5699
/* From the BDW PRM Volume 7, page 147:
5700
*
5701
* "For the Data Cache Data Port*, the header must be present for the
5702
* following message types: [...] Typed read/write/atomics"
5703
*
5704
* Earlier generations have a similar wording. Because of this restriction
5705
* we don't attempt to implement sample masks via predication for such
5706
* messages prior to Gfx9, since we have to provide a header anyway. On
5707
* Gfx11+ the header has been removed so we can only use predication.
5708
*
5709
* For all stateless A32 messages, we also need a header
5710
*/
5711
fs_reg header;
5712
if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
5713
fs_builder ubld = bld.exec_all().group(8, 0);
5714
header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5715
if (is_stateless) {
5716
assert(!is_surface_access);
5717
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
5718
} else {
5719
ubld.MOV(header, brw_imm_d(0));
5720
if (is_surface_access)
5721
ubld.group(1, 0).MOV(component(header, 7), sample_mask);
5722
}
5723
}
5724
const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
5725
5726
fs_reg payload, payload2;
5727
unsigned mlen, ex_mlen = 0;
5728
if (devinfo->ver >= 9 &&
5729
(src.file == BAD_FILE || header.file == BAD_FILE)) {
5730
/* We have split sends on gfx9 and above */
5731
if (header.file == BAD_FILE) {
5732
payload = bld.move_to_vgrf(addr, addr_sz);
5733
payload2 = bld.move_to_vgrf(src, src_sz);
5734
mlen = addr_sz * (inst->exec_size / 8);
5735
ex_mlen = src_sz * (inst->exec_size / 8);
5736
} else {
5737
assert(src.file == BAD_FILE);
5738
payload = header;
5739
payload2 = bld.move_to_vgrf(addr, addr_sz);
5740
mlen = header_sz;
5741
ex_mlen = addr_sz * (inst->exec_size / 8);
5742
}
5743
} else {
5744
/* Allocate space for the payload. */
5745
const unsigned sz = header_sz + addr_sz + src_sz;
5746
payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
5747
fs_reg *const components = new fs_reg[sz];
5748
unsigned n = 0;
5749
5750
/* Construct the payload. */
5751
if (header.file != BAD_FILE)
5752
components[n++] = header;
5753
5754
for (unsigned i = 0; i < addr_sz; i++)
5755
components[n++] = offset(addr, bld, i);
5756
5757
for (unsigned i = 0; i < src_sz; i++)
5758
components[n++] = offset(src, bld, i);
5759
5760
bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
5761
mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
5762
5763
delete[] components;
5764
}
5765
5766
/* Predicate the instruction on the sample mask if no header is
5767
* provided.
5768
*/
5769
if ((header.file == BAD_FILE || !is_surface_access) &&
5770
sample_mask.file != BAD_FILE && sample_mask.file != IMM)
5771
emit_predicate_on_sample_mask(bld, inst);
5772
5773
uint32_t sfid;
5774
switch (inst->opcode) {
5775
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5776
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5777
/* Byte scattered opcodes go through the normal data cache */
5778
sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
5779
break;
5780
5781
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5782
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5783
sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
5784
devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
5785
BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
5786
break;
5787
5788
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5789
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5790
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5791
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5792
/* Untyped Surface messages go through the data cache but the SFID value
5793
* changed on Haswell.
5794
*/
5795
sfid = (devinfo->verx10 >= 75 ?
5796
HSW_SFID_DATAPORT_DATA_CACHE_1 :
5797
GFX7_SFID_DATAPORT_DATA_CACHE);
5798
break;
5799
5800
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5801
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5802
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5803
/* Typed surface messages go through the render cache on IVB and the
5804
* data cache on HSW+.
5805
*/
5806
sfid = (devinfo->verx10 >= 75 ?
5807
HSW_SFID_DATAPORT_DATA_CACHE_1 :
5808
GFX6_SFID_DATAPORT_RENDER_CACHE);
5809
break;
5810
5811
default:
5812
unreachable("Unsupported surface opcode");
5813
}
5814
5815
uint32_t desc;
5816
switch (inst->opcode) {
5817
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5818
desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5819
arg.ud, /* num_channels */
5820
false /* write */);
5821
break;
5822
5823
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5824
desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5825
arg.ud, /* num_channels */
5826
true /* write */);
5827
break;
5828
5829
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5830
desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5831
arg.ud, /* bit_size */
5832
false /* write */);
5833
break;
5834
5835
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5836
desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5837
arg.ud, /* bit_size */
5838
true /* write */);
5839
break;
5840
5841
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5842
assert(arg.ud == 32); /* bit_size */
5843
desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5844
false /* write */);
5845
break;
5846
5847
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5848
assert(arg.ud == 32); /* bit_size */
5849
desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5850
true /* write */);
5851
break;
5852
5853
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5854
desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
5855
arg.ud, /* atomic_op */
5856
!inst->dst.is_null());
5857
break;
5858
5859
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5860
desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
5861
arg.ud, /* atomic_op */
5862
!inst->dst.is_null());
5863
break;
5864
5865
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5866
desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5867
arg.ud, /* num_channels */
5868
false /* write */);
5869
break;
5870
5871
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5872
desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5873
arg.ud, /* num_channels */
5874
true /* write */);
5875
break;
5876
5877
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5878
desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
5879
arg.ud, /* atomic_op */
5880
!inst->dst.is_null());
5881
break;
5882
5883
default:
5884
unreachable("Unknown surface logical instruction");
5885
}
5886
5887
/* Update the original instruction. */
5888
inst->opcode = SHADER_OPCODE_SEND;
5889
inst->mlen = mlen;
5890
inst->ex_mlen = ex_mlen;
5891
inst->header_size = header_sz;
5892
inst->send_has_side_effects = has_side_effects;
5893
inst->send_is_volatile = !has_side_effects;
5894
5895
/* Set up SFID and descriptors */
5896
inst->sfid = sfid;
5897
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
5898
5899
/* Finally, the payload */
5900
inst->src[2] = payload;
5901
inst->src[3] = payload2;
5902
5903
inst->resize_sources(4);
5904
}
5905
5906
static enum lsc_opcode
5907
brw_atomic_op_to_lsc_atomic_op(unsigned op)
5908
{
5909
switch(op) {
5910
case BRW_AOP_AND:
5911
return LSC_OP_ATOMIC_AND;
5912
case BRW_AOP_OR:
5913
return LSC_OP_ATOMIC_OR;
5914
case BRW_AOP_XOR:
5915
return LSC_OP_ATOMIC_XOR;
5916
case BRW_AOP_MOV:
5917
return LSC_OP_ATOMIC_STORE;
5918
case BRW_AOP_INC:
5919
return LSC_OP_ATOMIC_INC;
5920
case BRW_AOP_DEC:
5921
return LSC_OP_ATOMIC_DEC;
5922
case BRW_AOP_ADD:
5923
return LSC_OP_ATOMIC_ADD;
5924
case BRW_AOP_SUB:
5925
return LSC_OP_ATOMIC_SUB;
5926
case BRW_AOP_IMAX:
5927
return LSC_OP_ATOMIC_MAX;
5928
case BRW_AOP_IMIN:
5929
return LSC_OP_ATOMIC_MIN;
5930
case BRW_AOP_UMAX:
5931
return LSC_OP_ATOMIC_UMAX;
5932
case BRW_AOP_UMIN:
5933
return LSC_OP_ATOMIC_UMIN;
5934
case BRW_AOP_CMPWR:
5935
return LSC_OP_ATOMIC_CMPXCHG;
5936
default:
5937
assert(false);
5938
unreachable("invalid atomic opcode");
5939
}
5940
}
5941
5942
static enum lsc_opcode
5943
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
5944
{
5945
switch(aop) {
5946
case BRW_AOP_FMAX:
5947
return LSC_OP_ATOMIC_FMAX;
5948
case BRW_AOP_FMIN:
5949
return LSC_OP_ATOMIC_FMIN;
5950
case BRW_AOP_FCMPWR:
5951
return LSC_OP_ATOMIC_FCMPXCHG;
5952
default:
5953
unreachable("Unsupported float atomic opcode");
5954
}
5955
}
5956
5957
static enum lsc_data_size
5958
lsc_bits_to_data_size(unsigned bit_size)
5959
{
5960
switch (bit_size / 8) {
5961
case 1: return LSC_DATA_SIZE_D8U32;
5962
case 2: return LSC_DATA_SIZE_D16U32;
5963
case 4: return LSC_DATA_SIZE_D32;
5964
case 8: return LSC_DATA_SIZE_D64;
5965
default:
5966
unreachable("Unsupported data size.");
5967
}
5968
}
5969
5970
static void
5971
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5972
{
5973
const intel_device_info *devinfo = bld.shader->devinfo;
5974
assert(devinfo->has_lsc);
5975
5976
/* Get the logical send arguments. */
5977
const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5978
const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5979
const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5980
const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5981
const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5982
const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5983
const fs_reg allow_sample_mask =
5984
inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
5985
assert(arg.file == IMM);
5986
assert(allow_sample_mask.file == IMM);
5987
5988
/* Calculate the total number of components of the payload. */
5989
const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5990
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5991
5992
const bool has_side_effects = inst->has_side_effects();
5993
5994
unsigned ex_mlen = 0;
5995
fs_reg payload, payload2;
5996
payload = bld.move_to_vgrf(addr, addr_sz);
5997
if (src.file != BAD_FILE) {
5998
payload2 = bld.move_to_vgrf(src, src_sz);
5999
ex_mlen = src_sz * (inst->exec_size / 8);
6000
}
6001
6002
/* Predicate the instruction on the sample mask if needed */
6003
fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
6004
fs_reg(brw_imm_d(0xffff));
6005
if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
6006
emit_predicate_on_sample_mask(bld, inst);
6007
6008
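/* Shared local memory accesses go through the SLM data port; everything
 * else uses the untyped global memory (UGM) port.
 */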
if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6009
inst->sfid = GFX12_SFID_SLM;
6010
else
6011
inst->sfid = GFX12_SFID_UGM;
6012
6013
/* We must have exactly one of surface and surface_handle */
6014
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
6015
6016
enum lsc_addr_surface_type surf_type;
6017
if (surface_handle.file != BAD_FILE)
6018
surf_type = LSC_ADDR_SURFTYPE_BSS;
6019
else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6020
surf_type = LSC_ADDR_SURFTYPE_FLAT;
6021
else
6022
surf_type = LSC_ADDR_SURFTYPE_BTI;
6023
6024
switch (inst->opcode) {
6025
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
6026
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6027
surf_type, LSC_ADDR_SIZE_A32,
6028
1 /* num_coordinates */,
6029
LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6030
false /* transpose */,
6031
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6032
true /* has_dest */);
6033
break;
6034
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
6035
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6036
surf_type, LSC_ADDR_SIZE_A32,
6037
1 /* num_coordinates */,
6038
LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6039
false /* transpose */,
6040
LSC_CACHE_STORE_L1STATE_L3MOCS,
6041
false /* has_dest */);
6042
break;
6043
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
6044
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
6045
/* Bspec: Atomic instruction -> Cache section:
6046
*
6047
* Atomic messages are always forced to "un-cacheable" in the L1
6048
* cache.
6049
*/
6050
enum lsc_opcode opcode =
6051
inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
6052
brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
6053
brw_atomic_op_to_lsc_atomic_op(arg.ud);
6054
inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6055
surf_type, LSC_ADDR_SIZE_A32,
6056
1 /* num_coordinates */,
6057
LSC_DATA_SIZE_D32, 1 /* num_channels */,
6058
false /* transpose */,
6059
LSC_CACHE_STORE_L1UC_L3WB,
6060
!inst->dst.is_null());
6061
break;
6062
}
6063
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
6064
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6065
surf_type, LSC_ADDR_SIZE_A32,
6066
1 /* num_coordinates */,
6067
lsc_bits_to_data_size(arg.ud),
6068
1 /* num_channels */,
6069
false /* transpose */,
6070
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6071
true /* has_dest */);
6072
break;
6073
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
6074
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6075
surf_type, LSC_ADDR_SIZE_A32,
6076
1 /* num_coordinates */,
6077
lsc_bits_to_data_size(arg.ud),
6078
1 /* num_channels */,
6079
false /* transpose */,
6080
LSC_CACHE_STORE_L1STATE_L3MOCS,
6081
false /* has_dest */);
6082
break;
6083
default:
6084
unreachable("Unknown surface logical instruction");
6085
}
6086
6087
inst->src[0] = brw_imm_ud(0);
6088
6089
/* Set up extended descriptors */
6090
switch (surf_type) {
6091
case LSC_ADDR_SURFTYPE_FLAT:
6092
inst->src[1] = brw_imm_ud(0);
6093
break;
6094
case LSC_ADDR_SURFTYPE_BSS:
6095
/* We assume that the driver provided the handle in the top 20 bits so
6096
* we can use the surface handle directly as the extended descriptor.
6097
*/
6098
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
6099
break;
6100
case LSC_ADDR_SURFTYPE_BTI:
6101
if (surface.file == IMM) {
6102
inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
6103
} else {
6104
const fs_builder ubld = bld.exec_all().group(1, 0);
6105
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6106
ubld.SHL(tmp, surface, brw_imm_ud(24));
6107
inst->src[1] = component(tmp, 0);
6108
}
6109
break;
6110
default:
6111
unreachable("Unknown surface type");
6112
}
6113
6114
/* Update the original instruction. */
6115
inst->opcode = SHADER_OPCODE_SEND;
6116
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6117
inst->ex_mlen = ex_mlen;
6118
inst->header_size = 0;
6119
inst->send_has_side_effects = has_side_effects;
6120
inst->send_is_volatile = !has_side_effects;
6121
6122
/* Finally, the payload */
6123
inst->src[2] = payload;
6124
inst->src[3] = payload2;
6125
6126
inst->resize_sources(4);
6127
}
6128
6129
static void
6130
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
6131
{
6132
const intel_device_info *devinfo = bld.shader->devinfo;
6133
assert(devinfo->ver >= 9);
6134
6135
/* Get the logical send arguments. */
6136
const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
6137
const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
6138
const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
6139
const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
6140
const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
6141
assert(arg.file == IMM);
6142
assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
6143
assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
6144
6145
const bool is_stateless =
6146
surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
6147
surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
6148
6149
const bool has_side_effects = inst->has_side_effects();
6150
6151
const bool align_16B =
6152
inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
6153
6154
const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
6155
6156
/* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
6157
fs_builder ubld = bld.exec_all().group(8, 0);
6158
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6159
6160
if (is_stateless)
6161
ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
6162
else
6163
ubld.MOV(header, brw_imm_d(0));
6164
6165
/* Address in OWord units when aligned to OWords. */
6166
if (align_16B)
6167
ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
6168
else
6169
ubld.group(1, 0).MOV(component(header, 2), addr);
6170
6171
fs_reg data;
6172
unsigned ex_mlen = 0;
6173
if (write) {
6174
const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
6175
data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
6176
ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
6177
}
6178
6179
inst->opcode = SHADER_OPCODE_SEND;
6180
inst->mlen = 1;
6181
inst->ex_mlen = ex_mlen;
6182
inst->header_size = 1;
6183
inst->send_has_side_effects = has_side_effects;
6184
inst->send_is_volatile = !has_side_effects;
6185
6186
inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6187
6188
const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
6189
arg.ud, write);
6190
setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
6191
6192
inst->src[2] = header;
6193
inst->src[3] = data;
6194
6195
inst->resize_sources(4);
6196
}
6197
6198
static fs_reg
6199
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
6200
{
6201
const fs_builder ubld = bld.exec_all().group(8, 0);
6202
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6203
ubld.MOV(header, brw_imm_ud(0));
6204
6205
/* Use a 2-wide MOV to fill out the address */
6206
assert(type_sz(addr.type) == 8 && addr.stride == 0);
6207
fs_reg addr_vec2 = addr;
6208
addr_vec2.type = BRW_REGISTER_TYPE_UD;
6209
addr_vec2.stride = 1;
6210
ubld.group(2, 0).MOV(header, addr_vec2);
6211
6212
return header;
6213
}
6214
6215
static void
6216
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6217
{
6218
const intel_device_info *devinfo = bld.shader->devinfo;
6219
6220
/* Get the logical send arguments. */
6221
const fs_reg &addr = inst->src[0];
6222
const fs_reg &src = inst->src[1];
6223
const unsigned src_sz = type_sz(src.type);
6224
6225
const unsigned src_comps = inst->components_read(1);
6226
assert(inst->src[2].file == IMM);
6227
const unsigned arg = inst->src[2].ud;
6228
const bool has_side_effects = inst->has_side_effects();
6229
6230
/* If the surface message has side effects and we're a fragment shader, we
6231
* have to predicate with the sample mask to avoid helper invocations.
6232
*/
6233
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6234
emit_predicate_on_sample_mask(bld, inst);
6235
6236
fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6237
fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
6238
BRW_REGISTER_TYPE_UD);
6239
unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
6240
6241
switch (inst->opcode) {
6242
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6243
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6244
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6245
1 /* num_coordinates */,
6246
LSC_DATA_SIZE_D32, arg /* num_channels */,
6247
false /* transpose */,
6248
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6249
true /* has_dest */);
6250
break;
6251
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6252
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6253
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6254
1 /* num_coordinates */,
6255
LSC_DATA_SIZE_D32, arg /* num_channels */,
6256
false /* transpose */,
6257
LSC_CACHE_STORE_L1STATE_L3MOCS,
6258
false /* has_dest */);
6259
break;
6260
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6261
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6262
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6263
1 /* num_coordinates */,
6264
lsc_bits_to_data_size(arg),
6265
1 /* num_channels */,
6266
false /* transpose */,
6267
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6268
true /* has_dest */);
6269
break;
6270
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6271
inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6272
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6273
1 /* num_coordinates */,
6274
lsc_bits_to_data_size(arg),
6275
1 /* num_channels */,
6276
false /* transpose */,
6277
LSC_CACHE_STORE_L1STATE_L3MOCS,
6278
false /* has_dest */);
6279
break;
6280
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6281
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6282
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6283
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6284
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: {
6285
/* Bspec: Atomic instruction -> Cache section:
6286
*
6287
* Atomic messages are always forced to "un-cacheable" in the L1
6288
* cache.
6289
*/
6290
enum lsc_opcode opcode =
6291
(inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
6292
inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
6293
inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
6294
brw_atomic_op_to_lsc_atomic_op(arg) :
6295
brw_atomic_op_to_lsc_fatomic_op(arg);
6296
inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6297
LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6298
1 /* num_coordinates */,
6299
lsc_bits_to_data_size(src_sz * 8),
6300
1 /* num_channels */,
6301
false /* transpose */,
6302
LSC_CACHE_STORE_L1UC_L3WB,
6303
!inst->dst.is_null());
6304
break;
6305
}
6306
default:
6307
unreachable("Unknown A64 logical instruction");
6308
}
6309
6310
/* Update the original instruction. */
6311
inst->opcode = SHADER_OPCODE_SEND;
6312
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6313
inst->ex_mlen = ex_mlen;
6314
inst->header_size = 0;
6315
inst->send_has_side_effects = has_side_effects;
6316
inst->send_is_volatile = !has_side_effects;
6317
6318
/* Set up SFID and descriptors */
6319
inst->sfid = GFX12_SFID_UGM;
6320
inst->resize_sources(4);
6321
inst->src[0] = brw_imm_ud(0); /* desc */
6322
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6323
inst->src[2] = payload;
6324
inst->src[3] = payload2;
6325
}
6326
6327
static void
6328
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6329
{
6330
const intel_device_info *devinfo = bld.shader->devinfo;
6331
6332
const fs_reg &addr = inst->src[0];
6333
const fs_reg &src = inst->src[1];
6334
const unsigned src_comps = inst->components_read(1);
6335
assert(inst->src[2].file == IMM);
6336
const unsigned arg = inst->src[2].ud;
6337
const bool has_side_effects = inst->has_side_effects();
6338
6339
/* If the surface message has side effects and we're a fragment shader, we
6340
* have to predicate with the sample mask to avoid helper invocations.
6341
*/
6342
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6343
emit_predicate_on_sample_mask(bld, inst);
6344
6345
fs_reg payload, payload2;
6346
unsigned mlen, ex_mlen = 0, header_size = 0;
6347
if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
6348
inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
6349
inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
6350
assert(devinfo->ver >= 9);
6351
6352
/* OWORD messages only take a scalar address in a header */
6353
mlen = 1;
6354
header_size = 1;
6355
payload = emit_a64_oword_block_header(bld, addr);
6356
6357
if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
6358
ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6359
payload2 = retype(bld.move_to_vgrf(src, src_comps),
6360
BRW_REGISTER_TYPE_UD);
6361
}
6362
} else if (devinfo->ver >= 9) {
6363
/* On Skylake and above, we have SENDS */
6364
mlen = 2 * (inst->exec_size / 8);
6365
ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6366
payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6367
payload2 = retype(bld.move_to_vgrf(src, src_comps),
6368
BRW_REGISTER_TYPE_UD);
6369
} else {
6370
/* Add two because the address is 64-bit */
6371
const unsigned dwords = 2 + src_comps;
6372
mlen = dwords * (inst->exec_size / 8);
6373
6374
fs_reg sources[5];
6375
6376
sources[0] = addr;
6377
6378
for (unsigned i = 0; i < src_comps; i++)
6379
sources[1 + i] = offset(src, bld, i);
6380
6381
payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
6382
bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
6383
}
6384
6385
uint32_t desc;
6386
switch (inst->opcode) {
6387
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6388
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6389
arg, /* num_channels */
6390
false /* write */);
6391
break;
6392
6393
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6394
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6395
arg, /* num_channels */
6396
true /* write */);
6397
break;
6398
6399
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
6400
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6401
true, /* align_16B */
6402
arg, /* num_dwords */
6403
false /* write */);
6404
break;
6405
6406
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6407
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6408
false, /* align_16B */
6409
arg, /* num_dwords */
6410
false /* write */);
6411
break;
6412
6413
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
6414
desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6415
true, /* align_16B */
6416
arg, /* num_dwords */
6417
true /* write */);
6418
break;
6419
6420
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6421
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6422
arg, /* bit_size */
6423
false /* write */);
6424
break;
6425
6426
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6427
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6428
arg, /* bit_size */
6429
true /* write */);
6430
break;
6431
6432
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6433
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
6434
arg, /* atomic_op */
6435
!inst->dst.is_null());
6436
break;
6437
6438
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6439
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
6440
arg, /* atomic_op */
6441
!inst->dst.is_null());
6442
break;
6443
6444
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6445
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
6446
arg, /* atomic_op */
6447
!inst->dst.is_null());
6448
break;
6449
6450
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6451
desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6452
16, /* bit_size */
6453
arg, /* atomic_op */
6454
!inst->dst.is_null());
6455
break;
6456
6457
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6458
desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6459
32, /* bit_size */
6460
arg, /* atomic_op */
6461
!inst->dst.is_null());
6462
break;
6463
6464
default:
6465
unreachable("Unknown A64 logical instruction");
6466
}
6467
6468
/* Update the original instruction. */
6469
inst->opcode = SHADER_OPCODE_SEND;
6470
inst->mlen = mlen;
6471
inst->ex_mlen = ex_mlen;
6472
inst->header_size = header_size;
6473
inst->send_has_side_effects = has_side_effects;
6474
inst->send_is_volatile = !has_side_effects;
6475
6476
/* Set up SFID and descriptors */
6477
inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
6478
inst->desc = desc;
6479
inst->resize_sources(4);
6480
inst->src[0] = brw_imm_ud(0); /* desc */
6481
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6482
inst->src[2] = payload;
6483
inst->src[3] = payload2;
6484
}
6485
6486
static void
6487
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
6488
fs_inst *inst)
6489
{
6490
const intel_device_info *devinfo = bld.shader->devinfo;
6491
ASSERTED const brw_compiler *compiler = bld.shader->compiler;
6492
6493
fs_reg index = inst->src[0];
6494
6495
/* We are switching the instruction from an ALU-like instruction to a
6496
* send-from-grf instruction. Since sends can't handle strides or
6497
* source modifiers, we have to make a copy of the offset source.
6498
*/
6499
fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);
6500
6501
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6502
unsigned alignment = inst->src[2].ud;
6503
6504
inst->opcode = SHADER_OPCODE_SEND;
6505
inst->sfid = GFX12_SFID_UGM;
6506
inst->resize_sources(3);
6507
inst->src[0] = brw_imm_ud(0);
6508
6509
if (index.file == IMM) {
6510
inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
6511
} else {
6512
const fs_builder ubld = bld.exec_all().group(1, 0);
6513
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6514
ubld.SHL(tmp, index, brw_imm_ud(24));
6515
inst->src[1] = component(tmp, 0);
6516
}
6517
6518
assert(!compiler->indirect_ubos_use_sampler);
6519
6520
inst->src[2] = ubo_offset; /* payload */
6521
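/* Dword-aligned offsets can be fetched as a single D32 vec4 load; lower
 * alignments fall back to four scalar D32 loads below.
 */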
if (alignment >= 4) {
6522
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6523
LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6524
1 /* num_coordinates */,
6525
LSC_DATA_SIZE_D32,
6526
4 /* num_channels */,
6527
false /* transpose */,
6528
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6529
true /* has_dest */);
6530
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6531
} else {
6532
inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6533
LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6534
1 /* num_coordinates */,
6535
LSC_DATA_SIZE_D32,
6536
1 /* num_channels */,
6537
false /* transpose */,
6538
LSC_CACHE_LOAD_L1STATE_L3MOCS,
6539
true /* has_dest */);
6540
inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6541
/* The byte scattered messages can only read one dword at a time so
6542
* we have to duplicate the message 4 times to read the full vec4.
6543
* Hopefully, dead code will clean up the mess if some of them aren't
6544
* needed.
6545
*/
6546
assert(inst->size_written == 16 * inst->exec_size);
6547
inst->size_written /= 4;
6548
for (unsigned c = 1; c < 4; c++) {
6549
/* Emit a copy of the instruction because we're about to modify
6550
* it. Because this loop starts at 1, we will emit copies for the
6551
* first 3 and the final one will be the modified instruction.
6552
*/
6553
bld.emit(*inst);
6554
6555
/* Offset the source */
6556
inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6557
bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6558
6559
/* Offset the destination */
6560
inst->dst = offset(inst->dst, bld, 1);
6561
}
6562
}
6563
}
6564
6565
static void
6566
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
6567
{
6568
const intel_device_info *devinfo = bld.shader->devinfo;
6569
const brw_compiler *compiler = bld.shader->compiler;
6570
6571
if (devinfo->ver >= 7) {
6572
fs_reg index = inst->src[0];
6573
/* We are switching the instruction from an ALU-like instruction to a
6574
* send-from-grf instruction. Since sends can't handle strides or
6575
* source modifiers, we have to make a copy of the offset source.
6576
*/
6577
fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
6578
bld.MOV(ubo_offset, inst->src[1]);
6579
6580
assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6581
unsigned alignment = inst->src[2].ud;
6582
6583
inst->opcode = SHADER_OPCODE_SEND;
6584
inst->mlen = inst->exec_size / 8;
6585
inst->resize_sources(3);
6586
6587
if (index.file == IMM) {
6588
inst->desc = index.ud & 0xff;
6589
inst->src[0] = brw_imm_ud(0);
6590
} else {
6591
inst->desc = 0;
6592
const fs_builder ubld = bld.exec_all().group(1, 0);
6593
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6594
ubld.AND(tmp, index, brw_imm_ud(0xff));
6595
inst->src[0] = component(tmp, 0);
6596
}
6597
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6598
inst->src[2] = ubo_offset; /* payload */
6599
6600
if (compiler->indirect_ubos_use_sampler) {
6601
const unsigned simd_mode =
6602
inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
6603
BRW_SAMPLER_SIMD_MODE_SIMD16;
6604
6605
inst->sfid = BRW_SFID_SAMPLER;
6606
inst->desc |= brw_sampler_desc(devinfo, 0, 0,
6607
GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
6608
simd_mode, 0);
6609
} else if (alignment >= 4) {
6610
inst->sfid = (devinfo->verx10 >= 75 ?
6611
HSW_SFID_DATAPORT_DATA_CACHE_1 :
6612
GFX7_SFID_DATAPORT_DATA_CACHE);
6613
inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
6614
4, /* num_channels */
6615
false /* write */);
6616
} else {
6617
inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6618
inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
6619
32, /* bit_size */
6620
false /* write */);
6621
/* The byte scattered messages can only read one dword at a time so
6622
* we have to duplicate the message 4 times to read the full vec4.
6623
* Hopefully, dead code will clean up the mess if some of them aren't
6624
* needed.
6625
*/
6626
assert(inst->size_written == 16 * inst->exec_size);
6627
inst->size_written /= 4;
6628
for (unsigned c = 1; c < 4; c++) {
6629
/* Emit a copy of the instruction because we're about to modify
6630
* it. Because this loop starts at 1, we will emit copies for the
6631
* first 3 and the final one will be the modified instruction.
6632
*/
6633
bld.emit(*inst);
6634
6635
/* Offset the source */
6636
inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6637
bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6638
6639
/* Offset the destination */
6640
inst->dst = offset(inst->dst, bld, 1);
6641
}
6642
}
6643
} else {
6644
const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
6645
BRW_REGISTER_TYPE_UD);
6646
6647
bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
6648
6649
inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
6650
inst->resize_sources(1);
6651
inst->base_mrf = payload.nr;
6652
inst->header_size = 1;
6653
inst->mlen = 1 + inst->exec_size / 8;
6654
}
6655
}
6656
6657
static void
6658
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
6659
{
6660
assert(bld.shader->devinfo->ver < 6);
6661
6662
inst->base_mrf = 2;
6663
inst->mlen = inst->sources * inst->exec_size / 8;
6664
6665
if (inst->sources > 1) {
6666
/* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
6667
* "Message Payload":
6668
*
6669
* "Operand0[7]. For the INT DIV functions, this operand is the
6670
* denominator."
6671
* ...
6672
* "Operand1[7]. For the INT DIV functions, this operand is the
6673
* numerator."
6674
*/
6675
const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
6676
const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
6677
const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
6678
6679
inst->resize_sources(1);
6680
inst->src[0] = src0;
6681
6682
assert(inst->exec_size == 8);
6683
bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
6684
}
6685
}
6686
6687
static void
6688
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
6689
{
6690
const intel_device_info *devinfo = bld.shader->devinfo;
6691
fs_reg global_addr = inst->src[0];
6692
const fs_reg &btd_record = inst->src[1];
6693
6694
const unsigned mlen = 2;
6695
const fs_builder ubld = bld.exec_all().group(8, 0);
6696
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
6697
6698
ubld.MOV(header, brw_imm_ud(0));
6699
switch (inst->opcode) {
6700
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
6701
assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
6702
global_addr.type = BRW_REGISTER_TYPE_UD;
6703
global_addr.stride = 1;
6704
ubld.group(2, 0).MOV(header, global_addr);
6705
break;
6706
6707
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
6708
/* The bottom bit is the Stack ID release bit */
6709
ubld.group(1, 0).MOV(header, brw_imm_ud(1));
6710
break;
6711
6712
default:
6713
unreachable("Invalid BTD message");
6714
}
6715
6716
/* Stack IDs are always in R1 regardless of whether we're coming from a
6717
* bindless shader or a regular compute shader.
6718
*/
6719
fs_reg stack_ids =
6720
retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
6721
bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
6722
6723
unsigned ex_mlen = 0;
6724
fs_reg payload;
6725
if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
6726
ex_mlen = 2 * (inst->exec_size / 8);
6727
payload = bld.move_to_vgrf(btd_record, 1);
6728
} else {
6729
assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
6730
/* All these messages take a BTD and things complain if we don't provide
6731
* one for RETIRE. However, it shouldn't ever actually get used so fill
6732
* it with zero.
6733
*/
6734
ex_mlen = 2 * (inst->exec_size / 8);
6735
payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
6736
}
6737
6738
/* Update the original instruction. */
6739
inst->opcode = SHADER_OPCODE_SEND;
6740
inst->mlen = mlen;
6741
inst->ex_mlen = ex_mlen;
6742
inst->header_size = 0; /* HW docs require has_header = false */
6743
inst->send_has_side_effects = true;
6744
inst->send_is_volatile = false;
6745
6746
/* Set up SFID and descriptors */
6747
inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
6748
inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
6749
GEN_RT_BTD_MESSAGE_SPAWN);
6750
inst->resize_sources(4);
6751
inst->src[0] = brw_imm_ud(0); /* desc */
6752
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6753
inst->src[2] = header;
6754
inst->src[3] = payload;
6755
}
6756
6757
static void
6758
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
6759
{
6760
const intel_device_info *devinfo = bld.shader->devinfo;
6761
const fs_reg &bvh_level = inst->src[0];
6762
assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
6763
const uint32_t trace_ray_control = inst->src[1].ud;
6764
6765
const unsigned mlen = 1;
6766
const fs_builder ubld = bld.exec_all().group(8, 0);
6767
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6768
ubld.MOV(header, brw_imm_ud(0));
6769
ubld.group(2, 0).MOV(header,
6770
retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
6771
/* TODO: Bit 128 is ray_query */
6772
6773
const unsigned ex_mlen = inst->exec_size / 8;
6774
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
6775
const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
6776
if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
6777
bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
6778
} else {
6779
bld.AND(payload, bvh_level, brw_imm_ud(0x7));
6780
if (trc_bits != 0)
6781
bld.OR(payload, payload, brw_imm_ud(trc_bits));
6782
}
6783
bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
6784
retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
6785
brw_imm_uw(0x7ff));
6786
6787
/* Update the original instruction. */
6788
inst->opcode = SHADER_OPCODE_SEND;
6789
inst->mlen = mlen;
6790
inst->ex_mlen = ex_mlen;
6791
inst->header_size = 0; /* HW docs require has_header = false */
6792
inst->send_has_side_effects = true;
6793
inst->send_is_volatile = false;
6794
6795
/* Set up SFID and descriptors */
6796
inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
6797
inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
6798
inst->resize_sources(4);
6799
inst->src[0] = brw_imm_ud(0); /* desc */
6800
inst->src[1] = brw_imm_ud(0); /* ex_desc */
6801
inst->src[2] = header;
6802
inst->src[3] = payload;
6803
}
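/* Illustrative aside (not part of the Mesa build): a hedged sketch of the
 * per-channel payload dword assembled above, inferred from the masks and
 * offsets in the code (AND 0x7, SET_BITS(..., 9, 8), UW subword 1 masked
 * with 0x7ff) and assuming SET_BITS(value, high, low) places the value at
 * bits high:low.  The helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <stdint.h>

/* BVH level in the low 3 bits, trace-ray control in bits 9:8, and the
 * per-channel stack ID (taken from R1) in the low 11 bits of the high
 * word. */
static uint32_t
trace_ray_payload_dw(uint32_t bvh_level, uint32_t trace_ray_control,
                     uint16_t stack_id)
{
   return (bvh_level & 0x7) |
          ((trace_ray_control & 0x3) << 8) |
          (uint32_t(stack_id & 0x7ff) << 16);
}
#endif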
6804
6805
bool
6806
fs_visitor::lower_logical_sends()
6807
{
6808
bool progress = false;
6809
6810
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
6811
const fs_builder ibld(this, block, inst);
6812
6813
switch (inst->opcode) {
6814
case FS_OPCODE_FB_WRITE_LOGICAL:
6815
assert(stage == MESA_SHADER_FRAGMENT);
6816
lower_fb_write_logical_send(ibld, inst,
6817
brw_wm_prog_data(prog_data),
6818
(const brw_wm_prog_key *)key,
6819
payload);
6820
break;
6821
6822
case FS_OPCODE_FB_READ_LOGICAL:
6823
lower_fb_read_logical_send(ibld, inst);
6824
break;
6825
6826
case SHADER_OPCODE_TEX_LOGICAL:
6827
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
6828
break;
6829
6830
case SHADER_OPCODE_TXD_LOGICAL:
6831
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
6832
break;
6833
6834
case SHADER_OPCODE_TXF_LOGICAL:
6835
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
6836
break;
6837
6838
case SHADER_OPCODE_TXL_LOGICAL:
6839
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
6840
break;
6841
6842
case SHADER_OPCODE_TXS_LOGICAL:
6843
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
6844
break;
6845
6846
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
6847
lower_sampler_logical_send(ibld, inst,
6848
SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
6849
break;
6850
6851
case FS_OPCODE_TXB_LOGICAL:
6852
lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
6853
break;
6854
6855
case SHADER_OPCODE_TXF_CMS_LOGICAL:
6856
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
6857
break;
6858
6859
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
6860
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
6861
break;
6862
6863
case SHADER_OPCODE_TXF_UMS_LOGICAL:
6864
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
6865
break;
6866
6867
case SHADER_OPCODE_TXF_MCS_LOGICAL:
6868
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
6869
break;
6870
6871
case SHADER_OPCODE_LOD_LOGICAL:
6872
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
6873
break;
6874
6875
case SHADER_OPCODE_TG4_LOGICAL:
6876
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
6877
break;
6878
6879
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
6880
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
6881
break;
6882
6883
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
6884
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
6885
break;
6886
6887
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
6888
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
6889
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
6890
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
6891
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
6892
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
6893
if (devinfo->has_lsc) {
6894
lower_lsc_surface_logical_send(ibld, inst);
6895
break;
6896
}
6897
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
6898
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
6899
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
6900
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
6901
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
6902
lower_surface_logical_send(ibld, inst);
6903
break;
6904
6905
case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
6906
case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6907
case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
6908
lower_surface_block_logical_send(ibld, inst);
6909
break;
6910
6911
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6912
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6913
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6914
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6915
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6916
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6917
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6918
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6919
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6920
if (devinfo->has_lsc) {
6921
lower_lsc_a64_logical_send(ibld, inst);
6922
break;
6923
}
6924
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
6925
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6926
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
6927
lower_a64_logical_send(ibld, inst);
6928
break;
6929
6930
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
6931
if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
6932
lower_lsc_varying_pull_constant_logical_send(ibld, inst);
6933
else
6934
lower_varying_pull_constant_logical_send(ibld, inst);
6935
break;
6936
6937
case SHADER_OPCODE_RCP:
6938
case SHADER_OPCODE_RSQ:
6939
case SHADER_OPCODE_SQRT:
6940
case SHADER_OPCODE_EXP2:
6941
case SHADER_OPCODE_LOG2:
6942
case SHADER_OPCODE_SIN:
6943
case SHADER_OPCODE_COS:
6944
case SHADER_OPCODE_POW:
6945
case SHADER_OPCODE_INT_QUOTIENT:
6946
case SHADER_OPCODE_INT_REMAINDER:
6947
/* The math opcodes are overloaded for the send-like and
6948
* expression-like instructions which seems kind of icky. Gfx6+ has
6949
* a native (but rather quirky) MATH instruction so we don't need to
6950
* do anything here. On Gfx4-5 we'll have to lower the Gfx6-like
6951
* logical instructions (which we can easily recognize because they
6952
* have mlen = 0) into send-like virtual instructions.
6953
*/
6954
if (devinfo->ver < 6 && inst->mlen == 0) {
6955
lower_math_logical_send(ibld, inst);
6956
break;
6957
6958
} else {
6959
continue;
6960
}
6961
6962
case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
6963
case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
6964
lower_btd_logical_send(ibld, inst);
6965
break;
6966
6967
case RT_OPCODE_TRACE_RAY_LOGICAL:
6968
lower_trace_ray_logical_send(ibld, inst);
6969
break;
6970
6971
default:
6972
continue;
6973
}
6974
6975
progress = true;
6976
}
6977
6978
if (progress)
6979
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6980
6981
return progress;
6982
}
6983
6984
static bool
6985
is_mixed_float_with_fp32_dst(const fs_inst *inst)
6986
{
6987
/* This opcode sometimes uses :W type on the source even if the operand is
6988
* a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
6989
*/
6990
if (inst->opcode == BRW_OPCODE_F16TO32)
6991
return true;
6992
6993
if (inst->dst.type != BRW_REGISTER_TYPE_F)
6994
return false;
6995
6996
for (int i = 0; i < inst->sources; i++) {
6997
if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
6998
return true;
6999
}
7000
7001
return false;
7002
}
7003
7004
static bool
7005
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
7006
{
7007
/* This opcode sometimes uses :W type on the destination even if the
7008
* destination is a :HF, because in gfx7 there is no support for :HF, and
7009
* thus it uses :W.
7010
*/
7011
if (inst->opcode == BRW_OPCODE_F32TO16 &&
7012
inst->dst.stride == 1)
7013
return true;
7014
7015
if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
7016
inst->dst.stride != 1)
7017
return false;
7018
7019
for (int i = 0; i < inst->sources; i++) {
7020
if (inst->src[i].type == BRW_REGISTER_TYPE_F)
7021
return true;
7022
}
7023
7024
return false;
7025
}
7026
7027
/**
7028
* Get the closest allowed SIMD width for instruction \p inst accounting for
7029
* some common regioning and execution control restrictions that apply to FPU
7030
* instructions. These restrictions don't necessarily have any relevance to
7031
* instructions not executed by the FPU pipeline like extended math, control
7032
* flow or send message instructions.
7033
*
7034
* For virtual opcodes it's really up to the instruction -- In some cases
7035
* (e.g. where a virtual instruction unrolls into a simple sequence of FPU
7036
* instructions) it may simplify virtual instruction lowering if we can
7037
* enforce FPU-like regioning restrictions already on the virtual instruction,
7038
* in other cases (e.g. virtual send-like instructions) this may be
7039
* excessively restrictive.
7040
*/
7041
static unsigned
7042
get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,
7043
const fs_inst *inst)
7044
{
7045
/* Maximum execution size representable in the instruction controls. */
7046
unsigned max_width = MIN2(32, inst->exec_size);
7047
7048
/* According to the PRMs:
7049
* "A. In Direct Addressing mode, a source cannot span more than 2
7050
* adjacent GRF registers.
7051
* B. A destination cannot span more than 2 adjacent GRF registers."
7052
*
7053
* Look for the source or destination with the largest register region
7054
* which is the one that is going to limit the overall execution size of
7055
* the instruction due to this rule.
7056
*/
7057
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
7058
7059
for (unsigned i = 0; i < inst->sources; i++)
7060
reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
7061
7062
/* Calculate the maximum execution size of the instruction based on the
7063
* factor by which it goes over the hardware limit of 2 GRFs.
7064
*/
7065
if (reg_count > 2)
7066
max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));
7067
7068
/* According to the IVB PRMs:
7069
* "When destination spans two registers, the source MUST span two
7070
* registers. The exception to the above rule:
7071
*
7072
* - When source is scalar, the source registers are not incremented.
7073
* - When source is packed integer Word and destination is packed
7074
* integer DWord, the source register is not incremented but the
7075
* source sub register is incremented."
7076
*
7077
* The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
7078
* restrictions. The code below intentionally doesn't check whether the
7079
* destination type is integer because empirically the hardware doesn't
7080
* seem to care what the actual type is as long as it's dword-aligned.
7081
*/
7082
if (devinfo->ver < 8) {
7083
for (unsigned i = 0; i < inst->sources; i++) {
7084
/* IVB implements DF scalars as <0;2,1> regions. */
7085
const bool is_scalar_exception = is_uniform(inst->src[i]) &&
7086
(devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
7087
const bool is_packed_word_exception =
7088
type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
7089
type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
7090
7091
/* We check size_read(i) against size_written instead of REG_SIZE
7092
* because we want to properly handle SIMD32. In SIMD32, you can end
7093
* up with writes to 4 registers and a source that reads 2 registers
7094
* and we may still need to lower all the way to SIMD8 in that case.
7095
*/
7096
if (inst->size_written > REG_SIZE &&
7097
inst->size_read(i) != 0 &&
7098
inst->size_read(i) < inst->size_written &&
7099
!is_scalar_exception && !is_packed_word_exception) {
7100
const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
7101
max_width = MIN2(max_width, inst->exec_size / reg_count);
7102
}
7103
}
7104
}
7105
7106
if (devinfo->ver < 6) {
7107
/* From the G45 PRM, Volume 4 Page 361:
7108
*
7109
* "Operand Alignment Rule: With the exceptions listed below, a
7110
* source/destination operand in general should be aligned to even
7111
* 256-bit physical register with a region size equal to two 256-bit
7112
* physical registers."
7113
*
7114
* Normally we enforce this by allocating virtual registers to the
7115
* even-aligned class. But we need to handle payload registers.
7116
*/
7117
for (unsigned i = 0; i < inst->sources; i++) {
7118
if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
7119
inst->size_read(i) > REG_SIZE) {
7120
max_width = MIN2(max_width, 8);
7121
}
7122
}
7123
}
7124
7125
/* From the IVB PRMs:
7126
* "When an instruction is SIMD32, the low 16 bits of the execution mask
7127
* are applied for both halves of the SIMD32 instruction. If different
7128
* execution mask channels are required, split the instruction into two
7129
* SIMD16 instructions."
7130
*
7131
* There is similar text in the HSW PRMs. Gfx4-6 don't even implement
7132
* 32-wide control flow support in hardware and will behave similarly.
7133
*/
7134
if (devinfo->ver < 8 && !inst->force_writemask_all)
7135
max_width = MIN2(max_width, 16);
7136
7137
/* From the IVB PRMs (applies to HSW too):
7138
* "Instructions with condition modifiers must not use SIMD32."
7139
*
7140
* From the BDW PRMs (applies to later hardware too):
7141
* "Ternary instruction with condition modifiers must not use SIMD32."
7142
*/
7143
if (inst->conditional_mod && (devinfo->ver < 8 || inst->is_3src(devinfo)))
7144
max_width = MIN2(max_width, 16);
7145
7146
/* From the IVB PRMs (applies to other devices that don't have the
7147
* intel_device_info::supports_simd16_3src flag set):
7148
* "In Align16 access mode, SIMD16 is not allowed for DW operations and
7149
* SIMD8 is not allowed for DF operations."
7150
*/
7151
if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
7152
max_width = MIN2(max_width, inst->exec_size / reg_count);
7153
7154
/* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
7155
* the 8-bit quarter of the execution mask signals specified in the
7156
* instruction control fields) for the second compressed half of any
7157
* single-precision instruction (for double-precision instructions
7158
* it's hardwired to use NibCtrl+1, at least on HSW), which means that
7159
* the EU will apply the wrong execution controls for the second
7160
* sequential GRF write if the number of channels per GRF is not exactly
7161
* eight in single-precision mode (or four in double-float mode).
7162
*
7163
* In this situation we calculate the maximum size of the split
7164
* instructions so they only ever write to a single register.
7165
*/
7166
if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
7167
!inst->force_writemask_all) {
7168
const unsigned channels_per_grf = inst->exec_size /
7169
DIV_ROUND_UP(inst->size_written, REG_SIZE);
7170
const unsigned exec_type_size = get_exec_type_size(inst);
7171
assert(exec_type_size);
7172
7173
/* The hardware shifts exactly 8 channels per compressed half of the
7174
* instruction in single-precision mode and exactly 4 in double-precision.
7175
*/
7176
if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
7177
max_width = MIN2(max_width, channels_per_grf);
7178
7179
/* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
7180
* because HW applies the same channel enable signals to both halves of
7181
* the compressed instruction which will be just wrong under
7182
* non-uniform control flow.
7183
*/
7184
if (devinfo->verx10 == 70 &&
7185
(exec_type_size == 8 || type_sz(inst->dst.type) == 8))
7186
max_width = MIN2(max_width, 4);
7187
}
7188
7189
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
7190
* Float Operations:
7191
*
7192
* "No SIMD16 in mixed mode when destination is f32. Instruction
7193
* execution size must be no more than 8."
7194
*
7195
* FIXME: the simulator doesn't seem to complain if we don't do this and
7196
* empirical testing with existing CTS tests show that they pass just fine
7197
* without implementing this, however, since our interpretation of the PRM
7198
* is that conversion MOVs between HF and F are still mixed-float
7199
* instructions (and therefore subject to this restriction) we decided to
7200
* split them to be safe. Might be useful to do additional investigation to
7201
* lift the restriction if we can ensure that it is safe though, since these
7202
* conversions are common when half-float types are involved since many
7203
* instructions do not support HF types and conversions from/to F are
7204
* required.
7205
*/
7206
if (is_mixed_float_with_fp32_dst(inst))
7207
max_width = MIN2(max_width, 8);
7208
7209
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
7210
* Float Operations:
7211
*
7212
* "No SIMD16 in mixed mode when destination is packed f16 for both
7213
* Align1 and Align16."
7214
*/
7215
if (is_mixed_float_with_packed_fp16_dst(inst))
7216
max_width = MIN2(max_width, 8);
7217
7218
/* Only power-of-two execution sizes are representable in the instruction
7219
* control fields.
7220
*/
7221
return 1 << util_logbase2(max_width);
7222
}
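/* Illustrative aside (not part of the Mesa build): a standalone worked
 * example of the two core calculations above -- the 2-GRF span rule and
 * the final power-of-two rounding -- assuming 32-byte GRFs and ignoring
 * the generation-specific exceptions.  The function names are
 * hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <assert.h>

/* A SIMD16 operation on a 64-bit type writes 16 * 8 = 128 bytes = 4 GRFs,
 * so it must be lowered by a factor of ceil(4 / 2) = 2, i.e. to SIMD8. */
static unsigned
fpu_width_for_span(unsigned exec_size, unsigned type_size)
{
   const unsigned reg_size = 32;
   const unsigned reg_count =
      (exec_size * type_size + reg_size - 1) / reg_size;
   unsigned max_width = exec_size;
   if (reg_count > 2)
      max_width = exec_size / ((reg_count + 1) / 2);
   /* Only power-of-two widths are encodable; round down. */
   unsigned width = 1;
   while (width * 2 <= max_width)
      width *= 2;
   return width;
}

static void fpu_width_examples(void)
{
   assert(fpu_width_for_span(16, 8) == 8);  /* SIMD16 DF -> SIMD8   */
   assert(fpu_width_for_span(16, 4) == 16); /* SIMD16 F fits 2 GRFs */
   assert(fpu_width_for_span(32, 4) == 16); /* SIMD32 F -> SIMD16   */
}
#endif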
7223
7224
/**
7225
* Get the maximum allowed SIMD width for instruction \p inst accounting for
7226
* various payload size restrictions that apply to sampler message
7227
* instructions.
7228
*
7229
* This is only intended to provide a maximum theoretical bound for the
7230
* execution size of the message based on the number of argument components
7231
* alone, which in most cases will determine whether the SIMD8 or SIMD16
7232
* variant of the message can be used, though some messages may have
7233
* additional restrictions not accounted for here (e.g. pre-ILK hardware uses
7234
* the message length to determine the exact SIMD width and argument count,
7235
* which makes a number of sampler message combinations impossible to
7236
* represent).
7237
*/
7238
static unsigned
7239
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
7240
const fs_inst *inst)
7241
{
7242
/* If we have a min_lod parameter on anything other than a simple sample
7243
* message, it will push it over 5 arguments and we have to fall back to
7244
* SIMD8.
7245
*/
7246
if (inst->opcode != SHADER_OPCODE_TEX &&
7247
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
7248
return 8;
7249
7250
/* Calculate the number of coordinate components that have to be present
7251
* assuming that additional arguments follow the texel coordinates in the
7252
* message payload. On IVB+ there is no need for padding, on ILK-SNB we
7253
* need to pad to four or three components depending on the message,
7254
* pre-ILK we need to pad to at most three components.
7255
*/
7256
const unsigned req_coord_components =
7257
(devinfo->ver >= 7 ||
7258
!inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
7259
(devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
7260
inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
7261
3;
7262
7263
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
7264
* variant of the TXL or TXF message.
7265
*/
7266
const bool implicit_lod = devinfo->ver >= 9 &&
7267
(inst->opcode == SHADER_OPCODE_TXL ||
7268
inst->opcode == SHADER_OPCODE_TXF) &&
7269
inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
7270
7271
/* Calculate the total number of argument components that need to be passed
7272
* to the sampler unit.
7273
*/
7274
const unsigned num_payload_components =
7275
MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
7276
req_coord_components) +
7277
inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
7278
(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
7279
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
7280
inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
7281
(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
7282
inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
7283
inst->components_read(TEX_LOGICAL_SRC_MCS);
7284
7285
/* SIMD16 messages with more than five arguments exceed the maximum message
7286
* size supported by the sampler, regardless of whether a header is
7287
* provided or not.
7288
*/
7289
return MIN2(inst->exec_size,
7290
num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
7291
}
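/* Illustrative aside (not part of the Mesa build): a worked example of the
 * argument-count rule above, assuming the SIMD16 limit works out to five
 * arguments (MAX_SAMPLER_MESSAGE_SIZE / 2 with the usual value of 11).
 * A TXL on a 2D-array shadow sampler on IVB+ needs coord(3) + shadow_c(1)
 * + lod(1) = 5 components, which still fits SIMD16, while an extra
 * gradient or min_lod argument pushes it past 5 and forces SIMD8.  The
 * helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static unsigned
sampler_width_for_components(unsigned exec_size,
                             unsigned num_payload_components)
{
   const unsigned simd16_arg_limit = 5;
   const unsigned cap = num_payload_components > simd16_arg_limit ? 8 : 16;
   return exec_size < cap ? exec_size : cap;
}
#endif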
7292
7293
/**
7294
* Get the closest native SIMD width supported by the hardware for instruction
7295
* \p inst. The instruction will be left untouched by
7296
* fs_visitor::lower_simd_width() if the returned value is equal to the
7297
* original execution size.
7298
*/
7299
static unsigned
7300
get_lowered_simd_width(const struct intel_device_info *devinfo,
7301
const fs_inst *inst)
7302
{
7303
switch (inst->opcode) {
7304
case BRW_OPCODE_MOV:
7305
case BRW_OPCODE_SEL:
7306
case BRW_OPCODE_NOT:
7307
case BRW_OPCODE_AND:
7308
case BRW_OPCODE_OR:
7309
case BRW_OPCODE_XOR:
7310
case BRW_OPCODE_SHR:
7311
case BRW_OPCODE_SHL:
7312
case BRW_OPCODE_ASR:
7313
case BRW_OPCODE_ROR:
7314
case BRW_OPCODE_ROL:
7315
case BRW_OPCODE_CMPN:
7316
case BRW_OPCODE_CSEL:
7317
case BRW_OPCODE_F32TO16:
7318
case BRW_OPCODE_F16TO32:
7319
case BRW_OPCODE_BFREV:
7320
case BRW_OPCODE_BFE:
7321
case BRW_OPCODE_ADD:
7322
case BRW_OPCODE_MUL:
7323
case BRW_OPCODE_AVG:
7324
case BRW_OPCODE_FRC:
7325
case BRW_OPCODE_RNDU:
7326
case BRW_OPCODE_RNDD:
7327
case BRW_OPCODE_RNDE:
7328
case BRW_OPCODE_RNDZ:
7329
case BRW_OPCODE_LZD:
7330
case BRW_OPCODE_FBH:
7331
case BRW_OPCODE_FBL:
7332
case BRW_OPCODE_CBIT:
7333
case BRW_OPCODE_SAD2:
7334
case BRW_OPCODE_MAD:
7335
case BRW_OPCODE_LRP:
7336
case FS_OPCODE_PACK:
7337
case SHADER_OPCODE_SEL_EXEC:
7338
case SHADER_OPCODE_CLUSTER_BROADCAST:
7339
case SHADER_OPCODE_MOV_RELOC_IMM:
7340
return get_fpu_lowered_simd_width(devinfo, inst);
7341
7342
case BRW_OPCODE_CMP: {
7343
/* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
7344
* when the destination is a GRF the dependency-clear bit on the flag
7345
* register is cleared early.
7346
*
7347
* Suggested workarounds are to disable coissuing CMP instructions
7348
* or to split CMP(16) instructions into two CMP(8) instructions.
7349
*
7350
* We choose to split into CMP(8) instructions since disabling
7351
* coissuing would affect CMP instructions not otherwise affected by
7352
* the errata.
7353
*/
7354
const unsigned max_width = (devinfo->verx10 == 70 &&
7355
!inst->dst.is_null() ? 8 : ~0);
7356
return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
7357
}
7358
case BRW_OPCODE_BFI1:
7359
case BRW_OPCODE_BFI2:
7360
/* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
7361
* should
7362
* "Force BFI instructions to be executed always in SIMD8."
7363
*/
7364
return MIN2(devinfo->is_haswell ? 8 : ~0u,
7365
get_fpu_lowered_simd_width(devinfo, inst));
7366
7367
case BRW_OPCODE_IF:
7368
assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
7369
return inst->exec_size;
7370
7371
case SHADER_OPCODE_RCP:
7372
case SHADER_OPCODE_RSQ:
7373
case SHADER_OPCODE_SQRT:
7374
case SHADER_OPCODE_EXP2:
7375
case SHADER_OPCODE_LOG2:
7376
case SHADER_OPCODE_SIN:
7377
case SHADER_OPCODE_COS: {
7378
/* Unary extended math instructions are limited to SIMD8 on Gfx4 and
7379
* Gfx6. Extended Math Function is limited to SIMD8 with half-float.
7380
*/
7381
if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))
7382
return MIN2(8, inst->exec_size);
7383
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7384
return MIN2(8, inst->exec_size);
7385
return MIN2(16, inst->exec_size);
7386
}
7387
7388
case SHADER_OPCODE_POW: {
7389
/* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
7390
* to SIMD8 with half-float
7391
*/
7392
if (devinfo->ver < 7)
7393
return MIN2(8, inst->exec_size);
7394
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7395
return MIN2(8, inst->exec_size);
7396
return MIN2(16, inst->exec_size);
7397
}
7398
7399
case SHADER_OPCODE_USUB_SAT:
7400
case SHADER_OPCODE_ISUB_SAT:
7401
return get_fpu_lowered_simd_width(devinfo, inst);
7402
7403
case SHADER_OPCODE_INT_QUOTIENT:
7404
case SHADER_OPCODE_INT_REMAINDER:
7405
/* Integer division is limited to SIMD8 on all generations. */
7406
return MIN2(8, inst->exec_size);
7407
7408
case FS_OPCODE_LINTERP:
7409
case SHADER_OPCODE_GET_BUFFER_SIZE:
7410
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
7411
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
7412
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7413
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7414
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
7415
return MIN2(16, inst->exec_size);
7416
7417
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
7418
/* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
7419
* message used to implement varying pull constant loads, so expand it
7420
* to SIMD16. An alternative with longer message payload length but
7421
* shorter return payload would be to use the SIMD8 sampler message that
7422
* takes (header, u, v, r) as parameters instead of (header, u).
7423
*/
7424
return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
7425
7426
case FS_OPCODE_DDX_COARSE:
7427
case FS_OPCODE_DDX_FINE:
7428
case FS_OPCODE_DDY_COARSE:
7429
case FS_OPCODE_DDY_FINE:
7430
/* The implementation of this virtual opcode may require emitting
7431
* compressed Align16 instructions, which are severely limited on some
7432
* generations.
7433
*
7434
* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
7435
* Region Restrictions):
7436
*
7437
* "In Align16 access mode, SIMD16 is not allowed for DW operations
7438
* and SIMD8 is not allowed for DF operations."
7439
*
7440
* In this context, "DW operations" means "operations acting on 32-bit
7441
* values", so it includes operations on floats.
7442
*
7443
* Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
7444
* (Instruction Compression -> Rules and Restrictions):
7445
*
7446
* "A compressed instruction must be in Align1 access mode. Align16
7447
* mode instructions cannot be compressed."
7448
*
7449
* Similar text exists in the g45 PRM.
7450
*
7451
* Empirically, compressed align16 instructions using odd register
7452
* numbers don't appear to work on Sandybridge either.
7453
*/
7454
return (devinfo->ver == 4 || devinfo->ver == 6 ||
7455
(devinfo->verx10 == 70) ?
7456
MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
7457
7458
case SHADER_OPCODE_MULH:
7459
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
7460
* is 8-wide on Gfx7+.
7461
*/
7462
return (devinfo->ver >= 7 ? 8 :
7463
get_fpu_lowered_simd_width(devinfo, inst));
7464
7465
case FS_OPCODE_FB_WRITE_LOGICAL:
7466
/* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
7467
* here.
7468
*/
7469
assert(devinfo->ver != 6 ||
7470
inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
7471
inst->exec_size == 8);
7472
/* Dual-source FB writes are unsupported in SIMD16 mode. */
7473
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
7474
8 : MIN2(16, inst->exec_size));
7475
7476
case FS_OPCODE_FB_READ_LOGICAL:
7477
return MIN2(16, inst->exec_size);
7478
7479
case SHADER_OPCODE_TEX_LOGICAL:
7480
case SHADER_OPCODE_TXF_CMS_LOGICAL:
7481
case SHADER_OPCODE_TXF_UMS_LOGICAL:
7482
case SHADER_OPCODE_TXF_MCS_LOGICAL:
7483
case SHADER_OPCODE_LOD_LOGICAL:
7484
case SHADER_OPCODE_TG4_LOGICAL:
7485
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
7486
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
7487
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
7488
return get_sampler_lowered_simd_width(devinfo, inst);
7489
7490
case SHADER_OPCODE_TXD_LOGICAL:
7491
/* TXD is unsupported in SIMD16 mode. */
7492
return 8;
7493
7494
case SHADER_OPCODE_TXL_LOGICAL:
7495
case FS_OPCODE_TXB_LOGICAL:
7496
/* Only one execution size is representable pre-ILK depending on whether
7497
* the shadow reference argument is present.
7498
*/
7499
if (devinfo->ver == 4)
7500
return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
7501
else
7502
return get_sampler_lowered_simd_width(devinfo, inst);
7503
7504
case SHADER_OPCODE_TXF_LOGICAL:
7505
case SHADER_OPCODE_TXS_LOGICAL:
7506
/* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
7507
* messages. Use SIMD16 instead.
7508
*/
7509
if (devinfo->ver == 4)
7510
return 16;
7511
else
7512
return get_sampler_lowered_simd_width(devinfo, inst);
7513
7514
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
7515
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
7516
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
7517
return 8;
7518
7519
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
7520
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
7521
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
7522
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
7523
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
7524
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
7525
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
7526
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
7527
return MIN2(16, inst->exec_size);
7528
7529
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
7530
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
7531
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
7532
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
7533
return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
7534
7535
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
7536
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
7537
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
7538
assert(inst->exec_size <= 16);
7539
return inst->exec_size;
7540
7541
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
7542
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
7543
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
7544
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
7545
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
7546
return 8;
7547
7548
case SHADER_OPCODE_URB_READ_SIMD8:
7549
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
7550
case SHADER_OPCODE_URB_WRITE_SIMD8:
7551
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
7552
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
7553
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
7554
return MIN2(8, inst->exec_size);
7555
7556
case SHADER_OPCODE_QUAD_SWIZZLE: {
7557
const unsigned swiz = inst->src[1].ud;
7558
return (is_uniform(inst->src[0]) ?
7559
get_fpu_lowered_simd_width(devinfo, inst) :
7560
devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
7561
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
7562
get_fpu_lowered_simd_width(devinfo, inst));
7563
}
7564
case SHADER_OPCODE_MOV_INDIRECT: {
7565
/* From IVB and HSW PRMs:
7566
*
7567
* "2.When the destination requires two registers and the sources are
7568
* indirect, the sources must use 1x1 regioning mode.
7569
*
7570
* In case of DF instructions in HSW/IVB, the exec_size is limited by
7571
* the EU decompression logic not handling VxH indirect addressing
7572
* correctly.
7573
*/
7574
const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
7575
/* Prior to Broadwell, we only have 8 address subregisters. */
7576
return MIN3(devinfo->ver >= 8 ? 16 : 8,
7577
max_size / (inst->dst.stride * type_sz(inst->dst.type)),
7578
inst->exec_size);
7579
}
7580
7581
case SHADER_OPCODE_LOAD_PAYLOAD: {
7582
const unsigned reg_count =
7583
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
7584
7585
if (reg_count > 2) {
7586
/* Only LOAD_PAYLOAD instructions with per-channel destination region
7587
* can be easily lowered (which excludes headers and heterogeneous
7588
* types).
7589
*/
7590
assert(!inst->header_size);
7591
for (unsigned i = 0; i < inst->sources; i++)
7592
assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
7593
inst->src[i].file == BAD_FILE);
7594
7595
return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
7596
} else {
7597
return inst->exec_size;
7598
}
7599
}
7600
default:
7601
return inst->exec_size;
7602
}
7603
}
7604
7605
/**
7606
* Return true if splitting out the group of channels of instruction \p inst
7607
* given by lbld.group() requires allocating a temporary for the i-th source
7608
* of the lowered instruction.
7609
*/
7610
static inline bool
7611
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
7612
{
7613
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
7614
(inst->components_read(i) == 1 &&
7615
lbld.dispatch_width() <= inst->exec_size)) ||
7616
(inst->flags_written(lbld.shader->devinfo) &
7617
flag_mask(inst->src[i], type_sz(inst->src[i].type)));
7618
}
7619
7620
/**
7621
* Extract the data that would be consumed by the channel group given by
7622
* lbld.group() from the i-th source region of instruction \p inst and return
7623
* it as result in packed form.
7624
*/
7625
static fs_reg
7626
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
7627
{
7628
assert(lbld.group() >= inst->group);
7629
7630
/* Specified channel group from the source region. */
7631
const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
7632
7633
if (needs_src_copy(lbld, inst, i)) {
7634
/* Builder of the right width to perform the copy avoiding uninitialized
7635
* data if the lowered execution size is greater than the original
7636
* execution size of the instruction.
7637
*/
7638
const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
7639
inst->exec_size), 0);
7640
const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
7641
7642
for (unsigned k = 0; k < inst->components_read(i); ++k)
7643
cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
7644
7645
return tmp;
7646
7647
} else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
7648
/* The source is invariant for all dispatch_width-wide groups of the
7649
* original region.
7650
*/
7651
return inst->src[i];
7652
7653
} else {
7654
/* We can just point the lowered instruction at the right channel group
7655
* from the original region.
7656
*/
7657
return src;
7658
}
7659
}
7660
7661
/**
7662
* Return true if splitting out the group of channels of instruction \p inst
7663
* given by lbld.group() requires allocating a temporary for the destination
7664
* of the lowered instruction and copying the data back to the original
7665
* destination region.
7666
*/
7667
static inline bool
7668
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
7669
{
7670
/* If the instruction writes more than one component we'll have to shuffle
7671
* the results of multiple lowered instructions in order to make sure that
7672
* they end up arranged correctly in the original destination region.
7673
*/
7674
if (inst->size_written > inst->dst.component_size(inst->exec_size))
7675
return true;
7676
7677
/* If the lowered execution size is larger than the original the result of
7678
* the instruction won't fit in the original destination, so we'll have to
7679
* allocate a temporary in any case.
7680
*/
7681
if (lbld.dispatch_width() > inst->exec_size)
7682
return true;
7683
7684
for (unsigned i = 0; i < inst->sources; i++) {
7685
/* If we already made a copy of the source for other reasons there won't
7686
* be any overlap with the destination.
7687
*/
7688
if (needs_src_copy(lbld, inst, i))
7689
continue;
7690
7691
/* In order to keep the logic simple we emit a copy whenever the
7692
* destination region doesn't exactly match an overlapping source, which
7693
* may point at the source and destination not being aligned group by
7694
* group which could cause one of the lowered instructions to overwrite
7695
* the data read from the same source by other lowered instructions.
7696
*/
7697
if (regions_overlap(inst->dst, inst->size_written,
7698
inst->src[i], inst->size_read(i)) &&
7699
!inst->dst.equals(inst->src[i]))
7700
return true;
7701
}
7702
7703
return false;
7704
}
7705
7706
/**
7707
* Insert data from a packed temporary into the channel group given by
7708
* lbld.group() of the destination region of instruction \p inst and return
7709
* the temporary as result. Any copy instructions that are required for
7710
* unzipping the previous value (in the case of partial writes) will be
7711
* inserted using \p lbld_before and any copy instructions required for
7712
* zipping up the destination of \p inst will be inserted using \p lbld_after.
7713
*/
7714
static fs_reg
7715
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
7716
fs_inst *inst)
7717
{
7718
assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
7719
assert(lbld_before.group() == lbld_after.group());
7720
assert(lbld_after.group() >= inst->group);
7721
7722
/* Specified channel group from the destination region. */
7723
const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
7724
const unsigned dst_size = inst->size_written /
7725
inst->dst.component_size(inst->exec_size);
7726
7727
if (needs_dst_copy(lbld_after, inst)) {
7728
const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
7729
7730
if (inst->predicate) {
7731
/* Handle predication by copying the original contents of
7732
* the destination into the temporary before emitting the
7733
* lowered instruction.
7734
*/
7735
const fs_builder gbld_before =
7736
lbld_before.group(MIN2(lbld_before.dispatch_width(),
7737
inst->exec_size), 0);
7738
for (unsigned k = 0; k < dst_size; ++k) {
7739
gbld_before.MOV(offset(tmp, lbld_before, k),
7740
offset(dst, inst->exec_size, k));
7741
}
7742
}
7743
7744
const fs_builder gbld_after =
7745
lbld_after.group(MIN2(lbld_after.dispatch_width(),
7746
inst->exec_size), 0);
7747
for (unsigned k = 0; k < dst_size; ++k) {
7748
/* Use a builder of the right width to perform the copy avoiding
7749
* uninitialized data if the lowered execution size is greater than
7750
* the original execution size of the instruction.
7751
*/
7752
gbld_after.MOV(offset(dst, inst->exec_size, k),
7753
offset(tmp, lbld_after, k));
7754
}
7755
7756
return tmp;
7757
7758
} else {
7759
/* No need to allocate a temporary for the lowered instruction, just
7760
* take the right group of channels from the original region.
7761
*/
7762
return dst;
7763
}
7764
}
7765
7766
bool
7767
fs_visitor::lower_simd_width()
7768
{
7769
bool progress = false;
7770
7771
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7772
const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
7773
7774
if (lower_width != inst->exec_size) {
7775
/* Builder matching the original instruction. We may also need to
7776
* emit an instruction of width larger than the original, set the
7777
* execution size of the builder to the highest of both for now so
7778
* we're sure that both cases can be handled.
7779
*/
7780
const unsigned max_width = MAX2(inst->exec_size, lower_width);
7781
const fs_builder ibld = bld.at(block, inst)
7782
.exec_all(inst->force_writemask_all)
7783
.group(max_width, inst->group / max_width);
7784
7785
/* Split the copies in chunks of the execution width of either the
7786
* original or the lowered instruction, whichever is lower.
7787
*/
7788
const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
7789
const unsigned dst_size = inst->size_written /
7790
inst->dst.component_size(inst->exec_size);
7791
7792
assert(!inst->writes_accumulator && !inst->mlen);
7793
7794
/* Inserting the zip, unzip, and duplicated instructions in all of
7795
* the right spots is somewhat tricky. All of the unzip and any
7796
* instructions from the zip which unzip the destination prior to
7797
* writing need to happen before all of the per-group instructions
7798
* and the zip instructions need to happen after. In order to sort
7799
* this all out, we insert the unzip instructions before \p inst,
7800
* insert the per-group instructions after \p inst (i.e. before
7801
* inst->next), and insert the zip instructions before the
7802
* instruction after \p inst. Since we are inserting instructions
7803
* after \p inst, inst->next is a moving target and we need to save
7804
* it off here so that we insert the zip instructions in the right
7805
* place.
7806
*
7807
* Since we're inserting split instructions after after_inst, the
7808
* instructions will end up in the reverse order that we insert them.
7809
* However, certain render target writes require that the low group
7810
* instructions come before the high group. From the Ivy Bridge PRM
7811
* Vol. 4, Pt. 1, Section 3.9.11:
7812
*
7813
* "If multiple SIMD8 Dual Source messages are delivered by the
7814
* pixel shader thread, each SIMD8_DUALSRC_LO message must be
7815
* issued before the SIMD8_DUALSRC_HI message with the same Slot
7816
* Group Select setting."
7817
*
7818
* And, from Section 3.9.11.1 of the same PRM:
7819
*
7820
* "When SIMD32 or SIMD16 PS threads send render target writes
7821
* with multiple SIMD8 and SIMD16 messages, the following must
7822
* hold:
7823
*
7824
* All the slots (as described above) must have a corresponding
7825
* render target write irrespective of the slot's validity. A slot
7826
* is considered valid when at least one sample is enabled. For
7827
* example, a SIMD16 PS thread must send two SIMD8 render target
7828
* writes to cover all the slots.
7829
*
7830
* PS thread must send SIMD render target write messages with
7831
* increasing slot numbers. For example, SIMD16 thread has
7832
* Slot[15:0] and if two SIMD8 render target writes are used, the
7833
* first SIMD8 render target write must send Slot[7:0] and the
7834
* next one must send Slot[15:8]."
7835
*
7836
* In order to make low group instructions come before high group
7837
* instructions (this is required for some render target writes), we
7838
* split from the highest group to lowest.
7839
*/
7840
exec_node *const after_inst = inst->next;
7841
for (int i = n - 1; i >= 0; i--) {
7842
/* Emit a copy of the original instruction with the lowered width.
7843
* If the EOT flag was set throw it away except for the last
7844
* instruction to avoid killing the thread prematurely.
7845
*/
7846
fs_inst split_inst = *inst;
7847
split_inst.exec_size = lower_width;
7848
split_inst.eot = inst->eot && i == int(n - 1);
7849
7850
/* Select the correct channel enables for the i-th group, then
7851
* transform the sources and destination and emit the lowered
7852
* instruction.
7853
*/
7854
const fs_builder lbld = ibld.group(lower_width, i);
7855
7856
for (unsigned j = 0; j < inst->sources; j++)
7857
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
7858
7859
split_inst.dst = emit_zip(lbld.at(block, inst),
7860
lbld.at(block, after_inst), inst);
7861
split_inst.size_written =
7862
split_inst.dst.component_size(lower_width) * dst_size;
7863
7864
lbld.at(block, inst->next).emit(split_inst);
7865
}
7866
7867
inst->remove(block);
7868
progress = true;
7869
}
7870
}
7871
7872
if (progress)
7873
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7874
7875
return progress;
7876
}
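/* Illustrative aside (not part of the Mesa build): a hedged standalone
 * sketch of the unzip/zip idea used by lower_simd_width() above, on plain
 * arrays.  A 16-wide operation is split into two 8-wide groups; each group
 * reads the matching channel range of the sources (unzip) and writes a
 * packed temporary that is copied back to the right channels of the
 * destination (zip).  Hardware regioning and the low-before-high ordering
 * constraints are ignored; the function name is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
#include <stddef.h>

static void
lowered_add_simd16(const float src0[16], const float src1[16],
                   float dst[16])
{
   const unsigned lower_width = 8;
   for (unsigned group = 0; group < 16 / lower_width; group++) {
      float tmp[8];                                  /* packed temporary */
      const size_t base = group * lower_width;       /* unzip offset     */
      for (unsigned c = 0; c < lower_width; c++)
         tmp[c] = src0[base + c] + src1[base + c];   /* per-group op     */
      for (unsigned c = 0; c < lower_width; c++)
         dst[base + c] = tmp[c];                     /* zip back         */
   }
}
#endif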
7877
7878
/**
7879
* Transform barycentric vectors into the interleaved form expected by the PLN
7880
* instruction and returned by the Gfx7+ PI shared function.
7881
*
7882
* For channels 0-15 in SIMD16 mode they are expected to be laid out as
7883
* follows in the register file:
7884
*
7885
* rN+0: X[0-7]
7886
* rN+1: Y[0-7]
7887
* rN+2: X[8-15]
7888
* rN+3: Y[8-15]
7889
*
7890
* There is no need to handle SIMD32 here -- This is expected to be run after
7891
* SIMD lowering, since SIMD lowering relies on vectors having the standard
7892
* component layout.
7893
*/
7894
bool
7895
fs_visitor::lower_barycentrics()
7896
{
7897
const bool has_interleaved_layout = devinfo->has_pln || devinfo->ver >= 7;
7898
bool progress = false;
7899
7900
if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
7901
return false;
7902
7903
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7904
if (inst->exec_size < 16)
7905
continue;
7906
7907
const fs_builder ibld(this, block, inst);
7908
const fs_builder ubld = ibld.exec_all().group(8, 0);
7909
7910
switch (inst->opcode) {
7911
case FS_OPCODE_LINTERP : {
7912
assert(inst->exec_size == 16);
7913
const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
7914
fs_reg srcs[4];
7915
7916
for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
7917
srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
7918
8 * (i / 2));
7919
7920
ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
7921
7922
inst->src[0] = tmp;
7923
progress = true;
7924
break;
7925
}
7926
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7927
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7928
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
7929
assert(inst->exec_size == 16);
7930
const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
7931
7932
for (unsigned i = 0; i < 2; i++) {
7933
for (unsigned g = 0; g < inst->exec_size / 8; g++) {
7934
fs_inst *mov = ibld.at(block, inst->next).group(8, g)
7935
.MOV(horiz_offset(offset(inst->dst, ibld, i),
7936
8 * g),
7937
offset(tmp, ubld, 2 * g + i));
7938
mov->predicate = inst->predicate;
7939
mov->predicate_inverse = inst->predicate_inverse;
7940
mov->flag_subreg = inst->flag_subreg;
7941
}
7942
}
7943
7944
inst->dst = tmp;
7945
progress = true;
7946
break;
7947
}
7948
default:
7949
break;
7950
}
7951
}
7952
7953
if (progress)
7954
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7955
7956
return progress;
7957
}
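/* Illustrative aside (not part of the Mesa build): a standalone sketch of
 * the interleaved PLN layout described in the comment above.  SIMD16 X and
 * Y barycentric components are rearranged from the standard per-component
 * layout into X[0-7], Y[0-7], X[8-15], Y[8-15], one 8-channel register per
 * slot.  The helper below is hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static void
interleave_barycentrics(const float x[16], const float y[16],
                        float out[4][8])
{
   for (unsigned g = 0; g < 2; g++) {
      for (unsigned c = 0; c < 8; c++) {
         out[2 * g + 0][c] = x[8 * g + c];   /* rN + 2g    : X group g */
         out[2 * g + 1][c] = y[8 * g + c];   /* rN + 2g + 1: Y group g */
      }
   }
}
#endif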
7958
7959
/**
7960
* Lower a derivative instruction as the floating-point difference of two
7961
* swizzles of the source, specified as \p swz0 and \p swz1.
7962
*/
7963
static bool
7964
lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
7965
unsigned swz0, unsigned swz1)
7966
{
7967
const fs_builder ibld(v, block, inst);
7968
const fs_reg tmp0 = ibld.vgrf(inst->src[0].type);
7969
const fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
7970
7971
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
7972
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
7973
7974
inst->resize_sources(2);
7975
inst->src[0] = negate(tmp0);
7976
inst->src[1] = tmp1;
7977
inst->opcode = BRW_OPCODE_ADD;
7978
7979
return true;
7980
}
7981
7982
/**
7983
* Lower derivative instructions on platforms where codegen cannot implement
7984
* them efficiently (i.e. XeHP).
7985
*/
7986
bool
7987
fs_visitor::lower_derivatives()
7988
{
7989
bool progress = false;
7990
7991
if (devinfo->verx10 < 125)
7992
return false;
7993
7994
foreach_block_and_inst(block, fs_inst, inst, cfg) {
7995
if (inst->opcode == FS_OPCODE_DDX_COARSE)
7996
progress |= lower_derivative(this, block, inst,
7997
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
7998
7999
else if (inst->opcode == FS_OPCODE_DDX_FINE)
8000
progress |= lower_derivative(this, block, inst,
8001
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
8002
8003
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
8004
progress |= lower_derivative(this, block, inst,
8005
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
8006
8007
else if (inst->opcode == FS_OPCODE_DDY_FINE)
8008
progress |= lower_derivative(this, block, inst,
8009
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
8010
}
8011
8012
if (progress)
8013
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8014
8015
return progress;
8016
}
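/* Illustrative aside (not part of the Mesa build): a standalone sketch of
 * the swizzle-difference lowering above for a single 2x2 subspan, assuming
 * the usual channel layout (0 = top-left, 1 = top-right, 2 = bottom-left,
 * 3 = bottom-right).  Each derivative is the difference of two quad
 * swizzles of the source, exactly what the ADD(-swz0, swz1) rewrite
 * computes.  The helper names are hypothetical.
 */
#if 0  /* illustrative sketch only -- never compiled */
static void
ddx_fine_quad(const float src[4], float dst[4])
{
   /* swz1 = YYWW, swz0 = XXZZ  ->  dst = src.yyww - src.xxzz */
   dst[0] = dst[1] = src[1] - src[0];
   dst[2] = dst[3] = src[3] - src[2];
}

static void
ddy_coarse_quad(const float src[4], float dst[4])
{
   /* swz1 = ZZZZ, swz0 = XXXX  ->  dst = src.zzzz - src.xxxx */
   dst[0] = dst[1] = dst[2] = dst[3] = src[2] - src[0];
}
#endif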
8017
8018
void
8019
fs_visitor::dump_instructions() const
8020
{
8021
dump_instructions(NULL);
8022
}
8023
8024
void
8025
fs_visitor::dump_instructions(const char *name) const
8026
{
8027
FILE *file = stderr;
8028
if (name && geteuid() != 0) {
8029
file = fopen(name, "w");
8030
if (!file)
8031
file = stderr;
8032
}
8033
8034
if (cfg) {
8035
const register_pressure &rp = regpressure_analysis.require();
8036
unsigned ip = 0, max_pressure = 0;
8037
foreach_block_and_inst(block, backend_instruction, inst, cfg) {
8038
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
8039
fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
8040
dump_instruction(inst, file);
8041
ip++;
8042
}
8043
fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
8044
} else {
8045
int ip = 0;
8046
foreach_in_list(backend_instruction, inst, &instructions) {
8047
fprintf(file, "%4d: ", ip++);
8048
dump_instruction(inst, file);
8049
}
8050
}
8051
8052
if (file != stderr) {
8053
fclose(file);
8054
}
8055
}
8056
8057
void
8058
fs_visitor::dump_instruction(const backend_instruction *be_inst) const
8059
{
8060
dump_instruction(be_inst, stderr);
8061
}
8062
8063
void
8064
fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const
8065
{
8066
const fs_inst *inst = (const fs_inst *)be_inst;
8067
8068
if (inst->predicate) {
8069
fprintf(file, "(%cf%d.%d) ",
8070
inst->predicate_inverse ? '-' : '+',
8071
inst->flag_subreg / 2,
8072
inst->flag_subreg % 2);
8073
}
8074
8075
fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
8076
if (inst->saturate)
8077
fprintf(file, ".sat");
8078
if (inst->conditional_mod) {
8079
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
8080
if (!inst->predicate &&
8081
(devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
8082
inst->opcode != BRW_OPCODE_CSEL &&
8083
inst->opcode != BRW_OPCODE_IF &&
8084
inst->opcode != BRW_OPCODE_WHILE))) {
8085
fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
8086
inst->flag_subreg % 2);
8087
}
8088
}
8089
fprintf(file, "(%d) ", inst->exec_size);
8090
8091
if (inst->mlen) {
8092
fprintf(file, "(mlen: %d) ", inst->mlen);
8093
}
8094
8095
if (inst->ex_mlen) {
8096
fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
8097
}
8098
8099
if (inst->eot) {
8100
fprintf(file, "(EOT) ");
8101
}
8102
8103
switch (inst->dst.file) {
8104
case VGRF:
8105
fprintf(file, "vgrf%d", inst->dst.nr);
8106
break;
8107
case FIXED_GRF:
8108
fprintf(file, "g%d", inst->dst.nr);
8109
break;
8110
case MRF:
8111
fprintf(file, "m%d", inst->dst.nr);
8112
break;
8113
case BAD_FILE:
8114
fprintf(file, "(null)");
8115
break;
8116
case UNIFORM:
8117
fprintf(file, "***u%d***", inst->dst.nr);
8118
break;
8119
case ATTR:
8120
fprintf(file, "***attr%d***", inst->dst.nr);
8121
break;
8122
case ARF:
8123
switch (inst->dst.nr) {
8124
case BRW_ARF_NULL:
8125
fprintf(file, "null");
8126
break;
8127
case BRW_ARF_ADDRESS:
8128
fprintf(file, "a0.%d", inst->dst.subnr);
8129
break;
8130
case BRW_ARF_ACCUMULATOR:
8131
fprintf(file, "acc%d", inst->dst.subnr);
8132
break;
8133
case BRW_ARF_FLAG:
8134
fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8135
break;
8136
default:
8137
fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8138
break;
8139
}
8140
break;
8141
case IMM:
8142
unreachable("not reached");
8143
}
8144
8145
if (inst->dst.offset ||
8146
(inst->dst.file == VGRF &&
8147
alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
8148
const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
8149
fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
8150
inst->dst.offset % reg_size);
8151
}
8152
8153
if (inst->dst.stride != 1)
8154
fprintf(file, "<%u>", inst->dst.stride);
8155
fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
8156
8157
for (int i = 0; i < inst->sources; i++) {
8158
if (inst->src[i].negate)
8159
fprintf(file, "-");
8160
if (inst->src[i].abs)
8161
fprintf(file, "|");
8162
switch (inst->src[i].file) {
8163
case VGRF:
8164
fprintf(file, "vgrf%d", inst->src[i].nr);
8165
break;
8166
case FIXED_GRF:
8167
fprintf(file, "g%d", inst->src[i].nr);
8168
break;
8169
case MRF:
8170
fprintf(file, "***m%d***", inst->src[i].nr);
8171
break;
8172
case ATTR:
8173
fprintf(file, "attr%d", inst->src[i].nr);
8174
break;
8175
case UNIFORM:
8176
fprintf(file, "u%d", inst->src[i].nr);
8177
break;
8178
case BAD_FILE:
8179
fprintf(file, "(null)");
8180
break;
8181
case IMM:
8182
switch (inst->src[i].type) {
8183
case BRW_REGISTER_TYPE_HF:
8184
fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
8185
break;
8186
case BRW_REGISTER_TYPE_F:
8187
fprintf(file, "%-gf", inst->src[i].f);
8188
break;
8189
case BRW_REGISTER_TYPE_DF:
8190
fprintf(file, "%fdf", inst->src[i].df);
8191
break;
8192
case BRW_REGISTER_TYPE_W:
8193
case BRW_REGISTER_TYPE_D:
8194
fprintf(file, "%dd", inst->src[i].d);
8195
break;
8196
case BRW_REGISTER_TYPE_UW:
8197
case BRW_REGISTER_TYPE_UD:
8198
fprintf(file, "%uu", inst->src[i].ud);
8199
break;
8200
case BRW_REGISTER_TYPE_Q:
8201
fprintf(file, "%" PRId64 "q", inst->src[i].d64);
8202
break;
8203
case BRW_REGISTER_TYPE_UQ:
8204
fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
8205
break;
8206
case BRW_REGISTER_TYPE_VF:
8207
fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
8208
brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
8209
brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
8210
brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
8211
brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
8212
break;
8213
case BRW_REGISTER_TYPE_V:
8214
case BRW_REGISTER_TYPE_UV:
8215
fprintf(file, "%08x%s", inst->src[i].ud,
8216
inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
8217
break;
8218
default:
8219
fprintf(file, "???");
8220
break;
8221
}
8222
break;
8223
case ARF:
8224
switch (inst->src[i].nr) {
8225
case BRW_ARF_NULL:
8226
fprintf(file, "null");
8227
break;
8228
case BRW_ARF_ADDRESS:
8229
fprintf(file, "a0.%d", inst->src[i].subnr);
8230
break;
8231
case BRW_ARF_ACCUMULATOR:
8232
fprintf(file, "acc%d", inst->src[i].subnr);
8233
break;
8234
case BRW_ARF_FLAG:
8235
fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8236
break;
8237
default:
8238
fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8239
break;
8240
}
8241
break;
8242
}
8243
8244
if (inst->src[i].offset ||
8245
(inst->src[i].file == VGRF &&
8246
alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
8247
const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
8248
fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
8249
inst->src[i].offset % reg_size);
8250
}
8251
8252
if (inst->src[i].abs)
8253
fprintf(file, "|");
8254
8255
if (inst->src[i].file != IMM) {
8256
unsigned stride;
8257
if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
8258
unsigned hstride = inst->src[i].hstride;
8259
stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
8260
} else {
8261
stride = inst->src[i].stride;
8262
}
8263
if (stride != 1)
8264
fprintf(file, "<%u>", stride);
8265
8266
fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
8267
}
8268
8269
if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
8270
fprintf(file, ", ");
8271
}
8272
8273
fprintf(file, " ");
8274
8275
if (inst->force_writemask_all)
8276
fprintf(file, "NoMask ");
8277
8278
if (inst->exec_size != dispatch_width)
8279
fprintf(file, "group%d ", inst->group);
8280
8281
fprintf(file, "\n");
8282
}
8283
8284
void
8285
fs_visitor::setup_fs_payload_gfx6()
8286
{
8287
assert(stage == MESA_SHADER_FRAGMENT);
8288
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
8289
const unsigned payload_width = MIN2(16, dispatch_width);
8290
assert(dispatch_width % payload_width == 0);
8291
assert(devinfo->ver >= 6);
8292
8293
/* R0: PS thread payload header. */
8294
payload.num_regs++;
8295
8296
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8297
/* R1: masks, pixel X/Y coordinates. */
8298
payload.subspan_coord_reg[j] = payload.num_regs++;
8299
}
8300
8301
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8302
/* R3-26: barycentric interpolation coordinates. These appear in the
8303
* same order that they appear in the brw_barycentric_mode enum. Each
8304
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
8305
* registers if dispatch width == 16. Coordinates only appear if they
8306
* were enabled using the "Barycentric Interpolation Mode" bits in
8307
* WM_STATE.
8308
*/
8309
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
8310
if (prog_data->barycentric_interp_modes & (1 << i)) {
8311
payload.barycentric_coord_reg[i][j] = payload.num_regs;
8312
payload.num_regs += payload_width / 4;
8313
}
8314
}
8315
8316
/* R27-28: interpolated depth if uses source depth */
8317
if (prog_data->uses_src_depth) {
8318
payload.source_depth_reg[j] = payload.num_regs;
8319
payload.num_regs += payload_width / 8;
8320
}
8321
8322
/* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
8323
if (prog_data->uses_src_w) {
8324
payload.source_w_reg[j] = payload.num_regs;
8325
payload.num_regs += payload_width / 8;
8326
}
8327
8328
/* R31: MSAA position offsets. */
8329
if (prog_data->uses_pos_offset) {
8330
payload.sample_pos_reg[j] = payload.num_regs;
8331
payload.num_regs++;
8332
}
8333
8334
/* R32-33: MSAA input coverage mask */
8335
if (prog_data->uses_sample_mask) {
8336
assert(devinfo->ver >= 7);
8337
payload.sample_mask_in_reg[j] = payload.num_regs;
8338
payload.num_regs += payload_width / 8;
8339
}
8340
8341
/* R66: Source Depth and/or W Attribute Vertex Deltas */
8342
if (prog_data->uses_depth_w_coefficients) {
8343
payload.depth_w_coef_reg[j] = payload.num_regs;
8344
payload.num_regs++;
8345
}
8346
}
8347
8348
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
8349
source_depth_to_render_target = true;
8350
}
8351
}
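/* Illustrative example (not part of the original file): for a SIMD16
* fragment shader that only needs one barycentric mode (say perspective
* pixel) plus source depth, the loop above lays the payload out as
*
*    g0      PS thread payload header
*    g1      masks, pixel X/Y coordinates
*    g2-g5   barycentric coordinates   (payload_width / 4 == 4 GRFs)
*    g6-g7   interpolated source depth (payload_width / 8 == 2 GRFs)
*
* leaving payload.num_regs == 8.
*/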
8352
8353
void
8354
fs_visitor::setup_vs_payload()
8355
{
8356
/* R0: thread header, R1: urb handles */
8357
payload.num_regs = 2;
8358
}
8359
8360
void
8361
fs_visitor::setup_gs_payload()
8362
{
8363
assert(stage == MESA_SHADER_GEOMETRY);
8364
8365
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
8366
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
8367
8368
/* R0: thread header, R1: output URB handles */
8369
payload.num_regs = 2;
8370
8371
if (gs_prog_data->include_primitive_id) {
8372
/* R2: Primitive ID 0..7 */
8373
payload.num_regs++;
8374
}
8375
8376
/* Always enable VUE handles so we can safely use pull model if needed.
8377
*
8378
* The push model for a GS uses a ton of register space even for trivial
8379
* scenarios with just a few inputs, so just make things easier and a bit
8380
* safer by always having pull model available.
8381
*/
8382
gs_prog_data->base.include_vue_handles = true;
8383
8384
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
8385
payload.num_regs += nir->info.gs.vertices_in;
8386
8387
/* Use a maximum of 24 registers for push-model inputs. */
8388
const unsigned max_push_components = 24;
8389
8390
/* If pushing our inputs would take too many registers, reduce the URB read
8391
* length (which is in HWords, or 8 registers), and resort to pulling.
8392
*
8393
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
8394
* have to multiply by VerticesIn to obtain the total storage requirement.
8395
*/
8396
if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
8397
max_push_components) {
8398
vue_prog_data->urb_read_length =
8399
ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
8400
}
8401
}
8402
8403
void
8404
fs_visitor::setup_cs_payload()
8405
{
8406
assert(devinfo->ver >= 7);
8407
/* TODO: Fill out uses_btd_stack_ids automatically */
8408
payload.num_regs = 1 + brw_cs_prog_data(prog_data)->uses_btd_stack_ids;
8409
}
8410
8411
brw::register_pressure::register_pressure(const fs_visitor *v)
8412
{
8413
const fs_live_variables &live = v->live_analysis.require();
8414
const unsigned num_instructions = v->cfg->num_blocks ?
8415
v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
8416
8417
regs_live_at_ip = new unsigned[num_instructions]();
8418
8419
for (unsigned reg = 0; reg < v->alloc.count; reg++) {
8420
for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
8421
regs_live_at_ip[ip] += v->alloc.sizes[reg];
8422
}
8423
}
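/* Illustrative example (not part of the original file): a VGRF allocated as
* two GRFs whose live range spans ip 3 through ip 7 adds 2 to
* regs_live_at_ip[3..7], so each entry ends up holding the total number of
* GRFs live at that instruction.
*/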
8424
8425
brw::register_pressure::~register_pressure()
8426
{
8427
delete[] regs_live_at_ip;
8428
}
8429
8430
void
8431
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
8432
{
8433
backend_shader::invalidate_analysis(c);
8434
live_analysis.invalidate(c);
8435
regpressure_analysis.invalidate(c);
8436
}
8437
8438
void
8439
fs_visitor::optimize()
8440
{
8441
/* Start by validating the shader we currently have. */
8442
validate();
8443
8444
/* bld is the common builder object pointing at the end of the program we
8445
* used to translate it into i965 IR. For the optimization and lowering
8446
* passes coming next, any code added after the end of the program without
8447
* having explicitly called fs_builder::at() clearly points at a mistake.
8448
* Ideally optimization passes wouldn't be part of the visitor so they
8449
* wouldn't have access to bld at all, but they do, so just in case some
8450
* pass forgets to ask for a location explicitly set it to NULL here to
8451
* make it trip. The dispatch width is initialized to a bogus value to
8452
* make sure that optimizations set the execution controls explicitly to
8453
* match the code they are manipulating instead of relying on the defaults.
8454
*/
8455
bld = fs_builder(this, 64);
8456
8457
assign_constant_locations();
8458
lower_constant_loads();
8459
8460
validate();
8461
8462
split_virtual_grfs();
8463
validate();
8464
8465
#define OPT(pass, args...) ({ \
8466
pass_num++; \
8467
bool this_progress = pass(args); \
8468
\
8469
if ((INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
8470
char filename[64]; \
8471
snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
8472
stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
8473
\
8474
backend_shader::dump_instructions(filename); \
8475
} \
8476
\
8477
validate(); \
8478
\
8479
progress = progress || this_progress; \
8480
this_progress; \
8481
})
8482
8483
if (INTEL_DEBUG & DEBUG_OPTIMIZER) {
8484
char filename[64];
8485
snprintf(filename, 64, "%s%d-%s-00-00-start",
8486
stage_abbrev, dispatch_width, nir->info.name);
8487
8488
backend_shader::dump_instructions(filename);
8489
}
8490
8491
bool progress = false;
8492
int iteration = 0;
8493
int pass_num = 0;
8494
8495
/* Before anything else, eliminate dead code. The results of some NIR
8496
* instructions may effectively be calculated twice. Once when the
8497
* instruction is encountered, and again when the user of that result is
8498
* encountered. Wipe those away before algebraic optimizations and
8499
* especially copy propagation can mix things up.
8500
*/
8501
OPT(dead_code_eliminate);
8502
8503
OPT(remove_extra_rounding_modes);
8504
8505
do {
8506
progress = false;
8507
pass_num = 0;
8508
iteration++;
8509
8510
OPT(remove_duplicate_mrf_writes);
8511
8512
OPT(opt_algebraic);
8513
OPT(opt_cse);
8514
OPT(opt_copy_propagation);
8515
OPT(opt_predicated_break, this);
8516
OPT(opt_cmod_propagation);
8517
OPT(dead_code_eliminate);
8518
OPT(opt_peephole_sel);
8519
OPT(dead_control_flow_eliminate, this);
8520
OPT(opt_register_renaming);
8521
OPT(opt_saturate_propagation);
8522
OPT(register_coalesce);
8523
OPT(compute_to_mrf);
8524
OPT(eliminate_find_live_channel);
8525
8526
OPT(compact_virtual_grfs);
8527
} while (progress);
8528
8529
progress = false;
8530
pass_num = 0;
8531
8532
if (OPT(lower_pack)) {
8533
OPT(register_coalesce);
8534
OPT(dead_code_eliminate);
8535
}
8536
8537
OPT(lower_simd_width);
8538
OPT(lower_barycentrics);
8539
OPT(lower_logical_sends);
8540
8541
/* After logical SEND lowering. */
8542
OPT(fixup_nomask_control_flow);
8543
8544
if (progress) {
8545
OPT(opt_copy_propagation);
8546
/* Only run after logical send lowering because it's easier to implement
8547
* in terms of physical sends.
8548
*/
8549
if (OPT(opt_zero_samples))
8550
OPT(opt_copy_propagation);
8551
/* Run after logical send lowering to give it a chance to CSE the
8552
* LOAD_PAYLOAD instructions created to construct the payloads of
8553
* e.g. texturing messages in cases where it wasn't possible to CSE the
8554
* whole logical instruction.
8555
*/
8556
OPT(opt_cse);
8557
OPT(register_coalesce);
8558
OPT(compute_to_mrf);
8559
OPT(dead_code_eliminate);
8560
OPT(remove_duplicate_mrf_writes);
8561
OPT(opt_peephole_sel);
8562
}
8563
8564
OPT(opt_redundant_halt);
8565
8566
if (OPT(lower_load_payload)) {
8567
split_virtual_grfs();
8568
8569
/* Lower 64 bit MOVs generated by payload lowering. */
8570
if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)
8571
OPT(opt_algebraic);
8572
8573
OPT(register_coalesce);
8574
OPT(lower_simd_width);
8575
OPT(compute_to_mrf);
8576
OPT(dead_code_eliminate);
8577
}
8578
8579
OPT(opt_combine_constants);
8580
if (OPT(lower_integer_multiplication)) {
8581
/* If lower_integer_multiplication made progress, it may have produced
8582
* some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
8583
* one more time to clean those up if they exist.
8584
*/
8585
OPT(lower_integer_multiplication);
8586
}
8587
OPT(lower_sub_sat);
8588
8589
if (devinfo->ver <= 5 && OPT(lower_minmax)) {
8590
OPT(opt_cmod_propagation);
8591
OPT(opt_cse);
8592
OPT(opt_copy_propagation);
8593
OPT(dead_code_eliminate);
8594
}
8595
8596
progress = false;
8597
OPT(lower_derivatives);
8598
OPT(lower_regioning);
8599
if (progress) {
8600
OPT(opt_copy_propagation);
8601
OPT(dead_code_eliminate);
8602
OPT(lower_simd_width);
8603
}
8604
8605
OPT(fixup_sends_duplicate_payload);
8606
8607
lower_uniform_pull_constant_loads();
8608
8609
validate();
8610
}
8611
8612
/**
8613
* From the Skylake PRM Vol. 2a docs for sends:
8614
*
8615
* "It is required that the second block of GRFs does not overlap with the
8616
* first block."
8617
*
8618
* There are plenty of cases where we may accidentally violate this due to
8619
* having, for instance, both sources be the constant 0. This little pass
8620
* just adds a new vgrf for the second payload and copies it over.
8621
*/
8622
bool
8623
fs_visitor::fixup_sends_duplicate_payload()
8624
{
8625
bool progress = false;
8626
8627
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8628
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
8629
regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
8630
inst->src[3], inst->ex_mlen * REG_SIZE)) {
8631
fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
8632
BRW_REGISTER_TYPE_UD);
8633
/* Sadly, we've lost all notion of channels and bit sizes at this
8634
* point. Just WE_all it.
8635
*/
8636
const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
8637
fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
8638
fs_reg copy_dst = tmp;
8639
for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
8640
if (inst->ex_mlen == i + 1) {
8641
/* Only one register left; do SIMD8 */
8642
ibld.group(8, 0).MOV(copy_dst, copy_src);
8643
} else {
8644
ibld.MOV(copy_dst, copy_src);
8645
}
8646
copy_src = offset(copy_src, ibld, 1);
8647
copy_dst = offset(copy_dst, ibld, 1);
8648
}
8649
inst->src[3] = tmp;
8650
progress = true;
8651
}
8652
}
8653
8654
if (progress)
8655
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8656
8657
return progress;
8658
}
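/* Illustrative sketch of the fixup above: a SEND whose two payload blocks
* both start at the same VGRF (say mlen == ex_mlen == 2, both pointing at
* g10) gets a fresh ex_mlen-register temporary, the second payload is copied
* into it with NoMask SIMD16 UD MOVs (each MOV covering two GRFs), and
* src[3] is redirected to the copy so the two blocks no longer overlap.
*/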
8659
8660
/**
8661
* Three-source instructions must have a GRF/MRF destination register.
8662
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
8663
*/
8664
void
8665
fs_visitor::fixup_3src_null_dest()
8666
{
8667
bool progress = false;
8668
8669
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8670
if (inst->is_3src(devinfo) && inst->dst.is_null()) {
8671
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
8672
inst->dst.type);
8673
progress = true;
8674
}
8675
}
8676
8677
if (progress)
8678
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
8679
DEPENDENCY_VARIABLES);
8680
}
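/* Illustrative example: in a SIMD16 shader, a three-source instruction that
* ended up with a null destination gets a throwaway VGRF of
* dispatch_width / 8 == 2 GRFs here, satisfying the GRF destination
* requirement.
*/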
8681
8682
/**
8683
* Find the first instruction in the program that might start a region of
8684
* divergent control flow due to a HALT jump. There is no
8685
* find_halt_control_flow_region_end(); the region of divergence extends until
8686
* the only SHADER_OPCODE_HALT_TARGET in the program.
8687
*/
8688
static const fs_inst *
8689
find_halt_control_flow_region_start(const fs_visitor *v)
8690
{
8691
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
8692
if (inst->opcode == BRW_OPCODE_HALT ||
8693
inst->opcode == SHADER_OPCODE_HALT_TARGET)
8694
return inst;
8695
}
8696
8697
return NULL;
8698
}
8699
8700
/**
8701
* Work around the Gfx12 hardware bug filed as Wa_1407528679. EU fusion
8702
* can cause a BB to be executed with all channels disabled, which will lead
8703
* to the execution of any NoMask instructions in it, even though any
8704
* execution-masked instructions will be correctly shot down. This may break
8705
* assumptions of some NoMask SEND messages whose descriptor depends on data
8706
* generated by live invocations of the shader.
8707
*
8708
* This avoids the problem by predicating certain instructions on an ANY
8709
* horizontal predicate that makes sure that their execution is omitted when
8710
* all channels of the program are disabled.
8711
*/
8712
bool
8713
fs_visitor::fixup_nomask_control_flow()
8714
{
8715
if (devinfo->ver != 12)
8716
return false;
8717
8718
const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
8719
dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
8720
BRW_PREDICATE_ALIGN1_ANY8H;
8721
const fs_inst *halt_start = find_halt_control_flow_region_start(this);
8722
unsigned depth = 0;
8723
bool progress = false;
8724
8725
const fs_live_variables &live_vars = live_analysis.require();
8726
8727
/* Scan the program backwards in order to be able to easily determine
8728
* whether the flag register is live at any point.
8729
*/
8730
foreach_block_reverse_safe(block, cfg) {
8731
BITSET_WORD flag_liveout = live_vars.block_data[block->num]
8732
.flag_liveout[0];
8733
STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
8734
8735
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
8736
if (!inst->predicate && inst->exec_size >= 8)
8737
flag_liveout &= ~inst->flags_written(devinfo);
8738
8739
switch (inst->opcode) {
8740
case BRW_OPCODE_DO:
8741
case BRW_OPCODE_IF:
8742
/* Note that this doesn't handle BRW_OPCODE_HALT since only
8743
* the first one in the program closes the region of divergent
8744
* control flow due to any HALT instructions. Instead, this is
8745
* handled with the halt_start check below.
8746
*/
8747
depth--;
8748
break;
8749
8750
case BRW_OPCODE_WHILE:
8751
case BRW_OPCODE_ENDIF:
8752
case SHADER_OPCODE_HALT_TARGET:
8753
depth++;
8754
break;
8755
8756
default:
8757
/* Note that the vast majority of NoMask SEND instructions in the
8758
* program are harmless while executed in a block with all
8759
* channels disabled, since any instructions with side effects we
8760
* could hit here should be execution-masked.
8761
*
8762
* The main concern is NoMask SEND instructions where the message
8763
* descriptor or header depends on data generated by live
8764
* invocations of the shader (RESINFO and
8765
* FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
8766
* computed surface index seem to be the only examples right now
8767
* where this could easily lead to GPU hangs). Unfortunately we
8768
* have no straightforward way to detect that currently, so just
8769
* predicate any NoMask SEND instructions we find under control
8770
* flow.
8771
*
8772
* If this proves to have a measurable performance impact it can
8773
* be easily extended with a whitelist of messages we know we can
8774
* safely omit the predication for.
8775
*/
8776
if (depth && inst->force_writemask_all &&
8777
is_send(inst) && !inst->predicate) {
8778
/* We need to load the execution mask into the flag register by
8779
* using a builder with channel group matching the whole shader
8780
* (rather than the default which is derived from the original
8781
* instruction), in order to avoid getting a right-shifted
8782
* value.
8783
*/
8784
const fs_builder ubld = fs_builder(this, block, inst)
8785
.exec_all().group(dispatch_width, 0);
8786
const fs_reg flag = retype(brw_flag_reg(0, 0),
8787
BRW_REGISTER_TYPE_UD);
8788
8789
/* Due to the lack of flag register allocation we need to save
8790
* and restore the flag register if it's live.
8791
*/
8792
const bool save_flag = flag_liveout &
8793
flag_mask(flag, dispatch_width / 8);
8794
const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
8795
8796
if (save_flag)
8797
ubld.group(1, 0).MOV(tmp, flag);
8798
8799
ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
8800
8801
set_predicate(pred, inst);
8802
inst->flag_subreg = 0;
8803
8804
if (save_flag)
8805
ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
8806
8807
progress = true;
8808
}
8809
break;
8810
}
8811
8812
if (inst == halt_start)
8813
depth--;
8814
8815
flag_liveout |= inst->flags_read(devinfo);
8816
}
8817
}
8818
8819
if (progress)
8820
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8821
8822
return progress;
8823
}
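/* Illustrative sketch of the workaround above (pseudo-assembly, SIMD16,
* register numbers made up): a NoMask SEND found under divergent control
* flow ends up as
*
*    FS_OPCODE_LOAD_LIVE_CHANNELS                NoMask
*    (+f0.0.any16h) send(16) g120 ...            NoMask
*
* optionally bracketed by MOVs that save and restore f0.0 when the flag is
* live across this point.
*/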
8824
8825
void
8826
fs_visitor::allocate_registers(bool allow_spilling)
8827
{
8828
bool allocated;
8829
8830
static const enum instruction_scheduler_mode pre_modes[] = {
8831
SCHEDULE_PRE,
8832
SCHEDULE_PRE_NON_LIFO,
8833
SCHEDULE_PRE_LIFO,
8834
};
8835
8836
static const char *scheduler_mode_name[] = {
8837
"top-down",
8838
"non-lifo",
8839
"lifo"
8840
};
8841
8842
bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
8843
8844
/* Try each scheduling heuristic to see if it can successfully register
8845
* allocate without spilling. They should be ordered by decreasing
8846
* performance but increasing likelihood of allocating.
8847
*/
8848
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
8849
schedule_instructions(pre_modes[i]);
8850
this->shader_stats.scheduler_mode = scheduler_mode_name[i];
8851
8852
if (0) {
8853
assign_regs_trivial();
8854
allocated = true;
8855
break;
8856
}
8857
8858
/* Scheduling may create additional opportunities for CMOD propagation,
8859
* so let's do it again. If CMOD propagation made any progress,
8860
* eliminate dead code one more time.
8861
*/
8862
bool progress = false;
8863
const int iteration = 99;
8864
int pass_num = 0;
8865
8866
if (OPT(opt_cmod_propagation)) {
8867
/* dead_code_eliminate "undoes" the fixing done by
8868
* fixup_3src_null_dest, so we have to do it again if
8869
* dead_code_eliminate makes any progress.
8870
*/
8871
if (OPT(dead_code_eliminate))
8872
fixup_3src_null_dest();
8873
}
8874
8875
bool can_spill = allow_spilling &&
8876
(i == ARRAY_SIZE(pre_modes) - 1);
8877
8878
/* We should only spill registers on the last scheduling. */
8879
assert(!spilled_any_registers);
8880
8881
allocated = assign_regs(can_spill, spill_all);
8882
if (allocated)
8883
break;
8884
}
8885
8886
if (!allocated) {
8887
fail("Failure to register allocate. Reduce number of "
8888
"live scalar values to avoid this.");
8889
} else if (spilled_any_registers) {
8890
compiler->shader_perf_log(log_data,
8891
"%s shader triggered register spilling. "
8892
"Try reducing the number of live scalar "
8893
"values to improve performance.\n",
8894
stage_name);
8895
}
8896
8897
/* This must come after all optimization and register allocation, since
8898
* it inserts dead code that happens to have side effects, and it does
8899
* so based on the actual physical registers in use.
8900
*/
8901
insert_gfx4_send_dependency_workarounds();
8902
8903
if (failed)
8904
return;
8905
8906
opt_bank_conflicts();
8907
8908
schedule_instructions(SCHEDULE_POST);
8909
8910
if (last_scratch > 0) {
8911
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
8912
8913
prog_data->total_scratch = brw_get_scratch_size(last_scratch);
8914
8915
if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
8916
if (devinfo->is_haswell) {
8917
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8918
* field documentation, Haswell supports a minimum of 2kB of
8919
* scratch space for compute shaders, unlike every other stage
8920
* and platform.
8921
*/
8922
prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
8923
} else if (devinfo->ver <= 7) {
8924
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8925
* field documentation, platforms prior to Haswell measure scratch
8926
* size linearly with a range of [1kB, 12kB] and 1kB granularity.
8927
*/
8928
prog_data->total_scratch = ALIGN(last_scratch, 1024);
8929
max_scratch_size = 12 * 1024;
8930
}
8931
}
8932
8933
/* We currently only support up to 2MB of scratch space. If we
8934
* need to support more eventually, the documentation suggests
8935
* that we could allocate a larger buffer, and partition it out
8936
* ourselves. We'd just have to undo the hardware's address
8937
* calculation by subtracting (FFTID * Per Thread Scratch Space)
8938
* and then adding FFTID * (Larger Per Thread Scratch Space).
8939
*
8940
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
8941
* Thread Group Tracking > Local Memory/Scratch Space.
8942
*/
8943
assert(prog_data->total_scratch < max_scratch_size);
8944
}
8945
8946
lower_scoreboard();
8947
}
8948
8949
bool
8950
fs_visitor::run_vs()
8951
{
8952
assert(stage == MESA_SHADER_VERTEX);
8953
8954
setup_vs_payload();
8955
8956
if (shader_time_index >= 0)
8957
emit_shader_time_begin();
8958
8959
emit_nir_code();
8960
8961
if (failed)
8962
return false;
8963
8964
emit_urb_writes();
8965
8966
if (shader_time_index >= 0)
8967
emit_shader_time_end();
8968
8969
calculate_cfg();
8970
8971
optimize();
8972
8973
assign_curb_setup();
8974
assign_vs_urb_setup();
8975
8976
fixup_3src_null_dest();
8977
allocate_registers(true /* allow_spilling */);
8978
8979
return !failed;
8980
}
8981
8982
void
8983
fs_visitor::set_tcs_invocation_id()
8984
{
8985
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
8986
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
8987
8988
const unsigned instance_id_mask =
8989
devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
8990
const unsigned instance_id_shift =
8991
devinfo->ver >= 11 ? 16 : 17;
8992
8993
/* Get instance number from g0.2 bits 22:16 or 23:17 */
8994
fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
8995
bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
8996
brw_imm_ud(instance_id_mask));
8997
8998
invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
8999
9000
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
9001
/* gl_InvocationID is just the thread number */
9002
bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
9003
return;
9004
}
9005
9006
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
9007
9008
fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
9009
fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
9010
bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
9011
bld.MOV(channels_ud, channels_uw);
9012
9013
if (tcs_prog_data->instances == 1) {
9014
invocation_id = channels_ud;
9015
} else {
9016
fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
9017
bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
9018
bld.ADD(invocation_id, instance_times_8, channels_ud);
9019
}
9020
}
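/* Worked example (illustrative): in SINGLE_PATCH mode with more than one
* instance, the code above yields gl_InvocationID == instance * 8 + channel,
* so instance 2 covers invocations 16..23 across its eight channels.
*/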
9021
9022
bool
9023
fs_visitor::run_tcs()
9024
{
9025
assert(stage == MESA_SHADER_TESS_CTRL);
9026
9027
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
9028
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
9029
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
9030
9031
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
9032
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9033
9034
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
9035
/* r1-r4 contain the ICP handles. */
9036
payload.num_regs = 5;
9037
} else {
9038
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9039
assert(tcs_key->input_vertices > 0);
9040
/* r1 contains output handles, r2 may contain primitive ID, then the
9041
* ICP handles occupy the next 1-32 registers.
9042
*/
9043
payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
9044
tcs_key->input_vertices;
9045
}
9046
9047
if (shader_time_index >= 0)
9048
emit_shader_time_begin();
9049
9050
/* Initialize gl_InvocationID */
9051
set_tcs_invocation_id();
9052
9053
const bool fix_dispatch_mask =
9054
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
9055
(nir->info.tess.tcs_vertices_out % 8) != 0;
9056
9057
/* Fix the dispatch mask */
9058
if (fix_dispatch_mask) {
9059
bld.CMP(bld.null_reg_ud(), invocation_id,
9060
brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
9061
bld.IF(BRW_PREDICATE_NORMAL);
9062
}
9063
9064
emit_nir_code();
9065
9066
if (fix_dispatch_mask) {
9067
bld.emit(BRW_OPCODE_ENDIF);
9068
}
9069
9070
/* Emit EOT write; set TR DS Cache bit */
9071
fs_reg srcs[3] = {
9072
fs_reg(get_tcs_output_urb_handle()),
9073
fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
9074
fs_reg(brw_imm_ud(0)),
9075
};
9076
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
9077
bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
9078
9079
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
9080
bld.null_reg_ud(), payload);
9081
inst->mlen = 3;
9082
inst->eot = true;
9083
9084
if (shader_time_index >= 0)
9085
emit_shader_time_end();
9086
9087
if (failed)
9088
return false;
9089
9090
calculate_cfg();
9091
9092
optimize();
9093
9094
assign_curb_setup();
9095
assign_tcs_urb_setup();
9096
9097
fixup_3src_null_dest();
9098
allocate_registers(true /* allow_spilling */);
9099
9100
return !failed;
9101
}
9102
9103
bool
9104
fs_visitor::run_tes()
9105
{
9106
assert(stage == MESA_SHADER_TESS_EVAL);
9107
9108
/* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
9109
payload.num_regs = 5;
9110
9111
if (shader_time_index >= 0)
9112
emit_shader_time_begin();
9113
9114
emit_nir_code();
9115
9116
if (failed)
9117
return false;
9118
9119
emit_urb_writes();
9120
9121
if (shader_time_index >= 0)
9122
emit_shader_time_end();
9123
9124
calculate_cfg();
9125
9126
optimize();
9127
9128
assign_curb_setup();
9129
assign_tes_urb_setup();
9130
9131
fixup_3src_null_dest();
9132
allocate_registers(true /* allow_spilling */);
9133
9134
return !failed;
9135
}
9136
9137
bool
9138
fs_visitor::run_gs()
9139
{
9140
assert(stage == MESA_SHADER_GEOMETRY);
9141
9142
setup_gs_payload();
9143
9144
this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
9145
9146
if (gs_compile->control_data_header_size_bits > 0) {
9147
/* Create a VGRF to store accumulated control data bits. */
9148
this->control_data_bits = vgrf(glsl_type::uint_type);
9149
9150
/* If we're outputting more than 32 control data bits, then EmitVertex()
9151
* will set control_data_bits to 0 after emitting the first vertex.
9152
* Otherwise, we need to initialize it to 0 here.
9153
*/
9154
if (gs_compile->control_data_header_size_bits <= 32) {
9155
const fs_builder abld = bld.annotate("initialize control data bits");
9156
abld.MOV(this->control_data_bits, brw_imm_ud(0u));
9157
}
9158
}
9159
9160
if (shader_time_index >= 0)
9161
emit_shader_time_begin();
9162
9163
emit_nir_code();
9164
9165
emit_gs_thread_end();
9166
9167
if (shader_time_index >= 0)
9168
emit_shader_time_end();
9169
9170
if (failed)
9171
return false;
9172
9173
calculate_cfg();
9174
9175
optimize();
9176
9177
assign_curb_setup();
9178
assign_gs_urb_setup();
9179
9180
fixup_3src_null_dest();
9181
allocate_registers(true /* allow_spilling */);
9182
9183
return !failed;
9184
}
9185
9186
/* From the SKL PRM, Volume 16, Workarounds:
9187
*
9188
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
9189
* only header phases (R0-R2)
9190
*
9191
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
9192
* have been header only.
9193
*
9194
* Instead of enabling push constants, one can alternatively enable one of the
9195
* inputs. Here one simply chooses "layer", which shouldn't impose much
9196
* overhead.
9197
*/
9198
static void
9199
gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
9200
{
9201
if (wm_prog_data->num_varying_inputs)
9202
return;
9203
9204
if (wm_prog_data->base.curb_read_length)
9205
return;
9206
9207
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
9208
wm_prog_data->num_varying_inputs = 1;
9209
9210
brw_compute_urb_setup_index(wm_prog_data);
9211
}
9212
9213
bool
9214
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
9215
{
9216
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
9217
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
9218
9219
assert(stage == MESA_SHADER_FRAGMENT);
9220
9221
if (devinfo->ver >= 6)
9222
setup_fs_payload_gfx6();
9223
else
9224
setup_fs_payload_gfx4();
9225
9226
if (0) {
9227
emit_dummy_fs();
9228
} else if (do_rep_send) {
9229
assert(dispatch_width == 16);
9230
emit_repclear_shader();
9231
} else {
9232
if (shader_time_index >= 0)
9233
emit_shader_time_begin();
9234
9235
if (nir->info.inputs_read > 0 ||
9236
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
9237
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
9238
if (devinfo->ver < 6)
9239
emit_interpolation_setup_gfx4();
9240
else
9241
emit_interpolation_setup_gfx6();
9242
}
9243
9244
/* We handle discards by keeping track of the still-live pixels in f0.1.
9245
* Initialize it with the dispatched pixels.
9246
*/
9247
if (wm_prog_data->uses_kill) {
9248
const unsigned lower_width = MIN2(dispatch_width, 16);
9249
for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
9250
const fs_reg dispatch_mask =
9251
devinfo->ver >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
9252
brw_vec1_grf(0, 0);
9253
bld.exec_all().group(1, 0)
9254
.MOV(sample_mask_reg(bld.group(lower_width, i)),
9255
retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
9256
}
9257
}
9258
9259
if (nir->info.writes_memory)
9260
wm_prog_data->has_side_effects = true;
9261
9262
emit_nir_code();
9263
9264
if (failed)
9265
return false;
9266
9267
if (wm_key->alpha_test_func)
9268
emit_alpha_test();
9269
9270
emit_fb_writes();
9271
9272
if (shader_time_index >= 0)
9273
emit_shader_time_end();
9274
9275
calculate_cfg();
9276
9277
optimize();
9278
9279
assign_curb_setup();
9280
9281
if (devinfo->ver >= 9)
9282
gfx9_ps_header_only_workaround(wm_prog_data);
9283
9284
assign_urb_setup();
9285
9286
fixup_3src_null_dest();
9287
9288
allocate_registers(allow_spilling);
9289
9290
if (failed)
9291
return false;
9292
}
9293
9294
return !failed;
9295
}
9296
9297
bool
9298
fs_visitor::run_cs(bool allow_spilling)
9299
{
9300
assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9301
9302
setup_cs_payload();
9303
9304
if (shader_time_index >= 0)
9305
emit_shader_time_begin();
9306
9307
if (devinfo->is_haswell && prog_data->total_shared > 0) {
9308
/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
9309
const fs_builder abld = bld.exec_all().group(1, 0);
9310
abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
9311
suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
9312
}
9313
9314
emit_nir_code();
9315
9316
if (failed)
9317
return false;
9318
9319
emit_cs_terminate();
9320
9321
if (shader_time_index >= 0)
9322
emit_shader_time_end();
9323
9324
calculate_cfg();
9325
9326
optimize();
9327
9328
assign_curb_setup();
9329
9330
fixup_3src_null_dest();
9331
allocate_registers(allow_spilling);
9332
9333
if (failed)
9334
return false;
9335
9336
return !failed;
9337
}
9338
9339
bool
9340
fs_visitor::run_bs(bool allow_spilling)
9341
{
9342
assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
9343
9344
/* R0: thread header, R1: stack IDs, R2: argument addresses */
9345
payload.num_regs = 3;
9346
9347
if (shader_time_index >= 0)
9348
emit_shader_time_begin();
9349
9350
emit_nir_code();
9351
9352
if (failed)
9353
return false;
9354
9355
/* TODO(RT): Perhaps rename this? */
9356
emit_cs_terminate();
9357
9358
if (shader_time_index >= 0)
9359
emit_shader_time_end();
9360
9361
calculate_cfg();
9362
9363
optimize();
9364
9365
assign_curb_setup();
9366
9367
fixup_3src_null_dest();
9368
allocate_registers(allow_spilling);
9369
9370
if (failed)
9371
return false;
9372
9373
return !failed;
9374
}
9375
9376
static bool
9377
is_used_in_not_interp_frag_coord(nir_ssa_def *def)
9378
{
9379
nir_foreach_use(src, def) {
9380
if (src->parent_instr->type != nir_instr_type_intrinsic)
9381
return true;
9382
9383
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr);
9384
if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
9385
return true;
9386
}
9387
9388
nir_foreach_if_use(src, def)
9389
return true;
9390
9391
return false;
9392
}
9393
9394
/**
9395
* Return a bitfield where bit n is set if barycentric interpolation mode n
9396
* (see enum brw_barycentric_mode) is needed by the fragment shader.
9397
*
9398
* We examine the load_barycentric intrinsics rather than looking at input
9399
* variables so that we catch interpolateAtCentroid() messages too, which
9400
* also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
9401
*/
9402
static unsigned
9403
brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
9404
const nir_shader *shader)
9405
{
9406
unsigned barycentric_interp_modes = 0;
9407
9408
nir_foreach_function(f, shader) {
9409
if (!f->impl)
9410
continue;
9411
9412
nir_foreach_block(block, f->impl) {
9413
nir_foreach_instr(instr, block) {
9414
if (instr->type != nir_instr_type_intrinsic)
9415
continue;
9416
9417
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9418
switch (intrin->intrinsic) {
9419
case nir_intrinsic_load_barycentric_pixel:
9420
case nir_intrinsic_load_barycentric_centroid:
9421
case nir_intrinsic_load_barycentric_sample:
9422
break;
9423
default:
9424
continue;
9425
}
9426
9427
/* Ignore WPOS; it doesn't require interpolation. */
9428
assert(intrin->dest.is_ssa);
9429
if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))
9430
continue;
9431
9432
enum glsl_interp_mode interp = (enum glsl_interp_mode)
9433
nir_intrinsic_interp_mode(intrin);
9434
nir_intrinsic_op bary_op = intrin->intrinsic;
9435
enum brw_barycentric_mode bary =
9436
brw_barycentric_mode(interp, bary_op);
9437
9438
barycentric_interp_modes |= 1 << bary;
9439
9440
if (devinfo->needs_unlit_centroid_workaround &&
9441
bary_op == nir_intrinsic_load_barycentric_centroid)
9442
barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
9443
}
9444
}
9445
}
9446
9447
return barycentric_interp_modes;
9448
}
9449
9450
static void
9451
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
9452
const nir_shader *shader)
9453
{
9454
prog_data->flat_inputs = 0;
9455
9456
nir_foreach_shader_in_variable(var, shader) {
9457
unsigned slots = glsl_count_attribute_slots(var->type, false);
9458
for (unsigned s = 0; s < slots; s++) {
9459
int input_index = prog_data->urb_setup[var->data.location + s];
9460
9461
if (input_index < 0)
9462
continue;
9463
9464
/* flat shading */
9465
if (var->data.interpolation == INTERP_MODE_FLAT)
9466
prog_data->flat_inputs |= 1 << input_index;
9467
}
9468
}
9469
}
9470
9471
static uint8_t
9472
computed_depth_mode(const nir_shader *shader)
9473
{
9474
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
9475
switch (shader->info.fs.depth_layout) {
9476
case FRAG_DEPTH_LAYOUT_NONE:
9477
case FRAG_DEPTH_LAYOUT_ANY:
9478
return BRW_PSCDEPTH_ON;
9479
case FRAG_DEPTH_LAYOUT_GREATER:
9480
return BRW_PSCDEPTH_ON_GE;
9481
case FRAG_DEPTH_LAYOUT_LESS:
9482
return BRW_PSCDEPTH_ON_LE;
9483
case FRAG_DEPTH_LAYOUT_UNCHANGED:
9484
return BRW_PSCDEPTH_OFF;
9485
}
9486
}
9487
return BRW_PSCDEPTH_OFF;
9488
}
9489
9490
/**
9491
* Move load_interpolated_input with simple (payload-based) barycentric modes
9492
* to the top of the program so we don't emit multiple PLNs for the same input.
9493
*
9494
* This works around CSE not being able to handle non-dominating cases
9495
* such as:
9496
*
9497
* if (...) {
9498
* interpolate input
9499
* } else {
9500
* interpolate the same exact input
9501
* }
9502
*
9503
* This should be replaced by global value numbering someday.
9504
*/
9505
bool
9506
brw_nir_move_interpolation_to_top(nir_shader *nir)
9507
{
9508
bool progress = false;
9509
9510
nir_foreach_function(f, nir) {
9511
if (!f->impl)
9512
continue;
9513
9514
nir_block *top = nir_start_block(f->impl);
9515
exec_node *cursor_node = NULL;
9516
9517
nir_foreach_block(block, f->impl) {
9518
if (block == top)
9519
continue;
9520
9521
nir_foreach_instr_safe(instr, block) {
9522
if (instr->type != nir_instr_type_intrinsic)
9523
continue;
9524
9525
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9526
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
9527
continue;
9528
nir_intrinsic_instr *bary_intrinsic =
9529
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
9530
nir_intrinsic_op op = bary_intrinsic->intrinsic;
9531
9532
/* Leave interpolateAtSample/Offset() where they are. */
9533
if (op == nir_intrinsic_load_barycentric_at_sample ||
9534
op == nir_intrinsic_load_barycentric_at_offset)
9535
continue;
9536
9537
nir_instr *move[3] = {
9538
&bary_intrinsic->instr,
9539
intrin->src[1].ssa->parent_instr,
9540
instr
9541
};
9542
9543
for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
9544
if (move[i]->block != top) {
9545
move[i]->block = top;
9546
exec_node_remove(&move[i]->node);
9547
if (cursor_node) {
9548
exec_node_insert_after(cursor_node, &move[i]->node);
9549
} else {
9550
exec_list_push_head(&top->instr_list, &move[i]->node);
9551
}
9552
cursor_node = &move[i]->node;
9553
progress = true;
9554
}
9555
}
9556
}
9557
}
9558
nir_metadata_preserve(f->impl, nir_metadata_block_index |
9559
nir_metadata_dominance);
9560
}
9561
9562
return progress;
9563
}
9564
9565
/**
9566
* Demote per-sample barycentric intrinsics to centroid.
9567
*
9568
* Useful when rendering to a non-multisampled buffer.
9569
*/
9570
bool
9571
brw_nir_demote_sample_qualifiers(nir_shader *nir)
9572
{
9573
bool progress = false;
9574
9575
nir_foreach_function(f, nir) {
9576
if (!f->impl)
9577
continue;
9578
9579
nir_builder b;
9580
nir_builder_init(&b, f->impl);
9581
9582
nir_foreach_block(block, f->impl) {
9583
nir_foreach_instr_safe(instr, block) {
9584
if (instr->type != nir_instr_type_intrinsic)
9585
continue;
9586
9587
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9588
if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
9589
intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
9590
continue;
9591
9592
b.cursor = nir_before_instr(instr);
9593
nir_ssa_def *centroid =
9594
nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
9595
nir_intrinsic_interp_mode(intrin));
9596
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
9597
centroid);
9598
nir_instr_remove(instr);
9599
progress = true;
9600
}
9601
}
9602
9603
nir_metadata_preserve(f->impl, nir_metadata_block_index |
9604
nir_metadata_dominance);
9605
}
9606
9607
return progress;
9608
}
9609
9610
void
9611
brw_nir_populate_wm_prog_data(const nir_shader *shader,
9612
const struct intel_device_info *devinfo,
9613
const struct brw_wm_prog_key *key,
9614
struct brw_wm_prog_data *prog_data)
9615
{
9616
/* key->alpha_test_func means simulating alpha testing via discards,
9617
* so the shader definitely kills pixels.
9618
*/
9619
prog_data->uses_kill = shader->info.fs.uses_discard ||
9620
key->alpha_test_func;
9621
prog_data->uses_omask = !key->ignore_sample_mask_out &&
9622
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
9623
prog_data->computed_depth_mode = computed_depth_mode(shader);
9624
prog_data->computed_stencil =
9625
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
9626
9627
prog_data->persample_dispatch =
9628
key->multisample_fbo &&
9629
(key->persample_interp ||
9630
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
9631
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
9632
shader->info.fs.uses_sample_qualifier ||
9633
shader->info.outputs_read);
9634
9635
if (devinfo->ver >= 6) {
9636
prog_data->uses_sample_mask =
9637
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
9638
9639
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
9640
*
9641
* "MSDISPMODE_PERSAMPLE is required in order to select
9642
* POSOFFSET_SAMPLE"
9643
*
9644
* So we can only really get sample positions if we are doing real
9645
* per-sample dispatch. If we need gl_SamplePosition and we don't have
9646
* persample dispatch, we hard-code it to 0.5.
9647
*/
9648
prog_data->uses_pos_offset = prog_data->persample_dispatch &&
9649
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
9650
}
9651
9652
prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
9653
9654
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
9655
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
9656
prog_data->inner_coverage = shader->info.fs.inner_coverage;
9657
9658
prog_data->barycentric_interp_modes =
9659
brw_compute_barycentric_interp_modes(devinfo, shader);
9660
9661
prog_data->per_coarse_pixel_dispatch =
9662
key->coarse_pixel &&
9663
!prog_data->persample_dispatch &&
9664
!prog_data->uses_sample_mask &&
9665
(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
9666
!prog_data->computed_stencil;
9667
9668
prog_data->uses_src_w =
9669
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
9670
prog_data->uses_src_depth =
9671
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9672
!prog_data->per_coarse_pixel_dispatch;
9673
prog_data->uses_depth_w_coefficients =
9674
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9675
prog_data->per_coarse_pixel_dispatch;
9676
9677
calculate_urb_setup(devinfo, key, prog_data, shader);
9678
brw_compute_flat_inputs(prog_data, shader);
9679
}
9680
9681
/**
9682
* Pre-gfx6, the register file of the EUs was shared between threads,
9683
* and each thread used some subset allocated on a 16-register block
9684
* granularity. The unit states wanted these block counts.
9685
*/
9686
static inline int
9687
brw_register_blocks(int reg_count)
9688
{
9689
return ALIGN(reg_count, 16) / 16 - 1;
9690
}
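/* Worked example (illustrative): a shader using 24 GRFs occupies two
* 16-register blocks, which the unit state encodes as the count minus one:
*
*    brw_register_blocks(24) == ALIGN(24, 16) / 16 - 1 == 1
*/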
9691
9692
const unsigned *
9693
brw_compile_fs(const struct brw_compiler *compiler,
9694
void *mem_ctx,
9695
struct brw_compile_fs_params *params)
9696
{
9697
struct nir_shader *nir = params->nir;
9698
const struct brw_wm_prog_key *key = params->key;
9699
struct brw_wm_prog_data *prog_data = params->prog_data;
9700
bool allow_spilling = params->allow_spilling;
9701
const bool debug_enabled =
9702
INTEL_DEBUG & (params->debug_flag ? params->debug_flag : DEBUG_WM);
9703
9704
prog_data->base.stage = MESA_SHADER_FRAGMENT;
9705
9706
const struct intel_device_info *devinfo = compiler->devinfo;
9707
const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
9708
9709
brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size, true);
9710
brw_nir_lower_fs_inputs(nir, devinfo, key);
9711
brw_nir_lower_fs_outputs(nir);
9712
9713
if (devinfo->ver < 6)
9714
brw_setup_vue_interpolation(params->vue_map, nir, prog_data);
9715
9716
/* From the SKL PRM, Volume 7, "Alpha Coverage":
9717
* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
9718
* hardware, regardless of the state setting for this feature."
9719
*/
9720
if (devinfo->ver > 6 && key->alpha_to_coverage) {
9721
/* Run constant fold optimization in order to get the correct source
9722
* offset to determine the render target 0 store instruction in the
9723
* emit_alpha_to_coverage pass.
9724
*/
9725
NIR_PASS_V(nir, nir_opt_constant_folding);
9726
NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage);
9727
}
9728
9729
if (!key->multisample_fbo)
9730
NIR_PASS_V(nir, brw_nir_demote_sample_qualifiers);
9731
NIR_PASS_V(nir, brw_nir_move_interpolation_to_top);
9732
brw_postprocess_nir(nir, compiler, true, debug_enabled,
9733
key->base.robust_buffer_access);
9734
9735
brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
9736
9737
fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
9738
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
9739
float throughput = 0;
9740
bool has_spilled = false;
9741
9742
v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9743
&prog_data->base, nir, 8,
9744
params->shader_time ? params->shader_time_index8 : -1,
9745
debug_enabled);
9746
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
9747
params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
9748
delete v8;
9749
return NULL;
9750
} else if (!(INTEL_DEBUG & DEBUG_NO8)) {
9751
simd8_cfg = v8->cfg;
9752
prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs;
9753
prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
9754
const performance &perf = v8->performance_analysis.require();
9755
throughput = MAX2(throughput, perf.throughput);
9756
has_spilled = v8->spilled_any_registers;
9757
allow_spilling = false;
9758
}
9759
9760
/* Limit dispatch width to simd8 with dual source blending on gfx8.
9761
* See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
9762
*/
9763
if (devinfo->ver == 8 && prog_data->dual_src_blend &&
9764
!(INTEL_DEBUG & DEBUG_NO8)) {
9765
assert(!params->use_rep_send);
9766
v8->limit_dispatch_width(8, "gfx8 workaround: "
9767
"using SIMD8 when dual src blending.\n");
9768
}
9769
9770
if (key->coarse_pixel) {
9771
if (prog_data->dual_src_blend) {
9772
v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
9773
" use SIMD8 messages.\n");
9774
}
9775
v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
9776
" pixel shading.\n");
9777
}
9778
9779
if (!has_spilled &&
9780
v8->max_dispatch_width >= 16 &&
9781
(!(INTEL_DEBUG & DEBUG_NO16) || params->use_rep_send)) {
9782
/* Try a SIMD16 compile */
9783
v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9784
&prog_data->base, nir, 16,
9785
params->shader_time ? params->shader_time_index16 : -1,
9786
debug_enabled);
9787
v16->import_uniforms(v8);
9788
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
9789
compiler->shader_perf_log(params->log_data,
9790
"SIMD16 shader failed to compile: %s",
9791
v16->fail_msg);
9792
} else {
9793
simd16_cfg = v16->cfg;
9794
prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs;
9795
prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
9796
const performance &perf = v16->performance_analysis.require();
9797
throughput = MAX2(throughput, perf.throughput);
9798
has_spilled = v16->spilled_any_registers;
9799
allow_spilling = false;
9800
}
9801
}
9802
9803
const bool simd16_failed = v16 && !simd16_cfg;
9804
9805
/* Currently, the compiler only supports SIMD32 on SNB+ */
9806
if (!has_spilled &&
9807
v8->max_dispatch_width >= 32 && !params->use_rep_send &&
9808
devinfo->ver >= 6 && !simd16_failed &&
9809
!(INTEL_DEBUG & DEBUG_NO32)) {
9810
/* Try a SIMD32 compile */
9811
v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9812
&prog_data->base, nir, 32,
9813
params->shader_time ? params->shader_time_index32 : -1,
9814
debug_enabled);
9815
v32->import_uniforms(v8);
9816
if (!v32->run_fs(allow_spilling, false)) {
9817
compiler->shader_perf_log(params->log_data,
9818
"SIMD32 shader failed to compile: %s",
9819
v32->fail_msg);
9820
} else {
9821
const performance &perf = v32->performance_analysis.require();
9822
9823
if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) {
9824
compiler->shader_perf_log(params->log_data, "SIMD32 shader inefficient\n");
9825
} else {
9826
simd32_cfg = v32->cfg;
9827
prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;
9828
prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
9829
throughput = MAX2(throughput, perf.throughput);
9830
}
9831
}
9832
}
9833
9834
/* When the caller requests a repclear shader, they want SIMD16-only */
9835
if (params->use_rep_send)
9836
simd8_cfg = NULL;
9837
9838
/* Prior to Iron Lake, the PS had a single shader offset with a jump table
9839
* at the top to select the shader. We've never implemented that.
9840
* Instead, we just give them exactly one shader and we pick the widest one
9841
* available.
9842
*/
9843
if (compiler->devinfo->ver < 5) {
9844
if (simd32_cfg || simd16_cfg)
9845
simd8_cfg = NULL;
9846
if (simd32_cfg)
9847
simd16_cfg = NULL;
9848
}
9849
9850
/* If computed depth is enabled, SNB only allows SIMD8. */
9851
if (compiler->devinfo->ver == 6 &&
9852
prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
9853
assert(simd16_cfg == NULL && simd32_cfg == NULL);
9854
9855
if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
9856
/* Iron Lake and earlier only have one Dispatch GRF start field. Make
9857
* the data available in the base prog data struct for convenience.
9858
*/
9859
if (simd16_cfg) {
9860
prog_data->base.dispatch_grf_start_reg =
9861
prog_data->dispatch_grf_start_reg_16;
9862
} else if (simd32_cfg) {
9863
prog_data->base.dispatch_grf_start_reg =
9864
prog_data->dispatch_grf_start_reg_32;
9865
}
9866
}
9867
9868
if (prog_data->persample_dispatch) {
9869
/* Starting with SandyBridge (where we first get MSAA), the different
9870
* pixel dispatch combinations are grouped into classifications A
9871
* through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On most hardware
9872
* generations, the only configurations supporting persample dispatch
9873
* are those in which only one dispatch width is enabled.
9874
*
9875
* The Gfx12 hardware spec has a similar dispatch grouping table, but
9876
* the following conflicting restriction applies (from the page on
9877
* "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader:
9878
*
9879
* "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also
9880
* enabled."
9881
*/
9882
if (simd32_cfg || simd16_cfg)
9883
simd8_cfg = NULL;
9884
if (simd32_cfg && devinfo->ver < 12)
9885
simd16_cfg = NULL;
9886
}
9887
9888
fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
9889
v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
9890
9891
if (unlikely(debug_enabled)) {
9892
g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
9893
nir->info.label ?
9894
nir->info.label : "unnamed",
9895
nir->info.name));
9896
}
9897
9898
struct brw_compile_stats *stats = params->stats;
9899
9900
if (simd8_cfg) {
9901
prog_data->dispatch_8 = true;
9902
g.generate_code(simd8_cfg, 8, v8->shader_stats,
9903
v8->performance_analysis.require(), stats);
9904
stats = stats ? stats + 1 : NULL;
9905
}
9906
9907
if (simd16_cfg) {
9908
prog_data->dispatch_16 = true;
9909
prog_data->prog_offset_16 = g.generate_code(
9910
simd16_cfg, 16, v16->shader_stats,
9911
v16->performance_analysis.require(), stats);
9912
stats = stats ? stats + 1 : NULL;
9913
}
9914
9915
if (simd32_cfg) {
9916
prog_data->dispatch_32 = true;
9917
prog_data->prog_offset_32 = g.generate_code(
9918
simd32_cfg, 32, v32->shader_stats,
9919
v32->performance_analysis.require(), stats);
9920
stats = stats ? stats + 1 : NULL;
9921
}
9922
9923
g.add_const_data(nir->constant_data, nir->constant_data_size);
9924
9925
delete v8;
9926
delete v16;
9927
delete v32;
9928
9929
return g.get_assembly();
9930
}
9931
9932
fs_reg *
9933
fs_visitor::emit_cs_work_group_id_setup()
9934
{
9935
assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9936
9937
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
9938
9939
struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
9940
struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
9941
struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
9942
9943
bld.MOV(*reg, r0_1);
9944
bld.MOV(offset(*reg, bld, 1), r0_6);
9945
bld.MOV(offset(*reg, bld, 2), r0_7);
9946
9947
return reg;
9948
}
9949
9950
unsigned
9951
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
9952
unsigned threads)
9953
{
9954
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
9955
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
9956
return cs_prog_data->push.per_thread.size * threads +
9957
cs_prog_data->push.cross_thread.size;
9958
}
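/* Worked example (illustrative): with a 32-byte per-thread block, a 64-byte
* cross-thread block and 8 HW threads, the total push constant allocation is
* 32 * 8 + 64 == 320 bytes.
*/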
9959
9960
static void
9961
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
9962
{
9963
block->dwords = dwords;
9964
block->regs = DIV_ROUND_UP(dwords, 8);
9965
block->size = block->regs * 32;
9966
}
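/* Worked example (illustrative): 20 push-constant dwords round up to
* DIV_ROUND_UP(20, 8) == 3 GRFs, i.e. block->size == 96 bytes.
*/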
9967
9968
static void
9969
cs_fill_push_const_info(const struct intel_device_info *devinfo,
9970
struct brw_cs_prog_data *cs_prog_data)
9971
{
9972
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
9973
int subgroup_id_index = get_subgroup_id_param_index(devinfo, prog_data);
9974
bool cross_thread_supported = devinfo->verx10 >= 75;
9975
9976
/* The thread ID should be stored in the last param dword */
9977
assert(subgroup_id_index == -1 ||
9978
subgroup_id_index == (int)prog_data->nr_params - 1);
9979
9980
unsigned cross_thread_dwords, per_thread_dwords;
9981
if (!cross_thread_supported) {
9982
cross_thread_dwords = 0u;
9983
per_thread_dwords = prog_data->nr_params;
9984
} else if (subgroup_id_index >= 0) {
9985
/* Fill all but the last register with cross-thread payload */
9986
cross_thread_dwords = 8 * (subgroup_id_index / 8);
9987
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
9988
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
9989
} else {
9990
/* Fill all data using cross-thread payload */
9991
cross_thread_dwords = prog_data->nr_params;
9992
per_thread_dwords = 0u;
9993
}
9994
9995
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
9996
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
9997
9998
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
9999
cs_prog_data->push.per_thread.size == 0);
10000
assert(cs_prog_data->push.cross_thread.dwords +
10001
cs_prog_data->push.per_thread.dwords ==
10002
prog_data->nr_params);
10003
}
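/* Worked example (illustrative): with cross-thread payload supported,
* nr_params == 18 and the subgroup ID in the last slot
* (subgroup_id_index == 17), the split above gives
* cross_thread_dwords == 8 * (17 / 8) == 16 and per_thread_dwords == 2:
* two GRFs shared by all threads plus one GRF replicated per thread.
*/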
10004
10005
static bool
10006
filter_simd(const nir_instr *instr, const void * /* options */)
10007
{
10008
if (instr->type != nir_instr_type_intrinsic)
10009
return false;
10010
10011
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10012
case nir_intrinsic_load_simd_width_intel:
10013
case nir_intrinsic_load_subgroup_id:
10014
return true;
10015
10016
default:
10017
return false;
10018
}
10019
}
10020
10021
static nir_ssa_def *
10022
lower_simd(nir_builder *b, nir_instr *instr, void *options)
10023
{
10024
uintptr_t simd_width = (uintptr_t)options;
10025
10026
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10027
case nir_intrinsic_load_simd_width_intel:
10028
return nir_imm_int(b, simd_width);
10029
10030
case nir_intrinsic_load_subgroup_id:
10031
/* If the whole workgroup fits in one thread, we can lower subgroup_id
10032
* to a constant zero.
10033
*/
10034
if (!b->shader->info.workgroup_size_variable) {
10035
unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
10036
b->shader->info.workgroup_size[1] *
10037
b->shader->info.workgroup_size[2];
10038
if (local_workgroup_size <= simd_width)
10039
return nir_imm_int(b, 0);
10040
}
10041
return NULL;
10042
10043
default:
10044
return NULL;
10045
}
10046
}
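/* Illustrative example (not part of the original file): a fixed 8x4x1
* workgroup has 32 invocations, so when compiled at dispatch_width == 32 the
* whole workgroup fits in a single thread and load_subgroup_id above folds
* to the constant 0.
*/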
10047
10048
static void
10049
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
10050
{
10051
nir_shader_lower_instructions(nir, filter_simd, lower_simd,
10052
(void *)(uintptr_t)dispatch_width);
10053
}
10054
10055
static nir_shader *
10056
compile_cs_to_nir(const struct brw_compiler *compiler,
10057
void *mem_ctx,
10058
const struct brw_cs_prog_key *key,
10059
const nir_shader *src_shader,
10060
unsigned dispatch_width,
10061
bool debug_enabled)
10062
{
10063
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
10064
brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
10065
10066
NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
10067
10068
/* Clean up after the local index and ID calculations. */
10069
NIR_PASS_V(shader, nir_opt_constant_folding);
10070
NIR_PASS_V(shader, nir_opt_dce);
10071
10072
brw_postprocess_nir(shader, compiler, true, debug_enabled,
10073
key->base.robust_buffer_access);
10074
10075
return shader;
10076
}
10077
10078
const unsigned *
10079
brw_compile_cs(const struct brw_compiler *compiler,
10080
void *mem_ctx,
10081
struct brw_compile_cs_params *params)
10082
{
10083
const nir_shader *nir = params->nir;
10084
const struct brw_cs_prog_key *key = params->key;
10085
struct brw_cs_prog_data *prog_data = params->prog_data;
10086
int shader_time_index = params->shader_time ? params->shader_time_index : -1;
10087
10088
const bool debug_enabled = INTEL_DEBUG & DEBUG_CS;
10089
10090
prog_data->base.stage = MESA_SHADER_COMPUTE;
10091
prog_data->base.total_shared = nir->info.shared_size;
10092
10093
/* Generate code for all the possible SIMD variants. */
10094
bool generate_all;
10095
10096
unsigned min_dispatch_width;
10097
unsigned max_dispatch_width;
10098
10099
if (nir->info.workgroup_size_variable) {
10100
generate_all = true;
10101
min_dispatch_width = 8;
10102
max_dispatch_width = 32;
10103
} else {
10104
generate_all = false;
10105
prog_data->local_size[0] = nir->info.workgroup_size[0];
10106
prog_data->local_size[1] = nir->info.workgroup_size[1];
10107
prog_data->local_size[2] = nir->info.workgroup_size[2];
10108
unsigned local_workgroup_size = prog_data->local_size[0] *
10109
prog_data->local_size[1] *
10110
prog_data->local_size[2];
10111
10112
/* Limit max_threads to 64 for the GPGPU_WALKER command */
10113
const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
10114
min_dispatch_width = util_next_power_of_two(
10115
MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
10116
assert(min_dispatch_width <= 32);
10117
max_dispatch_width = 32;
10118
}
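/* Worked example (illustrative): a fixed 16x8x8 workgroup has 1024
* invocations; with max_threads clamped to 64 each thread must cover at
* least DIV_ROUND_UP(1024, 64) == 16 of them, so min_dispatch_width becomes
* util_next_power_of_two(16) == 16 and the SIMD8 variant is skipped below.
*/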
10119
10120
if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
10121
/* These enum values are expressly chosen to be equal to the subgroup
10122
* size that they require.
10123
*/
10124
const unsigned required_dispatch_width =
10125
(unsigned)key->base.subgroup_size_type;
10126
assert(required_dispatch_width == 8 ||
10127
required_dispatch_width == 16 ||
10128
required_dispatch_width == 32);
10129
if (required_dispatch_width < min_dispatch_width ||
10130
required_dispatch_width > max_dispatch_width) {
10131
params->error_str = ralloc_strdup(mem_ctx,
10132
"Cannot satisfy explicit subgroup size");
10133
return NULL;
10134
}
10135
min_dispatch_width = max_dispatch_width = required_dispatch_width;
10136
}
10137
10138
assert(min_dispatch_width <= max_dispatch_width);
10139
10140
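   /* prog_mask and prog_spilled use one bit per SIMD variant: bit 0 for
    * SIMD8, bit 1 for SIMD16 and bit 2 for SIMD32.
    * brw_cs_simd_size_for_group_size() consults these bits later to pick a
    * compiled variant for the actual group size.
    */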
   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
   fs_visitor *v = NULL;

   if (!(INTEL_DEBUG & DEBUG_NO8) &&
       min_dispatch_width <= 8 && max_dispatch_width >= 8) {
      nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
                                           nir, 8, debug_enabled);
      v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                          &prog_data->base,
                          nir8, 8, shader_time_index, debug_enabled);
      if (!v8->run_cs(true /* allow_spilling */)) {
         params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
         delete v8;
         return NULL;
      }

      /* We should always be able to do SIMD32 for compute shaders */
      assert(v8->max_dispatch_width >= 32);

      v = v8;
      prog_data->prog_mask |= 1 << 0;
      if (v8->spilled_any_registers)
         prog_data->prog_spilled |= 1 << 0;
      cs_fill_push_const_info(compiler->devinfo, prog_data);
   }

   if (!(INTEL_DEBUG & DEBUG_NO16) &&
       (generate_all || !prog_data->prog_spilled) &&
       min_dispatch_width <= 16 && max_dispatch_width >= 16) {
      /* Try a SIMD16 compile */
      nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            nir, 16, debug_enabled);
      v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir16, 16, shader_time_index, debug_enabled);
      if (v8)
         v16->import_uniforms(v8);

      const bool allow_spilling = generate_all || v == NULL;
      if (!v16->run_cs(allow_spilling)) {
         compiler->shader_perf_log(params->log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16->fail_msg);
         if (!v) {
            assert(v8 == NULL);
            params->error_str = ralloc_asprintf(
               mem_ctx, "Not enough threads for SIMD8 and "
               "couldn't generate SIMD16: %s", v16->fail_msg);
            delete v16;
            return NULL;
         }
      } else {
         /* We should always be able to do SIMD32 for compute shaders */
         assert(v16->max_dispatch_width >= 32);

         v = v16;
         prog_data->prog_mask |= 1 << 1;
         if (v16->spilled_any_registers)
            prog_data->prog_spilled |= 1 << 1;
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   /* SIMD32 is only enabled for the cases where it is needed, unless it is
    * forced.
    *
    * TODO: Use performance_analysis and drop this boolean.
    */
   const bool needs_32 = v == NULL ||
                         (INTEL_DEBUG & DEBUG_DO32) ||
                         generate_all;

   if (!(INTEL_DEBUG & DEBUG_NO32) &&
       (generate_all || !prog_data->prog_spilled) &&
       needs_32 &&
       min_dispatch_width <= 32 && max_dispatch_width >= 32) {
      /* Try a SIMD32 compile */
      nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            nir, 32, debug_enabled);
      v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir32, 32, shader_time_index, debug_enabled);
      if (v8)
         v32->import_uniforms(v8);
      else if (v16)
         v32->import_uniforms(v16);

      const bool allow_spilling = generate_all || v == NULL;
      if (!v32->run_cs(allow_spilling)) {
         compiler->shader_perf_log(params->log_data,
                                   "SIMD32 shader failed to compile: %s",
                                   v32->fail_msg);
         if (!v) {
            assert(v8 == NULL);
            assert(v16 == NULL);
            params->error_str = ralloc_asprintf(
               mem_ctx, "Not enough threads for SIMD16 and "
               "couldn't generate SIMD32: %s", v32->fail_msg);
            delete v32;
            return NULL;
         }
      } else {
         v = v32;
         prog_data->prog_mask |= 1 << 2;
         if (v32->spilled_any_registers)
            prog_data->prog_spilled |= 1 << 2;
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   if (unlikely(!v) && (INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32))) {
      params->error_str =
         ralloc_strdup(mem_ctx,
                       "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
      return NULL;
   }

   assert(v);

   const unsigned *ret = NULL;

   fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
                  v->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                   nir->info.label ?
                                   nir->info.label : "unnamed",
                                   nir->info.name);
      g.enable_debug(name);
   }

   struct brw_compile_stats *stats = params->stats;
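   /* With a variable workgroup size, every compiled SIMD variant is emitted
    * into the same program and prog_offset[] records where each one starts;
    * the stats pointer is advanced so that each variant reports its own
    * compile statistics.
    */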
   if (generate_all) {
      if (prog_data->prog_mask & (1 << 0)) {
         assert(v8);
         prog_data->prog_offset[0] =
            g.generate_code(v8->cfg, 8, v8->shader_stats,
                            v8->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }

      if (prog_data->prog_mask & (1 << 1)) {
         assert(v16);
         prog_data->prog_offset[1] =
            g.generate_code(v16->cfg, 16, v16->shader_stats,
                            v16->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }

      if (prog_data->prog_mask & (1 << 2)) {
         assert(v32);
         prog_data->prog_offset[2] =
            g.generate_code(v32->cfg, 32, v32->shader_stats,
                            v32->performance_analysis.require(), stats);
         stats = stats ? stats + 1 : NULL;
      }
   } else {
      /* Only one dispatch width will be valid, and will be at offset 0,
       * which is already the default value of prog_offset_* fields.
       */
      prog_data->prog_mask = 1 << (v->dispatch_width / 16);
      g.generate_code(v->cfg, v->dispatch_width, v->shader_stats,
                      v->performance_analysis.require(), stats);
   }

   g.add_const_data(nir->constant_data, nir->constant_data_size);

   ret = g.get_assembly();

   delete v8;
   delete v16;
   delete v32;

   return ret;
}

static unsigned
brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
                                const struct brw_cs_prog_data *cs_prog_data,
                                unsigned group_size)
{
   const unsigned mask = cs_prog_data->prog_mask;
   assert(mask != 0);

   static const unsigned simd8 = 1 << 0;
   static const unsigned simd16 = 1 << 1;
   static const unsigned simd32 = 1 << 2;

   if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32))
      return 32;

   /* Limit max_threads to 64 for the GPGPU_WALKER command */
   const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads);

   if ((mask & simd8) && group_size <= 8 * max_threads) {
      /* Prefer SIMD16 if it can be done without spilling.  Matches the
       * logic in brw_compile_cs.
       */
      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
         return 16;
      return 8;
   }

   if ((mask & simd16) && group_size <= 16 * max_threads)
      return 16;

   assert(mask & simd32);
   assert(group_size <= 32 * max_threads);
   return 32;
}

struct brw_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct brw_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
                            prog_data->local_size;

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size =
      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

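   /* right_mask enables the execution channels that are live in the last
    * thread: if group_size is not a multiple of simd_size, only the low
    * 'remainder' channels of the final thread execute; otherwise all
    * simd_size channels are enabled.
    */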
   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}

static uint8_t
compile_single_bs(const struct brw_compiler *compiler, void *log_data,
                  void *mem_ctx,
                  const struct brw_bs_prog_key *key,
                  struct brw_bs_prog_data *prog_data,
                  nir_shader *shader,
                  fs_generator *g,
                  struct brw_compile_stats *stats,
                  int *prog_offset,
                  char **error_str)
{
   const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;

   prog_data->base.stage = shader->info.stage;
   prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
                                    shader->scratch_size);

   const unsigned max_dispatch_width = 16;
   brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true);
   brw_postprocess_nir(shader, compiler, true, debug_enabled,
                       key->base.robust_buffer_access);

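   /* Bindless (ray-tracing) shaders are compiled at SIMD8 and SIMD16 only.
    * Try SIMD8 first; if it succeeds without spilling, also try SIMD16 and
    * keep the widest variant that compiled successfully.
    */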
   fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;
   bool has_spilled = false;

   uint8_t simd_size = 0;
   if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
      v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                          &prog_data->base, shader,
                          8, -1 /* shader time */, debug_enabled);
      const bool allow_spilling = true;
      if (!v8->run_bs(allow_spilling)) {
         if (error_str)
            *error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
         delete v8;
         return 0;
      } else {
         v = v8;
         simd_size = 8;
         if (v8->spilled_any_registers)
            has_spilled = true;
      }
   }

   if (!has_spilled && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
      v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base, shader,
                           16, -1 /* shader time */, debug_enabled);
      const bool allow_spilling = (v == NULL);
      if (!v16->run_bs(allow_spilling)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16->fail_msg);
         if (v == NULL) {
            assert(v8 == NULL);
            if (error_str) {
               *error_str = ralloc_asprintf(
                  mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s",
                  v16->fail_msg);
            }
            delete v16;
            return 0;
         }
      } else {
         v = v16;
         simd_size = 16;
         if (v16->spilled_any_registers)
            has_spilled = true;
      }
   }

   if (unlikely(v == NULL)) {
      assert(INTEL_DEBUG & (DEBUG_NO8 | DEBUG_NO16));
      if (error_str) {
         *error_str = ralloc_strdup(mem_ctx,
                                    "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
      }
      return 0;
   }

   assert(v);

   int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,
                                 v->performance_analysis.require(), stats);
   if (prog_offset)
      *prog_offset = offset;
   else
      assert(offset == 0);

   delete v8;
   delete v16;

   return simd_size;
}

uint64_t
brw_bsr(const struct intel_device_info *devinfo,
        uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
{
   assert(offset % 64 == 0);
   assert(simd_size == 8 || simd_size == 16);
   assert(local_arg_offset % 8 == 0);

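   /* offset is 64-byte aligned, so its low six bits are free: bit 4 encodes
    * SIMD16 (vs. SIMD8) and bits 2:0 hold the local argument offset in
    * 8-byte units.
    */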
   return offset |
          SET_BITS(simd_size > 8, 4, 4) |
          SET_BITS(local_arg_offset / 8, 2, 0);
}

const unsigned *
brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_bs_prog_key *key,
               struct brw_bs_prog_data *prog_data,
               nir_shader *shader,
               unsigned num_resume_shaders,
               struct nir_shader **resume_shaders,
               struct brw_compile_stats *stats,
               char **error_str)
{
   const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;

   prog_data->base.stage = shader->info.stage;
   prog_data->max_stack_size = 0;

   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                  false, shader->info.stage);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",
                                   shader->info.label ?
                                      shader->info.label : "unnamed",
                                   gl_shader_stage_name(shader->info.stage),
                                   shader->info.name);
      g.enable_debug(name);
   }

   prog_data->simd_size =
      compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
                        shader, &g, stats, NULL, error_str);
   if (prog_data->simd_size == 0)
      return NULL;

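   /* Each resume shader is compiled into the same assembly blob; its offset
    * and SIMD size are packed by brw_bsr() into an entry of the resume
    * shader binding table, which is handed to the generator via
    * g.add_resume_sbt() below.
    */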
   uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders);
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      if (INTEL_DEBUG & DEBUG_RT) {
         char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s",
                                      shader->info.label ?
                                         shader->info.label : "unnamed",
                                      gl_shader_stage_name(shader->info.stage),
                                      i, shader->info.name);
         g.enable_debug(name);
      }

      /* TODO: Figure out shader stats etc. for resume shaders */
      int offset = 0;
      uint8_t simd_size =
         compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
                           resume_shaders[i], &g, NULL, &offset, error_str);
      if (simd_size == 0)
         return NULL;

      assert(offset > 0);
      resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
   }

   /* There is only one constant data section, so make sure all the resume
    * shaders use the same one.
    */
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      assert(resume_shaders[i]->constant_data_size ==
             shader->constant_data_size);
      assert(memcmp(resume_shaders[i]->constant_data,
                    shader->constant_data,
                    shader->constant_data_size) == 0);
   }

   g.add_const_data(shader->constant_data, shader->constant_data_size);
   g.add_resume_sbt(num_resume_shaders, resume_sbt);

   return g.get_assembly();
}

/**
 * Test the dispatch mask packing assumptions of
 * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
 * executed with an unexpected dispatch mask.
 */
static UNUSED void
brw_fs_test_dispatch_packing(const fs_builder &bld)
{
   const gl_shader_stage stage = bld.shader->stage;

   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
                                     bld.shader->stage_prog_data)) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
                           brw_dmask_reg());

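      /* tmp = mask & (mask + 1), which is zero exactly when the mask has the
       * form 2^n-1, i.e. a contiguous run of enabled channels starting at
       * channel 0.
       */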
      ubld.ADD(tmp, mask, brw_imm_ud(1));
      ubld.AND(tmp, mask, tmp);

      /* This will loop forever if the dispatch mask doesn't have the expected
       * form '2^n-1', in which case tmp will be non-zero.
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}

unsigned
fs_visitor::workgroup_size() const
{
   assert(stage == MESA_SHADER_COMPUTE);
   const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
}