GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_generator.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "util/mesa-sha1.h"

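/* In short: the helper below maps an IR register file onto the hardware
 * register file encoding.  ATTR and UNIFORM registers are expected to have
 * been lowered to GRFs before code generation, so hitting them here is a
 * compiler bug.
 */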
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

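/* Build a fully-specified hardware register (including the
 * vstride/width/hstride region description) from an FS IR register.  The
 * "compressed" flag indicates that the instruction is split into two halves
 * by the hardware, which halves the widest region a single source may span.
 */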
static struct brw_reg
brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
                    fs_reg *reg, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
      FALLTHROUGH;
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *   "VertStride must be used to cross GRF register boundaries. This
          *    rule implies that elements within a 'Width' cannot cross GRF
          *    boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         const unsigned max_hw_width = 16;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gfx9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         if (reg->stride > 4) {
            assert(reg != &inst->dst);
            assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
            brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
            brw_reg = stride(brw_reg, reg->stride, 1, 0);
         } else {
            const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
            brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
            brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
         }

         if (devinfo->verx10 == 70) {
            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
             *
             *   "Each DF (Double Float) operand uses an element size of 4 rather
             *    than 8 and all regioning parameters are twice what the values
             *    would be based on the true element size: ExecSize, Width,
             *    HorzStride, and VertStride. Each DF operand uses a pair of
             *    channels and all masking and swizzing should be adjusted
             *    appropriately."
             *
             * From the IvyBridge PRM (Special Requirements for Handling Double
             * Precision Data Types, page 71):
             *
             *   "In Align1 mode, all regioning parameters like stride, execution
             *    size, and width must use the syntax of a pair of packed
             *    floats. The offsets for these data types must be 64-bit
             *    aligned. The execution size and regioning parameters are in
             *    terms of floats."
             *
             * Summarized: when handling DF-typed arguments, ExecSize,
             * VertStride, and Width must be doubled.
             *
             * It applies to BayTrail too.
             */
            if (type_sz(reg->type) == 8) {
               brw_reg.width++;
               if (brw_reg.vstride > 0)
                  brw_reg.vstride++;
               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
            }

            /* When converting from DF->F, we set the destination stride to 2
             * because each d2f conversion implicitly writes 2 floats, the
             * first one being the converted value.  IVB/BYT actually writes
             * two F components per SIMD channel, and every other component is
             * filled with garbage.
             */
            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
                type_sz(inst->dst.type) < 8) {
               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
               brw_reg.hstride--;
            }
         }
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
    * region, but on IVB and BYT DF regions must be programmed in terms of
    * floats.  A <0,2,1> region accomplishes this.
    */
   if (devinfo->verx10 == 70 &&
       type_sz(reg->type) == 8 &&
       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
       brw_reg.width == BRW_WIDTH_1 &&
       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
      brw_reg.width = BRW_WIDTH_2;
      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
   }

   return brw_reg;
}

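/* A note on structure: the generator owns a brw_codegen ("p"), the low-level
 * EU assembler, and each generate_*() method below translates one IR
 * instruction into one or more native instructions through it.
 */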
fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           struct brw_stage_prog_data *prog_data,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo),
     prog_data(prog_data), dispatch_width(0),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     shader_name(NULL), stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);

   /* In the FS code generator, we are very careful to ensure that we always
    * set the right execution size so we don't need the EU code to "help" us
    * by trying to infer it.  Sometimes, it infers the wrong thing.
    */
   p->automatic_exec_sizes = false;
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

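/* Patch the UIP/JIP offsets of every HALT emitted for discards so that they
 * point at the end of the program, now that the final instruction count is
 * known.  Returns false if there was nothing to patch.
 */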
bool
fs_generator::patch_halt_jumps()
{
   if (this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   if (devinfo->ver >= 6) {
      /* There is a somewhat strange undocumented requirement of using
       * HALT, according to the simulator.  If some channel has HALTed to
       * a particular UIP, then by the end of the program, every channel
       * must have HALTed to that UIP.  Furthermore, the tracking is a
       * stack, so you can't do the final halt of a UIP after starting
       * halting to a new UIP.
       *
       * Symptoms of not emitting this instruction on actual hardware
       * included GPU hangs and sparkly rendering on the piglit discard
       * tests.
       */
      brw_inst *last_halt = brw_HALT(p);
      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
   }

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      if (devinfo->ver >= 6) {
         /* HALT takes a half-instruction distance from the pre-incremented IP. */
         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
      } else {
         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
      }
   }

   this->discard_halt_patches.make_empty();

   if (devinfo->ver < 6) {
      /* From the g965 PRM:
       *
       *    "As DMask is not automatically reloaded into AMask upon completion
       *     of this instruction, software has to manually restore AMask upon
       *     completion."
       *
       * DMask lives in the bottom 16 bits of sr0.1.
       */
      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
                                retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
   }

   if (devinfo->ver == 4 && !devinfo->is_g4x) {
      /* From the g965 PRM:
       *
       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
       *     reset to zero during graphics reset, however, they are not
       *     initialized at thread dispatch. These subfields will retain the
       *     values from the previous thread. Software should make sure the
       *     mask stack is empty (reset to zero) before terminating the thread.
       *     In case that this is not practical, software may have to reset the
       *     mask stack at the beginning of each kernel, which will impact the
       *     performance."
       *
       * Luckily we can rely on:
       *
       *    "[DevBW, DevCL] This register access restriction is not
       *     applicable, hardware does ensure execution pipeline coherency,
       *     when a mask stack register is used as an explicit source and/or
       *     destination."
       */
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_set_default_exec_size(p, BRW_EXECUTE_2);
      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));

      brw_set_default_exec_size(p, BRW_EXECUTE_16);
      /* Reset the if stack. */
      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0));

      brw_pop_insn_state(p);
   }

   return true;
}

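/* Emit a SEND (or split SENDS) for a logical send instruction.  The
 * descriptor and extended descriptor may each be an immediate, a register,
 * or a combination of both.
 */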
void
fs_generator::generate_send(fs_inst *inst,
                            struct brw_reg dst,
                            struct brw_reg desc,
                            struct brw_reg ex_desc,
                            struct brw_reg payload,
                            struct brw_reg payload2)
{
   const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                            dst.nr == BRW_ARF_NULL;
   const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE;

   uint32_t desc_imm = inst->desc |
      brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);

   uint32_t ex_desc_imm = inst->ex_desc |
      brw_message_ex_desc(devinfo, inst->ex_mlen);

   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
      /* If we have any sort of extended descriptor, then we need SENDS.  This
       * also covers the dual-payload case because ex_mlen goes in ex_desc.
       */
      brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
                                      desc, desc_imm, ex_desc, ex_desc_imm,
                                      inst->eot);
      if (inst->check_tdr)
         brw_inst_set_opcode(p->devinfo, brw_last_inst,
                             devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
   } else {
      brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
                                inst->eot);
      if (inst->check_tdr)
         brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}

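/* Emit the actual framebuffer write message.  generate_fb_write() below
 * wraps this with the runtime check for whether antialiasing data must be
 * included in the payload on pre-gfx6 hardware.
 */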
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   if (devinfo->ver < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
              offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
      brw_pop_insn_state(p);
   }

   uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data);

   /* We assume render targets start at 0, because headerless FB write
    * messages set "Render Target Index" to 0.  Using a different binding
    * table index would make it impossible to use headerless messages.
    */
   const uint32_t surf_index = inst->target;

   brw_inst *insn = brw_fb_WRITE(p,
                                 payload,
                                 retype(implied_header, BRW_REGISTER_TYPE_UW),
                                 msg_control,
                                 surf_index,
                                 nr,
                                 0,
                                 inst->eot,
                                 inst->last_rt,
                                 inst->header_size != 0);

   if (devinfo->ver >= 6)
      brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   if (devinfo->verx10 <= 70) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_flag_reg(p, 0, 0);
   }

   const struct brw_reg implied_header =
      devinfo->ver < 6 ? payload : brw_null_reg();

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->ver < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check the runtime bit to detect whether we have to send AA data. */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_pop_insn_state(p);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   /* We assume that render targets start at binding table index 0. */
   const unsigned surf_index = inst->target;

   gfx9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);
}

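/* Emit an indirect MOV: read from a GRF whose address is computed at run
 * time as reg + indirect_byte_offset.  When the offset turns out to be
 * immediate this degenerates to a plain MOV; otherwise the address register
 * is loaded and VxH indirect addressing is used.
 */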
void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
   assert(!reg.abs && !reg.negate);
   assert(reg.type == dst.type);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(reg, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(reg, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, reg);
      }
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size <= 8 || devinfo->ver >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* Whether we can use destination dependency control without running the
       * risk of a hang if an instruction gets shot down.
       */
      const bool use_dep_ctrl = !inst->predicate &&
                                inst->exec_size == dispatch_width;
      brw_inst *insn;

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      /* There are a number of reasons why we don't use the base offset here.
       * One reason is that the field is only 9 bits which means we can only
       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
       * section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *     change the register address.  The lower 5 bits of Address
       *     Immediate when added to lower 5 bits of address register gives
       *     the sub-register offset.  The upper bits of Address Immediate
       *     when added to upper bits of address register gives the register
       *     address.  Any overflow from sub-register offset is dropped."
       *
       * Since the indirect may cause us to cross a register boundary, this
       * makes the base offset almost useless.  We could try and do something
       * clever where we use an actual base offset if base_offset % 32 == 0
       * but that would mean we were generating different code depending on
       * the base offset.  Instead, for the sake of consistency, we'll just do
       * the add ourselves.  This restriction is only listed in the Haswell
       * PRM but empirical testing indicates that it applies on all older
       * generations and is lifted on Broadwell.
       *
       * In the end, while base_offset is nice to look at in the generated
       * code, using it saves us 0 instructions and would require quite a bit
       * of case-by-case work.  It's just not worth it.
       *
       * Due to a hardware bug some platforms (particularly Gfx11+) seem to
       * require the address components of all channels to be valid whether or
       * not they're active, which causes issues if we use VxH addressing
       * under non-uniform control-flow.  We can easily work around that by
       * initializing the whole address register with a pipelined NoMask MOV
       * instruction.
       */
      if (devinfo->ver >= 7) {
         insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
      }

      insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
      if (devinfo->ver >= 12)
         brw_set_default_swsb(p, tgl_swsb_regdist(1));
      else if (devinfo->ver >= 7)
         brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

      if (type_sz(reg.type) > 4 &&
          ((devinfo->verx10 == 70) ||
           devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
           !devinfo->has_64bit_float || devinfo->verx10 >= 125)) {
         /* IVB has an issue (which we found empirically) where it reads two
          * address register components per channel for indirectly addressed
          * 64-bit sources.
          *
          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
          *
          *    "When source or destination datatype is 64b or operation is
          *     integer DWord multiply, indirect addressing must not be used."
          *
          * To work around both of these, we do two integer MOVs instead of
          * one 64-bit MOV.  Because no double value should ever cross a
          * register boundary, it's safe to use the immediate offset in the
          * indirect here to handle adding 4 bytes to the offset and avoid the
          * extra ADD to the register file.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
      } else {
         struct brw_reg ind_src = brw_VxH_indirect(0, 0);

         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));

         if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
             !inst->get_next()->is_tail_sentinel() &&
             ((fs_inst *)inst->get_next())->mlen > 0) {
            /* From the Sandybridge PRM:
             *
             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
             *     instruction that “indexed/indirect” source AND is followed
             *     by a send, the instruction requires a “Switch”. This is to
             *     avoid race condition where send may dispatch before MRF is
             *     updated."
             */
            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
         }
      }
   }
}

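/* Emit a shuffle: dst[i] = src[idx[i]] for each channel.  The general case
 * computes a per-channel byte address from the index register and reads the
 * source through VxH indirect addressing; uniform sources and immediate
 * indices collapse to plain MOVs.
 */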
void
fs_generator::generate_shuffle(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src,
                               struct brw_reg idx)
{
   assert(src.file == BRW_GENERAL_REGISTER_FILE);
   assert(!src.abs && !src.negate);

   /* Ivy bridge has some strange behavior that makes this a real pain to
    * implement for 64-bit values so we just don't bother.
    */
   assert(devinfo->verx10 >= 75 || type_sz(src.type) <= 4);

   /* Because we're using the address register, we're limited to 8-wide
    * execution on gfx7.  On gfx8, we're limited to 16-wide by the address
    * register file and 8-wide for 64-bit types.  We could try and make this
    * instruction splittable higher up in the compiler but that gets weird
    * because it reads all of the channels regardless of execution size.  It's
    * easier just to split it here.
    */
   const unsigned lower_width =
      (devinfo->ver <= 7 || type_sz(src.type) > 4) ?
      8 : MIN2(16, inst->exec_size);

   brw_set_default_exec_size(p, cvt(lower_width) - 1);
   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
      brw_set_default_group(p, group);

      if ((src.vstride == 0 && src.hstride == 0) ||
          idx.file == BRW_IMMEDIATE_VALUE) {
         /* Trivial, the source is already uniform or the index is a constant.
          * We will typically not get here if the optimizer is doing its job,
          * but asserting would be mean.
          */
         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
         struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
         struct brw_reg group_dst = suboffset(dst, group);
         if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
            brw_MOV(p, subscript(group_dst, BRW_REGISTER_TYPE_UD, 0),
                       subscript(group_src, BRW_REGISTER_TYPE_UD, 0));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(group_dst, BRW_REGISTER_TYPE_UD, 1),
                       subscript(group_src, BRW_REGISTER_TYPE_UD, 1));
         } else {
            brw_MOV(p, group_dst, group_src);
         }
      } else {
         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
         struct brw_reg addr = vec8(brw_address_reg(0));

         struct brw_reg group_idx = suboffset(idx, group);

         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
            /* Things get grumpy if the register is too wide. */
            group_idx.width--;
            group_idx.vstride--;
         }

         assert(type_sz(group_idx.type) <= 4);
         if (type_sz(group_idx.type) == 4) {
            /* The destination stride of an instruction (in bytes) must be
             * greater than or equal to the size of the rest of the
             * instruction.  Since the address register is of type UW, we
             * can't use a D-type instruction.  In order to get around this,
             * we retype to UW and use a stride.
             */
            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
         }

         uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;

         /* From the Haswell PRM:
          *
          *    "When a sequence of NoDDChk and NoDDClr are used, the last
          *     instruction that completes the scoreboard clear must have a
          *     non-zero execution mask. This means, if any kind of predication
          *     can change the execution mask or channel enable of the last
          *     instruction, the optimization must be avoided.  This is to
          *     avoid instructions being shot down the pipeline when no writes
          *     are required."
          *
          * Whenever predication is enabled or the instructions being emitted
          * aren't the full width, it's possible that it will be run with zero
          * channels enabled so we can't use dependency control without
          * running the risk of a hang if an instruction gets shot down.
          */
         const bool use_dep_ctrl = !inst->predicate &&
                                   lower_width == dispatch_width;
         brw_inst *insn;

         /* Due to a hardware bug some platforms (particularly Gfx11+) seem
          * to require the address components of all channels to be valid
          * whether or not they're active, which causes issues if we use VxH
          * addressing under non-uniform control-flow.  We can easily work
          * around that by initializing the whole address register with a
          * pipelined NoMask MOV instruction.
          */
         insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
         brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
         brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_null());
         else
            brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         insn = brw_SHL(p, addr, group_idx,
                        brw_imm_uw(util_logbase2(type_sz(src.type)) +
                                   src.hstride - 1));
         if (devinfo->ver >= 12)
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
         else
            brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);

         /* Add on the register start offset */
         brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));

         if (type_sz(src.type) > 4 &&
             ((devinfo->verx10 == 70) ||
              devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* IVB has an issue (which we found empirically) where it reads
             * two address register components per channel for indirectly
             * addressed 64-bit sources.
             *
             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around both of these, we do two integer MOVs instead of
             * one 64-bit MOV.  Because no double value should ever cross a
             * register boundary, it's safe to use the immediate offset in the
             * indirect here to handle adding 4 bytes to the offset and avoid
             * the extra ADD to the register file.
             */
            struct brw_reg gdst = suboffset(dst, group);
            struct brw_reg dst_d = retype(spread(gdst, 2),
                                          BRW_REGISTER_TYPE_D);
            assert(dst.hstride == 1);
            brw_MOV(p, dst_d,
                       retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, byte_offset(dst_d, 4),
                       retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, suboffset(dst, group * dst.hstride),
                       retype(brw_VxH_indirect(0, 0), src.type));
         }
      }

      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

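/* Replicate or reorder the four channels of each quad (group of four
 * adjacent invocations) according to the given swizzle, e.g.
 * BRW_SWIZZLE_XXXX broadcasts the first channel of each quad to the other
 * three.
 */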
void
fs_generator::generate_quad_swizzle(const fs_inst *inst,
                                    struct brw_reg dst, struct brw_reg src,
                                    unsigned swiz)
{
   /* Requires a quad. */
   assert(inst->exec_size >= 4);

   if (src.file == BRW_IMMEDIATE_VALUE ||
       has_scalar_region(src)) {
      /* The value is uniform across all channels */
      brw_MOV(p, dst, src);

   } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
      /* This only works on 8-wide 32-bit values */
      assert(inst->exec_size == 8);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      struct brw_reg swiz_src = stride(src, 4, 4, 1);
      swiz_src.swizzle = swiz;
      brw_MOV(p, dst, swiz_src);

   } else {
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.vstride == src.width + 1);
      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));

      switch (swiz) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
         break;

      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
         break;

      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         assert(inst->exec_size == 4);
         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
         break;

      default:
         assert(inst->force_writemask_all);
         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);

         for (unsigned c = 0; c < 4; c++) {
            brw_inst *insn = brw_MOV(
               p, stride(suboffset(dst, c),
                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));

            if (devinfo->ver < 12) {
               brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
               brw_inst_set_no_dd_check(devinfo, insn, c > 0);
            }

            brw_set_default_swsb(p, tgl_swsb_null());
         }

         break;
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   if (devinfo->ver < 12)
      brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GFX8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   if (devinfo->ver < 12)
      brw_set_src1(p, insn, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GFX8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
   if (devinfo->ver < 12)
      brw_set_src1(p, insn, brw_imm_ud(0u));

   /* For XeHP and newer send a message to the message gateway to terminate a
    * compute shader.  For older devices, a message is sent to the thread
    * spawner.
    */
   if (devinfo->verx10 >= 125)
      brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY);
   else
      brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0);   /* Dereference resource */

   if (devinfo->ver < 11) {
      brw_inst_set_ts_request_type(devinfo, insn, 0);   /* Root thread */

      /* Note that even though the thread has a URB resource associated with
       * it, we set the "do not dereference URB" bit, because the URB resource
       * is managed by the fixed-function unit, so it will free it
       * automatically.
       */
      brw_inst_set_ts_resource_select(devinfo, insn, 1);   /* Do not dereference URB */
   }

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
{
   brw_barrier(p, src);
   if (devinfo->ver >= 12) {
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_SYNC(p, TGL_SYNC_BAR);
   } else {
      brw_WAIT(p);
   }
}

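/* Evaluate a plane equation to interpolate a fragment shader input: PLN
 * where the hardware supports it, or an equivalent LINE+MAC sequence where
 * it doesn't (gfx4) or where src1 isn't even-register aligned (SNB).
 */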
bool
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gfx4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];
   brw_inst *i[2];

   /* nir_lower_interpolation() will do the lowering to MAD instructions for
    * us on gfx11+.
    */
   assert(devinfo->ver < 11);

   if (devinfo->has_pln) {
      if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) {
         /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
          *
          *    "[DevSNB]:<src1> must be even register aligned.
          *
          * This restriction is lifted on Ivy Bridge.
          *
          * This means that we need to split PLN into LINE+MAC on-the-fly.
          * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
          * we have to split into SIMD8 pieces.  For gfx4 (!has_pln), the
          * coordinate registers are laid out differently so we leave it as a
          * SIMD16 instruction.
          */
         assert(inst->exec_size == 8 || inst->exec_size == 16);
         assert(inst->group % 16 == 0);

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);

         /* Thanks to two accumulators, we can emit all the LINEs and then all
          * the MACs.  This improves parallelism a bit.
          */
         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
            brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
                                      offset(delta_x, g * 2));
            brw_inst_set_group(devinfo, line, inst->group + g * 8);

            /* LINE writes the accumulator automatically on gfx4-5.  On Sandy
             * Bridge and later, we have to explicitly enable it.
             */
            if (devinfo->ver >= 6)
               brw_inst_set_acc_wr_control(p->devinfo, line, true);

            /* brw_set_default_saturate() is called before emitting
             * instructions, so the saturate bit is set in each instruction,
             * so we need to unset it on the LINE instructions.
             */
            brw_inst_set_saturate(p->devinfo, line, false);
         }

         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
            brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
                                    offset(delta_x, g * 2 + 1));
            brw_inst_set_group(devinfo, mac, inst->group + g * 8);
            brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
         }

         brw_pop_insn_state(p);

         return true;
      } else {
         brw_PLN(p, dst, interp, delta_x);

         return false;
      }
   } else {
      i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
      i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);

      brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);

      /* brw_set_default_saturate() is called before emitting instructions, so
       * the saturate bit is set in each instruction, so we need to unset it on
       * the first instruction.
       */
      brw_inst_set_saturate(p->devinfo, i[0], false);

      return true;
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->ver >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);
}

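/* Pre-gfx7 texturing: translate the IR texture opcode into a sampler message
 * descriptor (message type, SIMD mode, return format) and emit the SAMPLE
 * send, including the optional header used for texel offsets.
 */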
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(devinfo->ver < 7);
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;

   /* Sampler EOT message of less than the dispatch width would kill the
    * thread prematurely.
    */
   assert(!inst->eot || inst->exec_size == dispatch_width);

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gfx4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gfx4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->ver >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         assert(!inst->shadow_compare);
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GFX5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         assert(devinfo->ver == 6);
         assert(!inst->shadow_compare);
         msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   struct brw_reg src = brw_null_reg();
   if (inst->header_size != 0) {
      if (devinfo->ver < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         const tgl_swsb swsb = brw_get_default_swsb(p);
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         brw_push_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
         brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      }
   }

   uint32_t base_binding_table_index;
   switch (inst->opcode) {
   case SHADER_OPCODE_TG4:
      base_binding_table_index = prog_data->binding_table.gather_texture_start;
      break;
   default:
      base_binding_table_index = prog_data->binding_table.texture_start;
      break;
   }

   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surface_index.ud + base_binding_table_index,
              sampler_index.ud % 16,
              msg_type,
              inst->size_written / REG_SIZE,
              inst->mlen,
              inst->header_size != 0,
              simd_mode,
              return_format);
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 *  arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *            DDX                   DDY
 *  dst: (ss0.tr - ss0.tl)   (ss0.tl - ss0.bl)
 *       (ss0.tr - ss0.tl)   (ss0.tr - ss0.br)
 *       (ss0.br - ss0.bl)   (ss0.tl - ss0.bl)
 *       (ss0.br - ss0.bl)   (ss0.tr - ss0.br)
 *       (ss1.tr - ss1.tl)   (ss1.tl - ss1.bl)
 *       (ss1.tr - ss1.tl)   (ss1.tr - ss1.br)
 *       (ss1.br - ss1.bl)   (ss1.tl - ss1.bl)
 *       (ss1.br - ss1.bl)   (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (devinfo->ver >= 8) {
      if (inst->opcode == FS_OPCODE_DDX_FINE) {
         /* produce accurate derivatives */
         vstride = BRW_VERTICAL_STRIDE_2;
         width = BRW_WIDTH_2;
      } else {
         /* replicate the derivative at the top-left pixel to other pixels */
         vstride = BRW_VERTICAL_STRIDE_4;
         width = BRW_WIDTH_4;
      }

      struct brw_reg src0 = byte_offset(src, type_sz(src.type));
      struct brw_reg src1 = src;

      src0.vstride = vstride;
      src0.width = width;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
      src1.vstride = vstride;
      src1.width = width;
      src1.hstride = BRW_HORIZONTAL_STRIDE_0;

      brw_ADD(p, dst, src0, negate(src1));
   } else {
      /* On Haswell and earlier, the region used above appears to not work
       * correctly for compressed instructions.  At least on Haswell and
       * Iron Lake, compressed ALIGN16 instructions do work.  Since we
       * would have to split to SIMD8 no matter which method we choose, we
       * may as well use ALIGN16 on all platforms gfx7 and earlier.
       */
      struct brw_reg src0 = stride(src, 4, 4, 1);
      struct brw_reg src1 = stride(src, 4, 4, 1);
      if (inst->opcode == FS_OPCODE_DDX_FINE) {
         src0.swizzle = BRW_SWIZZLE_XXZZ;
         src1.swizzle = BRW_SWIZZLE_YYWW;
      } else {
         src0.swizzle = BRW_SWIZZLE_XXXX;
         src1.swizzle = BRW_SWIZZLE_YYYY;
      }

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   }
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(const fs_inst *inst,
                           struct brw_reg dst, struct brw_reg src)
{
   const uint32_t type_size = type_sz(src.type);

   if (inst->opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives.
       *
       * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
       * "Register Region Restrictions", Section "1. Special Restrictions":
       *
       *    "In Align16 mode, the channel selects and channel enables apply to
       *     a pair of half-floats, because these parameters are defined for
       *     DWord elements ONLY. This is applicable when both source and
       *     destination are half-floats."
       *
       * So for half-float operations we use the Gfx11+ Align1 path.  CHV
       * inherits its FP16 hardware from SKL, so it is not affected.
       */
      if (devinfo->ver >= 11 ||
          (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) {
         src = stride(src, 0, 2, 1);

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         for (uint32_t g = 0; g < inst->exec_size; g += 4) {
            brw_set_default_group(p, inst->group + g);
            brw_ADD(p, byte_offset(dst, g * type_size),
                       negate(byte_offset(src,  g * type_size)),
                       byte_offset(src, (g + 2) * type_size));
            brw_set_default_swsb(p, tgl_swsb_null());
         }
         brw_pop_insn_state(p);
      } else {
         struct brw_reg src0 = stride(src, 4, 4, 1);
         struct brw_reg src1 = stride(src, 4, 4, 1);
         src0.swizzle = BRW_SWIZZLE_XYXY;
         src1.swizzle = BRW_SWIZZLE_ZWZW;

         brw_push_insn_state(p);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, dst, negate(src0), src1);
         brw_pop_insn_state(p);
      }
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      if (devinfo->ver >= 8) {
         struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
         struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);

         brw_ADD(p, dst, negate(src0), src1);
      } else {
         /* On Haswell and earlier, the region used above appears to not work
          * correctly for compressed instructions.  At least on Haswell and
          * Iron Lake, compressed ALIGN16 instructions do work.  Since we
          * would have to split to SIMD8 no matter which method we choose, we
          * may as well use ALIGN16 on all platforms gfx7 and earlier.
          */
         struct brw_reg src0 = stride(src, 4, 4, 1);
         struct brw_reg src1 = stride(src, 4, 4, 1);
         src0.swizzle = BRW_SWIZZLE_XXXX;
         src1.swizzle = BRW_SWIZZLE_ZZZZ;

         brw_push_insn_state(p);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, dst, negate(src0), src1);
         brw_pop_insn_state(p);
      }
   }
}

void
fs_generator::generate_halt(fs_inst *)
{
   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
   brw_HALT(p);
}

void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* The 32-wide messages only respect the first 16-wide half of the channel
    * enable signals which are replicated identically for the second group of
    * 16 channels, so we cannot use them unless the write is marked
    * force_writemask_all.
    */
   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                               MIN2(16, inst->exec_size);
   const unsigned block_size = 4 * lower_size / REG_SIZE;
   const tgl_swsb swsb = brw_get_default_swsb(p);
   assert(inst->mlen != 0);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, cvt(lower_size) - 1);
   brw_set_default_compression(p, lower_size > 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      brw_set_default_group(p, inst->group + lower_size * i);

      if (i > 0) {
         assert(swsb.mode & TGL_SBID_SET);
         brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
      } else {
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      }

      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                    block_size,
                                    inst->offset + block_size * REG_SIZE * i);
   }

   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);

   gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}

/* The A32 messages take a buffer base address in header.5:[31:0] (See
 * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
 * and OWord block messages in the SKL PRM Vol. 2d for more details.)
 * Unfortunately, there are a number of subtle differences:
 *
 * For the block read/write messages:
 *
 *   - We always stomp header.2 to fill in the actual scratch address (in
 *     units of OWORDs) so we don't care what's in there.
 *
 *   - They rely on a per-thread scratch space value in header.3[3:0] to do
 *     bounds checking so that needs to be valid.  The upper bits of
 *     header.3 are ignored, though, so we can copy all of g0.3.
 *
 *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
 *
 *
 * For the byte/dword scattered read/write messages:
 *
 *   - We want header.2 to be zero because that gets added to the per-channel
 *     offset in the non-header portion of the message.
 *
 *   - Contrary to what the docs claim, they don't do any bounds checking so
 *     the value of header.3[3:0] doesn't matter.
 *
 *   - They consider all of header.5 for the base address and header.5[9:0]
 *     are not ignored.  This means that we can't copy g0.5 verbatim because
 *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
 *     use an AND to mask off the bottom 10 bits.
 *
 *
 * For block messages, just copying g0 gives a valid header because all the
 * garbage gets ignored except for header.2 which we stomp as part of message
 * setup.  For byte/dword scattered messages, we can just zero out the header
 * and copy over the bits we need from g0.5.  This opcode, however, tries to
 * satisfy the requirements of both by starting with 0 and filling out the
 * information required by either set of opcodes.
 */
void
fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size == 8 && inst->force_writemask_all);
   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   dst.type = BRW_REGISTER_TYPE_UD;

   brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
   if (devinfo->ver >= 12)
      brw_set_default_swsb(p, tgl_swsb_null());
   else
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);

   /* Copy the per-thread scratch space size from g0.3[3:0] */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   insn = brw_AND(p, suboffset(dst, 3),
                  retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(3, 0)));
   if (devinfo->ver < 12) {
      brw_inst_set_no_dd_clear(p->devinfo, insn, true);
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
   }

   /* Copy the scratch base address from g0.5[31:10] */
   insn = brw_AND(p, suboffset(dst, 5),
                  retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(31, 10)));
   if (devinfo->ver < 12)
      brw_inst_set_no_dd_check(p->devinfo, insn, true);
}

void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(type_sz(dst.type) == 4);
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

void
fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg payload)
{
   assert(index.type == BRW_REGISTER_TYPE_UD);
   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   assert(type_sz(dst.type) == 4);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      const uint32_t surf_index = index.ud;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_pop_insn_state(p);

      brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send,
                   brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
                                                             REG_SIZE), true) |
                   brw_dp_desc(devinfo, surf_index,
                               GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)));

   } else {
      const tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      /* a0.0 = surf_index & 0xff */
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      brw_send_indirect_message(
         p, GFX6_SFID_DATAPORT_CONSTANT_CACHE,
         retype(dst, BRW_REGISTER_TYPE_UD),
         retype(payload, BRW_REGISTER_TYPE_UD), addr,
         brw_message_desc(devinfo, 1,
                          DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
         brw_dp_desc(devinfo, 0 /* surface */,
                     GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)),
         false /* EOT */);

      brw_pop_insn_state(p);
   }
}

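/* A sketch of the two paths above: with an immediate surface index the whole
 * message descriptor is a compile-time constant, so a plain SEND is emitted.
 * With a dynamic index only the surface-index byte varies, so the generator
 * masks it into a0.0 and the SEND reads its descriptor as
 * (a0.0 | static part), where the static part still encodes the message
 * length, response length, and the OWord block read message type.
 */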
void
fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->ver < 7); /* Should use the gfx7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->ver >= 5)
      msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->size_written == 8 * REG_SIZE);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gfx6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_desc(p, send,
                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
                brw_sampler_desc(devinfo, surf_index,
                                 0, /* sampler (unused) */
                                 msg_type, simd_mode, return_format));
}

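/* The response lengths above follow from the sampler writeback format (an
 * illustrative calculation, not a quote from the PRM): each channel returns
 * a vec4 of 32-bit values, so SIMD16 needs 16 * 4 * 4 bytes = 8 GRFs and
 * SIMD8 needs half that, matching rlen = 8 and rlen = 4.
 */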
void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   const bool has_payload = inst->src[0].file != BAD_FILE;
   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
   assert(inst->size_written % REG_SIZE == 0);

   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   brw_pixel_interpolator_query(p,
                                retype(dst, BRW_REGISTER_TYPE_UW),
                                /* If we don't have a payload, what we send doesn't matter */
                                has_payload ? src : brw_vec8_grf(0, 0),
                                inst->pi_noperspective,
                                prog_data->per_coarse_pixel_dispatch,
                                msg_type,
                                msg_data,
                                has_payload ? 2 * inst->exec_size / 8 : 1,
                                inst->size_written / REG_SIZE);
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   const struct brw_reg reg = stride(src1, 1, 4, 0);
   const unsigned lower_size = MIN2(inst->exec_size,
                                    devinfo->ver >= 8 ? 16 : 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
                                             (i * lower_size / (1 << src0.width))) *
                                            type_sz(src0.type) / REG_SIZE),
                               suboffset(reg, i * lower_size / 4));
      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
      brw_inst_set_compression(devinfo, insn, lower_size > 8);
      brw_set_default_swsb(p, tgl_swsb_null());
   }
}

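/* An illustrative reading of the <1;4,0> region above: width = 4 with
 * hstride = 0 makes four consecutive channels read the same element of src1,
 * and vstride = 1 advances by one element for the next group of four, so
 * each 2x2 subspan picks up its own sample id.
 */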
void
fs_generator::generate_pack_half_2x16_split(fs_inst *,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(devinfo->ver >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}

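/* In GLSL terms the sequence above computes, per channel (an equivalence
 * sketch, assuming f32tof16-style semantics for F32TO16):
 *
 *    dst = (f32tof16(y) << 16) | f32tof16(x);
 *
 * which matches packHalf2x16(vec2(x, y)): x in the low word, y in the high
 * word.  Writing y first lets the SHL both position the high half and clear
 * the low word before x lands in it.
 */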
void
fs_generator::generate_shader_time_add(fs_inst *,
                                       struct brw_reg payload,
                                       struct brw_reg offset,
                                       struct brw_reg value)
{
   const tgl_swsb swsb = brw_get_default_swsb(p);

   assert(devinfo->ver >= 7);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, true);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
                                          offset.type);
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
                                         value.type);

   assert(offset.file == BRW_IMMEDIATE_VALUE);
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
      value.width = BRW_WIDTH_1;
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
      value.vstride = BRW_VERTICAL_STRIDE_0;
   } else {
      assert(value.file == BRW_IMMEDIATE_VALUE);
   }

   /* Trying to deal with setup of the params from the IR is crazy in the FS8
    * case, and we don't really care about squeezing every bit of performance
    * out of this path, so we just emit the MOVs from here.
    */
   brw_MOV(p, payload_offset, offset);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p, payload_value, value);
   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   brw_shader_time_add(p, payload,
                       prog_data->binding_table.shader_time_start);
   brw_pop_insn_state(p);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                            struct shader_stats shader_stats,
                            const brw::performance &perf,
                            struct brw_compile_stats *stats)
{
   /* align to 64 byte boundary. */
   brw_realign(p, 64);

   this->dispatch_width = dispatch_width;

   int start_offset = p->next_insn_offset;

   /* `send_count` explicitly does not include spills or fills, as we'd
    * like to use it as a metric for intentional memory access or other
    * shared function use.  Otherwise, subtle changes to scheduling or
    * register allocation could cause it to fluctuate wildly - and that
    * effect is already counted in spill/fill counts.
    */
   int spill_count = 0, fill_count = 0;
   int loop_count = 0, send_count = 0, nop_count = 0;
   bool is_accum_used = false;

   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF)
         continue;

      struct brw_reg src[4], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;
      tgl_swsb swsb = inst->sched;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->ver >= 8 &&
          devinfo->ver <= 9 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;

         /* In order to avoid spurious instruction count differences when the
          * instruction schedule changes, keep track of the number of inserted
          * NOPs.
          */
         nop_count++;
      }

      /* Wa_14010017096:
       *
       * Clear accumulator register before end of thread.
       */
      if (inst->eot && is_accum_used && devinfo->ver >= 12) {
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
         brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
         last_insn_offset = p->next_insn_offset;
         swsb = tgl_swsb_dst_dep(swsb, 1);
      }

      if (!is_accum_used && !inst->eot) {
         is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
                         inst->dst.is_accumulator();
      }

      /* Wa_14013745556:
       *
       * Always use @1 SWSB for EOT.
       */
      if (inst->eot && devinfo->ver >= 12) {
         if (tgl_swsb_src_dep(swsb).mode) {
            brw_set_default_exec_size(p, BRW_EXECUTE_1);
            brw_set_default_mask_control(p, BRW_MASK_DISABLE);
            brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
            brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
            brw_SYNC(p, TGL_SYNC_NOP);
            last_insn_offset = p->next_insn_offset;
         }

         swsb = tgl_swsb_dst_dep(swsb, 1);
      }

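      /* Illustratively (a sketch, not actual generated text): if the EOT
       * send still had a pending source dependency, the sequence becomes
       *
       *    sync nop   {<pending dep>}
       *    send ...   {EOT, @1}
       *
       * i.e. any source dependency is resolved by a SYNC in front, and the
       * EOT instruction itself only ever carries a RegDist(1) annotation,
       * per the workaround above.
       */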
      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(devinfo, inst,
                                      &inst->src[i], compressed);
         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(devinfo, inst,
                                &inst->dst, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      /* On gfx7 and above, hardware automatically adds the group onto the
       * flag subregister number.  On Sandy Bridge and older, we have to do it
       * ourselves.
       */
      const unsigned flag_subreg = inst->flag_subreg +
         (devinfo->ver >= 7 ? 0 : inst->group / 16);
      brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
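      /* A worked example of the pre-gfx7 adjustment (illustrative): a SIMD16
       * instruction split with group == 16 and flag_subreg == 0 yields
       * flag_subreg 1, which the line above maps to flag register 0,
       * subregister 1, i.e. f0.1 for the second half of the dispatch.
       */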
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);
      brw_set_default_swsb(p, swsb);

      unsigned exec_size = inst->exec_size;
      if (devinfo->verx10 == 70 &&
          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
         exec_size *= 2;
      }

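      /* For example (an illustrative reading of the doubling above): on
       * Ivybridge (verx10 == 70) a SIMD4 double-precision MOV is encoded
       * with exec_size 8, since that hardware counts execution channels in
       * 32-bit units for 64-bit types.
       */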
      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         brw_SYNC(p, tgl_sync_function(src[0].ud));
         break;
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_AVG:
         brw_AVG(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->ver >= 6);
         if (devinfo->ver < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_MAD(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->ver >= 6 && devinfo->ver <= 10);
         if (devinfo->ver < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_ROL:
         assert(devinfo->ver >= 11);
         assert(src[0].type == dst.type);
         brw_ROL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_ROR:
         assert(devinfo->ver >= 11);
         assert(src[0].type == dst.type);
         brw_ROR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->ver >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->ver >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_CMPN:
         if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_CSEL:
         assert(devinfo->ver >= 8);
         if (devinfo->ver < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_CSEL(p, dst, src[0], src[1], src[2]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->ver >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->ver >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->ver >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->ver >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->ver >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->ver >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->ver >= 7);
         if (devinfo->ver < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->ver >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->ver >= 7);
         if (devinfo->ver < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gfx6) */
            assert(devinfo->ver == 6);
            gfx6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, brw_get_default_exec_size(p));
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, brw_get_default_exec_size(p));
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->ver >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->ver >= 7 || inst->exec_size == 8);
            gfx6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
         } else {
            assert(inst->mlen >= 1);
            assert(devinfo->ver == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gfx4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
            send_count++;
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(devinfo->verx10 < 125);
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->ver >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gfx4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
            send_count++;
         }
         break;
      case FS_OPCODE_LINTERP:
         multiple_instructions_emitted = generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         assert(src[1].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         if (src[1].file == BRW_IMMEDIATE_VALUE) {
            assert(src[1].ud == 0);
            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         } else {
            /* Coarse pixel case */
            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
         }
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         assert(src[1].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         if (src[1].file == BRW_IMMEDIATE_VALUE) {
            assert(src[1].ud == 0);
            brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         } else {
            /* Coarse pixel case */
            brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
         }
         break;

      case SHADER_OPCODE_SEND:
         generate_send(inst, dst, src[0], src[1], src[2],
                       inst->ex_mlen > 0 ? src[3] : brw_null_reg());
         if ((inst->desc & 0xff) == BRW_BTI_STATELESS ||
             (inst->desc & 0xff) == GFX8_BTI_STATELESS_NON_COHERENT) {
            if (inst->size_written)
               fill_count++;
            else
               spill_count++;
         } else {
            send_count++;
         }
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         send_count++;
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_SAMPLEINFO:
         assert(inst->src[0].file == BAD_FILE);
         generate_tex(inst, dst, src[1], src[2]);
         send_count++;
         break;

      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         spill_count++;
         break;

      case SHADER_OPCODE_GFX4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_GFX7_SCRATCH_READ:
         generate_scratch_read_gfx7(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_SCRATCH_HEADER:
         generate_scratch_header(inst, dst);
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_MOV_RELOC_IMM:
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         send_count++;
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
         generate_urb_write(inst, src[0]);
         send_count++;
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         send_count++;
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]);
         send_count++;
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
         generate_varying_pull_constant_load_gfx4(inst, dst, src[0]);
         send_count++;
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         send_count++;
         break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         send_count++;
         break;

      case BRW_OPCODE_HALT:
         generate_halt(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_INTERLOCK:
      case SHADER_OPCODE_MEMORY_FENCE: {
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[2].file == BRW_IMMEDIATE_VALUE);

         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;

         brw_memory_fence(p, dst, src[0], send_op,
                          brw_message_target(inst->sfid),
                          /* commit_enable */ src[1].ud,
                          /* bti */ src[2].ud);
         send_count++;
         break;
      }

      case FS_OPCODE_SCHEDULING_FENCE:
         if (inst->sources == 0 && swsb.regdist == 0 &&
             swsb.mode == TGL_SBID_NULL) {
            if (unlikely(debug_flag))
               disasm_info->use_tail = true;
            break;
         }

         if (devinfo->ver >= 12) {
            /* Use the available SWSB information to stall.  A single SYNC is
             * sufficient since if there were multiple dependencies, the
             * scoreboard algorithm already injected other SYNCs before this
             * instruction.
             */
            brw_SYNC(p, TGL_SYNC_NOP);
         } else {
            for (unsigned i = 0; i < inst->sources; i++) {
               /* Emit a MOV to force a stall until the instruction producing the
                * registers finishes.
                */
               brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                       retype(src[i], BRW_REGISTER_TYPE_UW));
            }

            if (inst->sources > 1)
               multiple_instructions_emitted = true;
         }

         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }
      case FS_OPCODE_LOAD_LIVE_CHANNELS: {
         assert(devinfo->ver >= 8);
         assert(inst->force_writemask_all && inst->group == 0);
         assert(inst->dst.file == BAD_FILE);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
                           BRW_REGISTER_TYPE_UD),
                 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_SHUFFLE:
         generate_shuffle(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_SEL_EXEC:
         assert(inst->force_writemask_all);
         if (type_sz(dst.type) > 4 && !devinfo->has_64bit_float) {
            brw_set_default_mask_control(p, BRW_MASK_DISABLE);
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 0),
                    subscript(src[1], BRW_REGISTER_TYPE_UD, 0));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 1),
                    subscript(src[1], BRW_REGISTER_TYPE_UD, 1));
            brw_set_default_mask_control(p, BRW_MASK_ENABLE);
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 0),
                    subscript(src[0], BRW_REGISTER_TYPE_UD, 0));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_UD, 1),
                    subscript(src[0], BRW_REGISTER_TYPE_UD, 1));
         } else {
            brw_set_default_mask_control(p, BRW_MASK_DISABLE);
            brw_MOV(p, dst, src[1]);
            brw_set_default_mask_control(p, BRW_MASK_ENABLE);
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, dst, src[0]);
         }
         break;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].type == BRW_REGISTER_TYPE_UD);
         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
         break;

      case SHADER_OPCODE_CLUSTER_BROADCAST: {
         assert(!src[0].negate && !src[0].abs);
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].type == BRW_REGISTER_TYPE_UD);
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         assert(src[2].type == BRW_REGISTER_TYPE_UD);
         const unsigned component = src[1].ud;
         const unsigned cluster_size = src[2].ud;
         unsigned vstride = cluster_size;
         unsigned width = cluster_size;

         /* The maximum exec_size is 32, but the maximum width is only 16. */
         if (inst->exec_size == width) {
            vstride = 0;
            width = 1;
         }

         struct brw_reg strided = stride(suboffset(src[0], component),
                                         vstride, width, 0);
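         /* An illustrative reading of this region (not from the PRM): with
          * exec_size 16 and cluster_size 4, <4;4,0> makes each group of four
          * channels read element `component` of its own cluster; in the
          * exec_size == width case the region degenerates to <0;1,0>, i.e. a
          * single element broadcast to every channel.
          */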
         if (type_sz(src[0].type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* IVB has an issue (which we found empirically) where it reads
             * two address register components per channel for indirectly
             * addressed 64-bit sources.
             *
             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around both of these, we do two integer MOVs instead of
             * one 64-bit MOV.  Because no double value should ever cross a
             * register boundary, it's safe to use the immediate offset in the
             * indirect here to handle adding 4 bytes to the offset and avoid
             * the extra ADD to the register file.
             */
            assert(src[0].type == dst.type);
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(strided, BRW_REGISTER_TYPE_D, 0));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(strided, BRW_REGISTER_TYPE_D, 1));
         } else {
            brw_MOV(p, dst, strided);
         }
         break;
      }

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_HALT_TARGET:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_halt_jumps()) {
            if (unlikely(debug_flag)) {
               disasm_info->use_tail = true;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         send_count++;
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         send_count++;
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         send_count++;
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         send_count++;
         break;

      case SHADER_OPCODE_BARRIER:
         generate_barrier(inst, src[0]);
         send_count++;
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      case SHADER_OPCODE_RND_MODE: {
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         /* Changes the floating-point rounding mode by updating the control
          * register field defined at bits cr0.0[5:6].
          */
         enum brw_rnd_mode mode =
            (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
         brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
      }
         break;

      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         brw_float_controls_mode(p, src[0].d, src[1].d);
         break;

      case SHADER_OPCODE_GET_DSS_ID:
         /* The Slice, Dual-SubSlice, SubSlice, EU, and Thread IDs are all
          * stored in sr0.0.  Normally, for reading from HW regs, we'd just do
          * this in the IR and let the back-end generate some code but these
          * live in the state register which tends to have special rules.
          *
          * For convenience, we combine Slice ID and Dual-SubSlice ID into a
          * single ID.
          */
         if (devinfo->ver == 12) {
            /* There is a SWSB restriction that requires that any time sr0 is
             * accessed both the instruction doing the access and the next one
             * have SWSB set to RegDist(1).
             */
            if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
               brw_SYNC(p, TGL_SYNC_NOP);
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_SHR(p, dst, brw_sr0_reg(0), brw_imm_ud(9));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, dst, dst, brw_imm_ud(0x1f));
         } else {
            /* These move around basically every hardware generation, so don't
             * do any >= checks and fail if the platform hasn't explicitly
             * been enabled here.
             */
            unreachable("Unsupported platform");
         }
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         if (devinfo->ver < 12) {
            brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
            brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
         }
      }
   }

   brw_set_uip_jip(p, start_offset);

   /* end of program sentinel */
   disasm_new_inst_group(disasm_info, p->next_insn_offset);

#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                start_offset,
                                p->next_insn_offset,
                                disasm_info);

   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, disasm_info);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      unsigned char sha1[21];
      char sha1buf[41];

      _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
                         after_size, sha1);
      _mesa_sha1_format(sha1buf, sha1);

      fprintf(stderr, "Native code for %s (sha1 %s)\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. "
              "%d:%d spills:fills, %u sends, "
              "scheduled with mode %s. "
              "Promoted %u constants. "
              "Compacted %d to %d bytes (%.0f%%)\n",
              shader_name, sha1buf,
              dispatch_width, before_size / 16,
              loop_count, perf.latency,
              spill_count, fill_count, send_count,
              shader_stats.scheduler_mode,
              shader_stats.promoted_constants,
              before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      /* overriding the shader makes disasm_info invalid */
      if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
         dump_assembly(p->store, start_offset, p->next_insn_offset,
                       disasm_info, perf.block_latency);
      } else {
         fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
      }
   }
   ralloc_free(disasm_info);
#ifndef NDEBUG
   if (!validated && !debug_flag) {
      fprintf(stderr,
              "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
   }
#endif
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, %u sends, "
                              "scheduled with mode %s, "
                              "Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16 - nop_count,
                              loop_count, perf.latency,
                              spill_count, fill_count, send_count,
                              shader_stats.scheduler_mode,
                              shader_stats.promoted_constants,
                              before_size, after_size);
   if (stats) {
      stats->dispatch_width = dispatch_width;
      stats->instructions = before_size / 16 - nop_count;
      stats->sends = send_count;
      stats->loops = loop_count;
      stats->cycles = perf.latency;
      stats->spills = spill_count;
      stats->fills = fill_count;
   }

   return start_offset;
}

void
fs_generator::add_const_data(void *data, unsigned size)
{
   assert(prog_data->const_data_size == 0);
   if (size > 0) {
      prog_data->const_data_size = size;
      prog_data->const_data_offset = brw_append_data(p, data, size, 32);
   }
}

void
fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
{
   assert(brw_shader_stage_is_bindless(stage));
   struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
   if (num_resume_shaders > 0) {
      bs_prog_data->resume_sbt_offset =
         brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
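      /* Illustratively (a sketch of the relocation mechanism, inferred from
       * the call below rather than documented here): entry i of the SBT
       * lives at resume_sbt_offset + i * sizeof(uint64_t), and we record a
       * 32-bit relocation there whose value is the resume shader's offset
       * (sbt[i]), to be resolved against the shader start address when the
       * program is uploaded.
       */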
      for (unsigned i = 0; i < num_resume_shaders; i++) {
         size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
         assert(offset <= UINT32_MAX);
         brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
                       BRW_SHADER_RELOC_TYPE_U32,
                       (uint32_t)offset, (uint32_t)sbt[i]);
      }
   }
}

const unsigned *
fs_generator::get_assembly()
{
   prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);

   return brw_get_program(p, &prog_data->program_size);
}