GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_eu_emit.c

/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <[email protected]>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gfx6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      assert(devinfo->ver < 12);
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
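
/* Illustrative usage (hypothetical caller, not part of the original file):
 * a generator about to emit a SEND whose payload currently lives in a GRF
 * would resolve the implied move first, so the SEND reads from m1:
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gfx6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to brw_message_reg(1)
 */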

static void
gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GFX7_MRF_HACK_START;
   }
}
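
/* For example (assuming GFX7_MRF_HACK_START is 112, as the R112-R127 quote
 * above suggests): a fake MRF m4 would be rewritten to g116 by the function
 * above.
 */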

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   if (devinfo->ver < 12)
      brw_inst_set_src1_file_type(devinfo, inst,
                                  BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->ver >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
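
/* Sketch of a typical call (hypothetical values): a SEND with a two-register
 * payload, a one-register response, and a header, with no extended
 * descriptor bits:
 *
 *    brw_set_desc(p, inst, brw_message_desc(devinfo, 2, 1, true));
 *
 * where brw_set_desc() is assumed to be the ex_desc == 0 convenience wrapper
 * around brw_set_desc_ex() used throughout this file.
 */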

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}
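
/* The inferred lengths above follow the operand and result counts of each
 * function: POW and the INT_DIV variants consume two source operands, hence
 * msg_length = 2, while SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER produce
 * two results, hence response_length = 2.
 */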


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->ver < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gfx7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
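
/* Worked example for the block_size encoding above: num_regs = 4 encodes as
 * util_logbase2(4) = 2 on Gfx8+, but as num_regs - 1 = 3 on Gfx7, matching
 * the different scratch message descriptor layouts of the two generations.
 */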

static void
brw_inst_set_state(const struct intel_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

static brw_inst *
brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
{
   assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
   assert(util_is_power_of_two_or_zero(align));
   const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0. We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(brw_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(brw_inst);

   return &p->store[start_insn];
}
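
/* Growth note for the store above: store_size is compared against an
 * instruction count but grown to util_next_power_of_two(count *
 * sizeof(brw_inst)) elements, e.g. 65 instructions yields
 * util_next_power_of_two(65 * 16) = 2048 slots, so the buffer is resized
 * far ahead of demand and most appends touch no allocator at all.
 */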

void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}

int
brw_append_data(struct brw_codegen *p, void *data,
                unsigned size, unsigned align)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
   void *dst = brw_append_insns(p, nr_insn, align);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(brw_inst))
      memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);

   return dst - (void *)p->store;
}
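
/* Illustrative use (hypothetical values): appending a 24-byte constant table
 * with 32-byte alignment rounds up to two 16-byte instruction slots,
 * zero-fills the 8-byte tail, and returns the table's byte offset into
 * p->store:
 *
 *    int offset = brw_append_data(p, table, 24, 32);
 */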

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}

void
brw_add_reloc(struct brw_codegen *p, uint32_t id,
              enum brw_shader_reloc_type type,
              uint32_t offset, uint32_t delta)
{
   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct brw_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
      .id = id,
      .type = type,
      .offset = offset,
      .delta = delta,
   };
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
763
static enum gfx10_align1_3src_vertical_stride
764
to_3src_align1_vstride(const struct intel_device_info *devinfo,
765
enum brw_vertical_stride vstride)
766
{
767
switch (vstride) {
768
case BRW_VERTICAL_STRIDE_0:
769
return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
770
case BRW_VERTICAL_STRIDE_1:
771
assert(devinfo->ver >= 12);
772
return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
773
case BRW_VERTICAL_STRIDE_2:
774
assert(devinfo->ver < 12);
775
return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
776
case BRW_VERTICAL_STRIDE_4:
777
return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
778
case BRW_VERTICAL_STRIDE_8:
779
case BRW_VERTICAL_STRIDE_16:
780
return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
781
default:
782
unreachable("invalid vstride");
783
}
784
}
785
786
787
static enum gfx10_align1_3src_src_horizontal_stride
788
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
789
{
790
switch (hstride) {
791
case BRW_HORIZONTAL_STRIDE_0:
792
return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
793
case BRW_HORIZONTAL_STRIDE_1:
794
return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
795
case BRW_HORIZONTAL_STRIDE_2:
796
return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
797
case BRW_HORIZONTAL_STRIDE_4:
798
return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
799
default:
800
unreachable("invalid hstride");
801
}
802
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          * "Three source instructions can use operands with mixed-mode
          *  precision. When SrcType field is set to :f or :hf it defines
          *  precision for source 0 only, and fields Src1Type and Src2Type
          *  define precision for other source operands:
          *
          *  0b = :f. Single precision Float (32-bit).
          *  1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0)                           \
{                                                                 \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);               \
}

#define ALU2(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1)                           \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
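
/* For reference, ALU2(AND) below expands (roughly) to:
 *
 *    brw_inst *brw_AND(struct brw_codegen *p, struct brw_reg dest,
 *                      struct brw_reg src0, struct brw_reg src1)
 *    {
 *       return brw_alu2(p, BRW_OPCODE_AND, dest, src0, src1);
 *    }
 */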

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->verx10 == 70 &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
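
/* Region example for the IVB/BYT workaround above (illustrative): a packed
 * <8;8,1>:F source becomes <1;2,0>:F, so each 32-bit element is read twice
 * and the conversion only consumes the even channels the hardware actually
 * uses.
 */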

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.  Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->ver >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->ver == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}

void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gfx6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gfx7.
 */
brw_inst *
gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
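
/* The "* 16" above converts an instruction-count delta into a byte offset:
 * these IP-relative ADDs operate on byte addresses, and a full-size EU
 * instruction is 16 bytes.
 */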

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
         /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
         /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
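
/* For reference (summarizing brw_jump_scale() in brw_eu.h): the "br"
 * factor used throughout this file converts instruction-count deltas into
 * the hardware's jump units -- 1 on gfx4 (whole 128-bit instructions),
 * 2 on gfx5 through gfx7 (64-bit chunks, so compacted code can be
 * measured), and 16 on gfx8+ (bytes).
 */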

void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /* A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before referencing the
    * instruction store pointer from an index.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}

brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->ver >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

brw_inst *
brw_HALT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 6) {
      /* From the Gfx4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *    and <src0> locations."
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->ver < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->ver < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
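
/* Typical call sequence (an illustrative sketch, not code from this file):
 *
 *    brw_inst *do_insn = brw_DO(p, BRW_EXECUTE_8);
 *    ... emit the loop body, possibly brw_BREAK(p) / brw_CONT(p) ...
 *    brw_WHILE(p);
 *
 * On gfx6+, brw_DO() emits nothing and merely records the loop top;
 * brw_WHILE() then encodes the backward jump, and brw_set_uip_jip() later
 * fixes up any BREAK/CONT inside the body.
 */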

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}

brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->ver >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->ver < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gfx4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}

/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   if (devinfo->ver >= 5)
      jmpi = 2;

   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
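
/* Worked example (illustrative): on gfx5+, jmpi == 2 because JMPI distances
 * are measured in 64-bit chunks and every full-size instruction is two such
 * chunks.  Landing a jump emitted at index 5 in a program that now has 9
 * instructions therefore sets a jump count of 2 * (9 - 5 - 1) == 6, skipping
 * the three instructions between the JMPI and the landing point.
 */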

/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->ver == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

void brw_CMPN(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned conditional,
              struct brw_reg src0,
              struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
    * says:
    *
    *    If the destination is the null register, the {Switch} instruction
    *    option must be used.
    *
    * Page 77 of the Haswell PRM Volume 2b contains the same text.
    */
   if (devinfo->ver == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void gfx4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->ver < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}

void gfx6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->ver >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->ver == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gfx6. */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
brw_scratch_surface_idx(const struct brw_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->ver >= 8)
      return GFX8_BTI_STATELESS_NON_COHERENT;
   else
      return BRW_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->ver < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gfx6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gfx6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->ver >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->ver >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->ver >= 6)
         msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, send_commit_msg));
   }
}
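
/* Message shape (a sketch of the code above, assuming gfx6+): for a spill of
 * num_regs == 2 GRFs at byte offset 64, the header MOVs build an MRF holding
 * g0 with the oword offset 64 / 16 == 4 in element 2, and the SEND then
 * carries mlen == 1 + 2 == 3 registers (header + payload) with no response
 * expected.
 */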


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   if (devinfo->ver >= 6)
      offset /= 16;

   if (p->devinfo->ver >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->ver >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}

void
gfx7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gfx7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
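
/* Arithmetic check (illustrative): with REG_SIZE == 32, the 12-bit HWord
 * offset can address 4096 * 32 == 128KB of per-thread scratch, so e.g. a
 * read at byte offset 4096 becomes HWord offset 128.
 */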

/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->ver >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}

brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}

brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}

/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (msg_reg_nr != -1)
      gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *    is, send). The hardware behavior is undefined if this instruction is
    *    set as compressed. However, compress control can be set to "SecHalf"
    *    to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}

/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16 bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct intel_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->verx10 >= 75);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->verx10 <= 70) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
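
/* Worked example (illustrative): for sampler index 20, the immediate path
 * adds 16 * (20 / 16) * 16 == 256 bytes to the sampler state pointer, and
 * the dynamic path computes the same value as (20 & 0xf0) << 4 == 256; both
 * select the second group of 16 samplers.
 */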

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->ver));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}

void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->ver >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
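
/* Usage note (summarizing the two branches above): callers pass either an
 * immediate descriptor, which is baked directly into the SEND, or a UD
 * register, in which case the descriptor is ORed into address register a0.0
 * and the SEND reads it from there -- via src1 pre-gfx12, or via the
 * reg32-descriptor select bit on gfx12+.
 */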

void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, the actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}

static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}

static bool
while_jumps_before_offset(const struct intel_device_info *devinfo,
                          brw_inst *insn, int while_offset, int start_offset)
{
   int scale = 16 / brw_jump_scale(devinfo);
   int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
                               : brw_inst_jip(devinfo, insn);
   assert(jip < 0);
   return while_offset + jip * scale <= start_offset;
}
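
/* Worked example (illustrative, gfx7): br == 2, so scale == 8 bytes per JIP
 * unit.  A WHILE at byte offset 80 with jip == -4 targets 80 + (-4) * 8 == 48,
 * so it "jumps before" any start offset of 48 or more.
 */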


static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}

/* There is no DO instruction on gfx6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->ver >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->ver == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
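
/* Worked example (illustrative, gfx7: br == 2, scale == 8): a BREAK at byte
 * offset 48 whose enclosing block ends at 80 and whose loop's WHILE sits at
 * 112 gets jip == (80 - 48) / 8 == 4 and uip == (112 - 48) / 8 == 8; on gfx6
 * the UIP would instead point just past the WHILE, adding 16 bytes first.
 */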

void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gfx6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver == 6);
   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  send_commit_msg)); /* send_commit_msg */
}

static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
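
/* For example (following the code above), a SIMD16 untyped read of 4
 * channels needs 2 * 4 == 8 response GRFs, a SIMD8 read needs 4, and the
 * Align16 SIMD4x2 variant packs its result into a single GRF.
 */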

void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}

void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}

static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->ver >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}

static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid)
{
   const unsigned mlen = 1; /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1;

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
   enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;

   if (sfid == GFX12_SFID_TGM) {
      scope = LSC_FENCE_GPU;
      flush_type = LSC_FLUSH_TYPE_EVICT;
   }

   brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                            flush_type, false) |
                         brw_message_desc(p->devinfo, mlen, rlen, false));
}
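
/* Worked example (illustrative reading of the code above): a fence targeting
 * the typed-surface SFID (GFX12_SFID_TGM) is widened to LSC_FENCE_GPU scope
 * with an LSC_FLUSH_TYPE_EVICT flush, while any other SFID keeps the cheaper
 * threadgroup-scope, no-flush encoding; in both cases mlen = rlen = 1, i.e.
 * a g0 header goes out and a single register comes back purely to signal
 * completion.
 */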

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid);
   else
      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
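
/* Illustrative sketch (hedged; `tmp` is a hypothetical scratch GRF): a
 * caller that needs to stall until the fence completes could emit
 *
 *    brw_memory_fence(p, brw_vec1_grf(tmp, 0), brw_vec8_grf(0, 0),
 *                     BRW_OPCODE_SEND, GFX7_SFID_DATAPORT_DATA_CACHE,
 *                     true, 0);   // commit_enable = true, bti = 0
 *
 * With commit_enable set, the pre-LSC message writes back to `tmp` once the
 * fence has committed, so a subsequent read of `tmp` acts as the stall.
 */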

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             bool coarse_pixel_rate,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const unsigned slot_group = brw_get_default_group(p) / 16;
   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
                            simd_mode, slot_group);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   brw_send_indirect_message(p,
                             GFX7_SFID_PIXEL_INTERPOLATOR,
                             dest,
                             mrf,
                             vec1(data),
                             desc,
                             false);
}
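
/* Worked example (illustrative): for the second half of a SIMD32 shader,
 * brw_get_default_group(p) is 16, so slot_group is 1; at BRW_EXECUTE_16 the
 * simd_mode bit is set, and the resulting descriptor requests a SIMD16
 * interpolation response for the upper slot group.
 */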

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8) {
         /* Getting the first active channel index is easy on Gfx8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as the result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gfx7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gfx7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination first without and then with execution
          * masking in order to find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
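
/* Worked example (illustrative): if only channels 2..7 of a SIMD8 thread
 * are enabled, ce0 reads back 0b11111100 and FBL returns 2, the index of
 * the first live channel. When the dispatch mask is not tightly packed, the
 * explicit AND with the caller-provided mask above is what keeps
 * never-dispatched channels from being reported as live.
 */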

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial case: the source is already uniform or the index is a
       * constant. We will typically not get here if the optimizer is doing
       * its job, but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one; replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
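
/* Worked example (illustrative) for the align1 indirect path above: with a
 * dword source at r70, offset = 70 * REG_SIZE = 2240 bytes, which exceeds
 * the 512-byte addressing-immediate limit; the ADD therefore folds
 * 2240 - 2240 % 512 = 2048 bytes into a0, leaving the remaining 192 bytes
 * to be carried in the indirect immediate.
 */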

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
 * one u32. So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *  "Implementation Restriction on Register Access: When the control
    *   register is used as an explicit source and/or destination, hardware
    *   does not ensure execution pipeline coherency. Software must set the
    *   thread control field to ‘switch’ for an instruction that uses
    *   control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->ver < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->ver < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   if (p->devinfo->ver >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}
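
/* Illustrative sketch (hedged; assumes the cr0 field definitions from
 * brw_eu_defines.h): to change only the rounding mode one would pass the
 * new rounding bits together with a mask covering that field, e.g.
 *
 *    brw_float_controls_mode(p, BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT,
 *                            BRW_CR0_RND_MODE_MASK);
 *
 * The AND above clears the masked field and the OR installs the new value,
 * each as a scalar read-modify-write of cr0.
 */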

void
brw_update_reloc_imm(const struct intel_device_info *devinfo,
                     brw_inst *inst,
                     uint32_t value)
{
   /* Sanity check that the instruction is a MOV of an immediate */
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite */
   assert(brw_inst_cmpt_control(devinfo, inst) == 0);

   brw_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}
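
/* Illustrative flow (hedged sketch; `id`, `inst` and `actual_value` are
 * hypothetical): the two halves of the relocation mechanism are used
 * together like
 *
 *    // at compile time: emit a patchable MOV and record its offset
 *    brw_MOV_reloc_imm(p, dst, BRW_REGISTER_TYPE_UD, id);
 *
 *    // later, once the real value is known: patch the instruction in place
 *    brw_update_reloc_imm(devinfo, inst, actual_value);
 */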