GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/compiler/brw_fs_visitor.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR. The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
#include "brw_fs.h"
#include "compiler/glsl_types.h"

using namespace brw;

/* Sample from the MCS surface attached to this multisample texture. */
fs_reg
fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
                           const fs_reg &texture,
                           const fs_reg &texture_handle)
{
   const fs_reg dest = vgrf(glsl_type::uvec4_type);

   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
   srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
   srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
   srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);

   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                            ARRAY_SIZE(srcs));

   /* We only care about one or two regs of response, but the sampler always
    * writes 4/8.
    */
   inst->size_written = 4 * dest.component_size(inst->exec_size);

   return dest;
}

/**
 * Apply workarounds for Gfx6 gather with UINT/SINT
 */
void
fs_visitor::emit_gfx6_gather_wa(uint8_t wa, fs_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;

   for (int i = 0; i < 4; i++) {
      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
      /* Convert from UNORM to UINT */
      bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1));
      bld.MOV(dst, dst_f);

      if (wa & WA_SIGN) {
         /* Reinterpret the UINT value as a signed INT value by
          * shifting the sign bit into place, then shifting back
          * preserving sign.
          */
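         /* For illustration, assuming an 8-bit signed result
          * (wa == (WA_8BIT | WA_SIGN)): a gathered value that the MUL/MOV
          * above turned into the integer 255 becomes 0xFF000000 after the
          * SHL by 24 and -1 after the ASR by 24, i.e. the correctly
          * sign-extended SINT value.
          */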
         bld.SHL(dst, dst, brw_imm_d(32 - width));
         bld.ASR(dst, dst, brw_imm_d(32 - width));
      }

      dst = offset(dst, bld, 1);
   }
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   int reg_width = dispatch_width / 8;

   /* Everyone's favorite color. */
   const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
   for (int i = 0; i < 4; i++) {
      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
              brw_imm_f(color[i]));
   }

   fs_inst *write;
   write = bld.emit(FS_OPCODE_FB_WRITE);
   write->eot = true;
   write->last_rt = true;
   if (devinfo->ver >= 6) {
      write->base_mrf = 2;
      write->mlen = 4 * reg_width;
   } else {
      write->header_size = 2;
      write->base_mrf = 0;
      write->mlen = 2 + 4 * reg_width;
   }

   /* Tell the SF we don't have any inputs. Gfx4-5 require at least one
    * varying to avoid GPU hangs, so set that.
    */
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   wm_prog_data->num_varying_inputs = devinfo->ver < 6 ? 1 : 0;
   memset(wm_prog_data->urb_setup, -1,
          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
   brw_compute_urb_setup_index(wm_prog_data);

   /* We don't have any uniforms. */
   stage_prog_data->nr_params = 0;
   stage_prog_data->nr_pull_params = 0;
   stage_prog_data->curb_read_length = 0;
   stage_prog_data->dispatch_grf_start_reg = 2;
   wm_prog_data->dispatch_grf_start_reg_16 = 2;
   wm_prog_data->dispatch_grf_start_reg_32 = 2;
   grf_used = 1; /* Gfx4-5 don't allow zero GRF blocks */

   calculate_cfg();
}

/* The register location here is relative to the start of the URB
 * data. It will get adjusted to be a real location before
 * generate_code() time.
 */
fs_reg
fs_visitor::interp_reg(int location, int channel)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   int regnr = prog_data->urb_setup[location] * 4 + channel;
   assert(prog_data->urb_setup[location] != -1);

   return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   fs_builder abld = bld.annotate("compute pixel centers");
   this->pixel_x = vgrf(glsl_type::uint_type);
   this->pixel_y = vgrf(glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
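   /* Roughly: g1 carries the subspan screen coordinates, and brw_imm_v()
    * below packs eight signed 4-bit immediates, so 0x10101010 adds the
    * per-channel X offsets {0,1,0,1,...} and 0x11001100 the Y offsets
    * {0,0,1,1,...}, expanding each 2x2 subspan origin into the four pixel
    * coordinates it covers (see the longer explanation of the same trick in
    * emit_interpolation_setup_gfx6()).
    */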
   abld.ADD(this->pixel_x,
            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            fs_reg(brw_imm_v(0x10101010)));
   abld.ADD(this->pixel_y,
            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            fs_reg(brw_imm_v(0x11001100)));

   abld = bld.annotate("compute pixel deltas from v0");

   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
      vgrf(glsl_type::vec2_type);
   const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));

   if (devinfo->has_pln) {
      for (unsigned i = 0; i < dispatch_width / 8; i++) {
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
                             quarter(this->pixel_x, i), xstart);
         abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
                             quarter(this->pixel_y, i), ystart);
      }
   } else {
      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
   }

   this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);

   /* The SF program automatically handles doing the perspective correction or
    * not based on wm_prog_data::interp_mode[] so we can use the same pixel
    * offsets for both perspective and non-perspective.
    */
   this->delta_xy[BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
      this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];

   abld = bld.annotate("compute pos.w and 1/pos.w");
   /* Compute wpos.w. It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = vgrf(glsl_type::float_type);
   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
             component(interp_reg(VARYING_SLOT_POS, 3), 0));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = vgrf(glsl_type::float_type);
   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
}

static unsigned
brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
{
   unsigned brw_mode = 0;
   *mask = 0;
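
   /* For illustration: a mode of FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
    * FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 yields
    * brw_mode = BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT and
    * *mask = BRW_CR0_RND_MODE_MASK | BRW_CR0_FP16_DENORM_PRESERVE; the mask
    * records every cr0 field that has to be programmed, even where the
    * corresponding brw_mode bits stay zero (the flush-to-zero cases).
    */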
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       mode) {
      brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
      *mask |= BRW_CR0_RND_MODE_MASK;
   }
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       mode) {
      brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
      *mask |= BRW_CR0_RND_MODE_MASK;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
      brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
      *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
      brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
      *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
      brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
      *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
      *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
      *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
      *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
   if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
      *mask |= BRW_CR0_FP_MODE_MASK;

   if (*mask != 0)
      assert((*mask & brw_mode) == brw_mode);

   return brw_mode;
}

void
fs_visitor::emit_shader_float_controls_execution_mode()
{
   unsigned execution_mode = this->nir->info.float_controls_execution_mode;
   if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
      return;

   fs_builder abld = bld.annotate("shader floats control execution mode");
   unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);

   if (mask == 0)
      return;

   abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
             brw_imm_d(mode), brw_imm_d(mask));
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gfx6()
{
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_type::float_type);
   this->pixel_y = vgrf(glsl_type::float_type);

   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);

   fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
   fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
   fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
   if (!wm_prog_data->per_coarse_pixel_dispatch) {
      /* The thread payload only delivers subspan locations (ss0, ss1,
       * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
       * generate 4 pixel coordinates out of each subspan location. We do this
       * by replicating a subspan coordinate 4 times and adding an offset of 1
       * in each direction from the initial top left (tl) location to generate
       * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
       * (br = +1 in x, +1 in y).
       *
       * The locations we build look like this in SIMD8:
       *
       *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
       *
       * The value 0x11001010 is a vector of eight half bytes (nibbles). It
       * adds the following to generate the 4 pixel coordinates out of
       * subspan 0:
       *
       *  0x
       *    1 : ss0.y + 1 -> ss0.br.y
       *    1 : ss0.y + 1 -> ss0.bl.y
       *    0 : ss0.y + 0 -> ss0.tr.y
       *    0 : ss0.y + 0 -> ss0.tl.y
       *    1 : ss0.x + 1 -> ss0.br.x
       *    0 : ss0.x + 0 -> ss0.bl.x
       *    1 : ss0.x + 1 -> ss0.tr.x
       *    0 : ss0.x + 0 -> ss0.tl.x
       *
       * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
       * coordinates out of 2 subspan coordinates in a single ADD instruction
       * (twice the operation above).
       */
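      /* For illustration: if the payload delivers subspan 0 at
       * (x, y) = (4, 10), an ADD with this offset vector produces the pixel
       * coordinates (4, 10), (5, 10), (4, 11) and (5, 11), i.e. the four
       * pixels of that 2x2 block in tl, tr, bl, br order.
       */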
      int_pixel_offset_xy = fs_reg(brw_imm_v(0x11001010));
      half_int_pixel_offset_x = fs_reg(brw_imm_uw(0));
      half_int_pixel_offset_y = fs_reg(brw_imm_uw(0));
      /* On Gfx12.5, because of regioning restrictions, the interpolation code
       * is slightly different and works off X & Y only inputs. The ordering
       * of the half bytes here is a bit odd, with each subspan replicated
       * twice and every other element discarded:
       *
       *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
       * X offset:     0      0      1      0      0      0      1      0
       * Y offset:     0      0      0      0      1      0      1      0
       */
      int_pixel_offset_x = fs_reg(brw_imm_v(0x01000100));
      int_pixel_offset_y = fs_reg(brw_imm_v(0x01010000));
   } else {
      /* In coarse pixel dispatch we have to do the same ADD instruction that
       * we do in normal per pixel dispatch, except this time we're not adding
       * 1 in each direction, but instead the coarse pixel size.
       *
       * The coarse pixel size is delivered as 2 u8 values in r1.0.
       */
      struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB);

      const fs_builder dbld =
         abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);

      if (devinfo->verx10 >= 125) {
         /* To build the array of half bytes we do an AND operation with the
          * right mask in X.
          */
         int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));

         /* And the right mask in Y. */
         int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
      } else {
         /* To build the array of half bytes we do an AND operation with the
          * right mask in X.
          */
         int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));

         /* And the right mask in Y. */
         int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));

         /* Finally OR the 2 registers. */
         int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         dbld.OR(int_pixel_offset_xy, int_pixel_offset_x, int_pixel_offset_y);
      }

      /* Also compute the half pixel size used to center pixels. */
      half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
      half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);

      bld.SHR(half_int_pixel_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
      bld.SHR(half_int_pixel_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
   }

   for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
      const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
      struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW);

      if (devinfo->verx10 >= 125) {
         const fs_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         const fs_reg int_pixel_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
         const fs_reg int_pixel_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);

         dbld.ADD(int_pixel_x,
                  fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
                  int_pixel_offset_x);
         dbld.ADD(int_pixel_y,
                  fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
                  int_pixel_offset_y);

         if (wm_prog_data->per_coarse_pixel_dispatch) {
            dbld.ADD(int_pixel_x, int_pixel_x,
                     horiz_stride(half_int_pixel_offset_x, 0));
            dbld.ADD(int_pixel_y, int_pixel_y,
                     horiz_stride(half_int_pixel_offset_y, 0));
         }

         hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
         hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));

      } else if (devinfo->ver >= 8 || dispatch_width == 8) {
         /* The "Register Region Restrictions" page says for BDW (and newer,
          * presumably):
          *
          *    "When destination spans two registers, the source may be one or
          *     two registers. The destination elements must be evenly split
          *     between the two registers."
          *
          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
          * to compute our pixel centers.
          */
         const fs_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);

         dbld.ADD(int_pixel_xy,
                  fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
                  int_pixel_offset_xy);

         hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_x, 0));
         hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
                   horiz_stride(half_int_pixel_offset_y, 0));
      } else {
         /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
          *
          *    "When destination spans two registers, the source MUST span
          *     two registers."
          *
          * Since the GRF source of the ADD will only read a single register,
          * we must do two separate ADDs in SIMD16.
          */
         const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW);
         const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW);

         hbld.ADD(int_pixel_x,
                  fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
                  fs_reg(brw_imm_v(0x10101010)));
         hbld.ADD(int_pixel_y,
                  fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
                  fs_reg(brw_imm_v(0x11001100)));

         /* As of gfx6, we can no longer mix float and int sources. We have
          * to turn the integer pixel centers into floats for their actual
          * use.
          */
         hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
         hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
      }
   }

   abld = bld.annotate("compute pos.z");
   if (wm_prog_data->uses_depth_w_coefficients) {
      assert(!wm_prog_data->uses_src_depth);
      /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
       * properly. Just as we have to add the coarse pixel size to the pixel
       * locations, here we recompute the Z value with 2 coefficients along
       * the X & Y axes.
       */
      fs_reg coef_payload = fetch_payload_reg(abld, payload.depth_w_coef_reg, BRW_REGISTER_TYPE_F);
      const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
      const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
      const fs_reg z_cx = brw_vec1_grf(coef_payload.nr, 1);
      const fs_reg z_cy = brw_vec1_grf(coef_payload.nr, 0);
      const fs_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3);

      const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F);
      const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F);

      abld.ADD(float_pixel_x, this->pixel_x, negate(x_start));
      abld.ADD(float_pixel_y, this->pixel_y, negate(y_start));

      /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
      const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
      /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
      const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1);
      const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD);
      const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD);
      abld.MOV(u32_cps_width, u8_cps_width);
      abld.MOV(u32_cps_height, u8_cps_height);

      const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F);
      const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F);
      abld.MOV(f_cps_width, u32_cps_width);
      abld.MOV(f_cps_height, u32_cps_height);

      /* Center in the middle of the coarse pixel. */
      abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
      abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
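
      /* Putting it together (MAD(dst, a, b, c) evaluates a + b * c here):
       * float_pixel_x/y now hold (pixel_x - x_start) + cps_width / 2 and
       * (pixel_y - y_start) + cps_height / 2, so the two MADs below compute
       *
       *    pixel_z = z_c0 + z_cx * float_pixel_x + z_cy * float_pixel_y
       */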
      this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
      abld.MAD(this->pixel_z, z_c0, z_cx, float_pixel_x);
      abld.MAD(this->pixel_z, this->pixel_z, z_cy, float_pixel_y);
   }

   if (wm_prog_data->uses_src_depth) {
      assert(!wm_prog_data->uses_depth_w_coefficients);
      this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
   }

   if (wm_prog_data->uses_src_w) {
      abld = bld.annotate("compute pos.w");
      this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
      this->wpos_w = vgrf(glsl_type::float_type);
      abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
   }

   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
      this->delta_xy[i] = fetch_barycentric_reg(
         bld, payload.barycentric_coord_reg[i]);
   }

   uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
      (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
       1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);

   if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
      /* Get the pixel/sample mask into f0 so that we know which
       * pixels are lit. Then, for each channel that is unlit,
       * replace the centroid data with non-centroid data.
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW),
                 retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW));
      }

      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(centroid_modes & (1 << i)))
            continue;

         const fs_reg centroid_delta_xy = delta_xy[i];
         const fs_reg &pixel_delta_xy = delta_xy[i - 1];

         delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2);

         for (unsigned c = 0; c < 2; c++) {
            for (unsigned q = 0; q < dispatch_width / 8; q++) {
               set_predicate(BRW_PREDICATE_NORMAL,
                             bld.quarter(q).SEL(
                                quarter(offset(delta_xy[i], bld, c), q),
                                quarter(offset(centroid_delta_xy, bld, c), q),
                                quarter(offset(pixel_delta_xy, bld, c), q)));
            }
         }
      }
   }
}

static enum brw_conditional_mod
cond_for_alpha_func(GLenum func)
{
   switch (func) {
   case GL_GREATER:
      return BRW_CONDITIONAL_G;
   case GL_GEQUAL:
      return BRW_CONDITIONAL_GE;
   case GL_LESS:
      return BRW_CONDITIONAL_L;
   case GL_LEQUAL:
      return BRW_CONDITIONAL_LE;
   case GL_EQUAL:
      return BRW_CONDITIONAL_EQ;
   case GL_NOTEQUAL:
      return BRW_CONDITIONAL_NEQ;
   default:
      unreachable("Not reached");
   }
}

/**
 * Alpha test support for when we compile it into the shader instead
 * of using the normal fixed-function alpha test.
 */
void
fs_visitor::emit_alpha_test()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   const fs_builder abld = bld.annotate("Alpha test");

   fs_inst *cmp;
   if (key->alpha_test_func == GL_ALWAYS)
      return;

   if (key->alpha_test_func == GL_NEVER) {
      /* f0.1 = 0 */
      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                      BRW_REGISTER_TYPE_UW));
      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
                     BRW_CONDITIONAL_NEQ);
   } else {
      /* RT0 alpha */
      fs_reg color = offset(outputs[0], bld, 3);

      /* f0.1 &= func(color, ref) */
      cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref),
                     cond_for_alpha_func(key->alpha_test_func));
   }
   cmp->predicate = BRW_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;
}

fs_inst *
fs_visitor::emit_single_fb_write(const fs_builder &bld,
                                 fs_reg color0, fs_reg color1,
                                 fs_reg src0_alpha, unsigned components)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   /* Hand over gl_FragDepth or the payload depth. */
   const fs_reg dst_depth = fetch_payload_reg(bld, payload.dest_depth_reg);
   fs_reg src_depth, src_stencil;

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      src_depth = frag_depth;
   } else if (source_depth_to_render_target) {
      /* If we got here, we're in one of those strange Gen4-5 cases where
       * we're forced to pass the source depth, unmodified, to the FB write.
       * In this case, we don't want to use pixel_z because we may not have
       * set up interpolation. It's also perfectly safe because it only
       * happens on old hardware (no coarse interpolation) and this is
       * explicitly the pass-through case.
       */
      assert(devinfo->ver <= 5);
      src_depth = fetch_payload_reg(bld, payload.source_depth_reg);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
      src_stencil = frag_stencil;

   const fs_reg sources[] = {
      color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
      (prog_data->uses_omask ? sample_mask : fs_reg()),
      brw_imm_ud(components)
   };
   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
                             sources, ARRAY_SIZE(sources));

   if (prog_data->uses_kill) {
      write->predicate = BRW_PREDICATE_NORMAL;
      write->flag_subreg = sample_mask_flag_subreg(this);
   }

   return write;
}

void
fs_visitor::emit_fb_writes()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   fs_inst *inst = NULL;

   if (source_depth_to_render_target && devinfo->ver == 6) {
      /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This
       * would require SIMD8 moves of each half to message regs, e.g. by using
       * the SIMD lowering pass. Unfortunately this is more difficult than it
       * sounds because the SIMD8 single-source message lacks channel selects
       * for the second and third subspans.
       */
      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      /* From the 'Render Target Write message' section of the docs:
       * "Output Stencil is not supported with SIMD16 Render Target Write
       *  Messages."
       */
      limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
                           "in SIMD16+ mode.\n");
   }

   /* ANV doesn't know about the sample mask output during wm key creation,
    * so we compute here whether we need to replicate alpha and emit the
    * alpha-to-coverage workaround.
    */
   const bool replicate_alpha = key->alpha_test_replicate_alpha ||
      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
       (sample_mask.file == BAD_FILE || devinfo->ver == 6));

   for (int target = 0; target < key->nr_color_regions; target++) {
      /* Skip over outputs that weren't written. */
      if (this->outputs[target].file == BAD_FILE)
         continue;

      const fs_builder abld = bld.annotate(
         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));

      fs_reg src0_alpha;
      if (devinfo->ver >= 6 && replicate_alpha && target != 0)
         src0_alpha = offset(outputs[0], bld, 3);

      inst = emit_single_fb_write(abld, this->outputs[target],
                                  this->dual_src_output, src0_alpha, 4);
      inst->target = target;
   }

   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
                                this->outputs[0].file != BAD_FILE);
   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);

   if (inst == NULL) {
      /* Even if there are no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      /* FINISHME: Factor out this frequently recurring pattern into a
       * helper function.
       */
      const fs_reg srcs[] = { reg_undef, reg_undef,
                              reg_undef, offset(this->outputs[0], bld, 3) };
      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);

      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
      inst->target = 0;
   }

   inst->last_rt = true;
   inst->eot = true;

   if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
       prog_data->dual_src_blend) {
      /* The dual-source RT write messages fail to release the thread
       * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
       *
       * XXX - Emit an extra single-source NULL RT-write marked LastRT in
       *       order to release the thread dependency without disabling
       *       SIMD32.
       *
       * The dual-source RT write messages may lead to hangs with SIMD16
       * dispatch on ICL due to some unknown reasons, see
       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
       */
      limit_dispatch_width(8, "Dual source blending unsupported "
                           "in SIMD16 and SIMD32 modes.\n");
   }
}

void
fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
{
   int slot, urb_offset, length;
   int starting_urb_offset = 0;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(this->prog_data);
   const struct brw_vs_prog_key *vs_key =
      (const struct brw_vs_prog_key *) this->key;
   const GLbitfield64 psiz_mask =
      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
   const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
   bool flush;
   fs_reg sources[8];
   fs_reg urb_handle;

   if (stage == MESA_SHADER_TESS_EVAL)
      urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
   else
      urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));

   opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
   int header_size = 1;
   fs_reg per_slot_offsets;

   if (stage == MESA_SHADER_GEOMETRY) {
      const struct brw_gs_prog_data *gs_prog_data =
         brw_gs_prog_data(this->prog_data);

      /* We need to increment the Global Offset to skip over the control data
       * header and the extra "Vertex Count" field (1 HWord) at the beginning
       * of the VUE. We're counting in OWords, so the units are doubled.
       */
      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
      if (gs_prog_data->static_vertex_count == -1)
         starting_urb_offset += 2;

      /* We also need to use per-slot offsets. The per-slot offset is the
       * Vertex Count. SIMD8 mode processes 8 different primitives at a
       * time; each may output a different number of vertices.
       */
      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
      header_size++;

      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
      const int output_vertex_size_owords =
         gs_prog_data->output_vertex_size_hwords * 2;
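
      /* For example: with an output vertex size of 2 HWords (4 OWords) and a
       * per-channel vertex count of 3, the per-slot offset computed below is
       * 12 OWords, i.e. the write lands at the start of that channel's
       * fourth output vertex.
       */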
      if (gs_vertex_count.file == IMM) {
         per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
                                       gs_vertex_count.ud);
      } else {
         per_slot_offsets = vgrf(glsl_type::uint_type);
         bld.MUL(per_slot_offsets, gs_vertex_count,
                 brw_imm_ud(output_vertex_size_owords));
      }
   }

   length = 0;
   urb_offset = starting_urb_offset;
   flush = false;

   /* SSO shaders can have VUE slots allocated which are never actually
    * written to, so ignore them when looking for the last (written) slot.
    */
   int last_slot = vue_map->num_slots - 1;
   while (last_slot > 0 &&
          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
      last_slot--;
   }

   bool urb_written = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ: {
         /* The point size varying slot is in the VUE header and is always in
          * the VUE map. But often none of the special varyings that live
          * there are written and in that case we can skip writing to the VUE
          * header, provided the corresponding state properly clamps the
          * values further down the pipeline. */
         if ((vue_map->slots_valid & psiz_mask) == 0) {
            assert(length == 0);
            urb_offset++;
            break;
         }

         fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
         bld.MOV(zero, brw_imm_ud(0u));

         sources[length++] = zero;
         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;
      }
      case BRW_VARYING_SLOT_NDC:
      case VARYING_SLOT_EDGE:
         unreachable("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader. Other varyings (clip distances) get added to the vue
          * map but don't always get written. In those cases, the
          * corresponding this->outputs[] slot will be invalid and we can skip
          * the urb write for the varying. If we've already queued up a vue
          * slot for writing we flush a mlen 5 urb write, otherwise we just
          * advance the urb_offset.
          */
         if (varying == BRW_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
             (varying == VARYING_SLOT_COL0 ||
              varying == VARYING_SLOT_COL1 ||
              varying == VARYING_SLOT_BFC0 ||
              varying == VARYING_SLOT_BFC1)) {
            /* We need to clamp these guys, so do a saturating MOV into a
             * temp register and use that for the payload.
             */
            for (int i = 0; i < 4; i++) {
               fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
               fs_reg src = offset(this->outputs[varying], bld, i);
               set_saturate(true, bld.MOV(reg, src));
               sources[length++] = reg;
            }
         } else {
            int slot_offset = 0;

            /* When using Primitive Replication, there may be multiple slots
             * assigned to POS.
             */
            if (varying == VARYING_SLOT_POS)
               slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];

            for (unsigned i = 0; i < 4; i++) {
               sources[length++] = offset(this->outputs[varying], bld,
                                          i + (slot_offset * 4));
            }
         }
         break;
      }

      const fs_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
       * the last slot or if we need to flush (see BAD_FILE varying case
       * above), emit a URB write send now to flush out the data.
       */
      if (length == 8 || (length > 0 && slot == last_slot))
         flush = true;
      if (flush) {
         fs_reg *payload_sources =
            ralloc_array(mem_ctx, fs_reg, length + header_size);
         fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
                                 BRW_REGISTER_TYPE_F);
         payload_sources[0] = urb_handle;

         if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
            payload_sources[1] = per_slot_offsets;

         memcpy(&payload_sources[header_size], sources,
                length * sizeof sources[0]);

         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
                           header_size);

         fs_inst *inst = abld.emit(opcode, reg_undef, payload);

         /* For ICL WA 1805992985 one needs an additional write at the end. */
         if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
            inst->eot = false;
         else
            inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;

         inst->mlen = length + header_size;
         inst->offset = urb_offset;
         urb_offset = starting_urb_offset + slot + 1;
         length = 0;
         flush = false;
         urb_written = true;
      }
   }

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader. This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (!urb_written) {
      /* For GS, just turn EmitVertex() into a no-op. We don't want it to
       * end the thread, and emit_gs_thread_end() already emits a SEND with
       * EOT at the end of the program for us.
       */
      if (stage == MESA_SHADER_GEOMETRY)
         return;

      fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
      bld.exec_all().MOV(payload, urb_handle);

      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
      inst->eot = true;
      inst->mlen = 2;
      inst->offset = 1;
      return;
   }

   /* ICL WA 1805992985:
    *
    * ICLLP GPU hangs on one of the tessellation vkcts tests with DS not done.
    * The send cycle, which is a URB write with EOT, must be 4 phases long and
    * all 8 lanes must be valid.
    */
   if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
      fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);

      /* The workaround requires all 8 channels (lanes) to be valid. This is
       * understood to mean they all need to be alive. The first trick is to
       * find a live channel and copy its urb handle for all the other
       * channels to make sure all handles are valid.
       */
      bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));

      /* The second trick is to use a masked URB write where one can tell the
       * HW to actually write data only for selected channels even though all
       * are active.
       * The third trick is to take advantage of the must-be-zero (MBZ) area
       * in the very beginning of the URB.
       *
       * One masks the data to be written only for the first channel and uses
       * offset zero explicitly to land the data in the MBZ area, avoiding
       * trashing any other part of the URB.
       *
       * Since the WA says that the write needs to be 4 phases long, one uses
       * 4 slots of data. All are explicitly zeros in order to keep the MBZ
       * area written as zeros.
       */
      bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
      bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));

      fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
                                          reg_undef, payload);
      inst->eot = true;
      inst->mlen = 6;
      inst->offset = 0;
   }
}

void
fs_visitor::emit_cs_terminate()
{
   assert(devinfo->ver >= 7);

   /* We can't directly send from g0, since sends with EOT have to use
    * g112-127. So, copy it to a virtual register; the register allocator will
    * make sure it uses the appropriate register range.
    */
   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
   bld.group(8, 0).exec_all().MOV(payload, g0);

   /* Send a message to the thread spawner to terminate the thread. */
   fs_inst *inst = bld.exec_all()
                      .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
   inst->eot = true;
}

void
fs_visitor::emit_barrier()
{
   uint32_t barrier_id_mask;
   switch (devinfo->ver) {
   case 7:
   case 8:
      barrier_id_mask = 0x0f000000u; break;
   case 9:
      barrier_id_mask = 0x8f000000u; break;
   case 11:
   case 12:
      barrier_id_mask = 0x7f000000u; break;
   default:
      unreachable("barrier is only available on gen >= 7");
   }

   /* We are getting the barrier ID from the compute shader header */
   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);

   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);

   /* Clear the message payload */
   bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u));

   /* Copy the barrier id from r0.2 to the message payload reg.2 */
   fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
   bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
                                  brw_imm_ud(barrier_id_mask));

   /* Emit a gateway "barrier" message using the payload we set up, followed
    * by a wait instruction.
    */
   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
}

fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                       void *mem_ctx,
                       const brw_base_prog_key *key,
                       struct brw_stage_prog_data *prog_data,
                       const nir_shader *shader,
                       unsigned dispatch_width,
                       int shader_time_index,
                       bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader, prog_data,
                    debug_enabled),
     key(key), gs_compile(NULL), prog_data(prog_data),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     dispatch_width(dispatch_width),
     shader_time_index(shader_time_index),
     bld(fs_builder(this, dispatch_width).at_end())
{
   init();
}

fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                       void *mem_ctx,
                       struct brw_gs_compile *c,
                       struct brw_gs_prog_data *prog_data,
                       const nir_shader *shader,
                       int shader_time_index,
                       bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader,
                    &prog_data->base.base, debug_enabled),
     key(&c->key.base), gs_compile(c),
     prog_data(&prog_data->base.base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this),
     dispatch_width(8),
     shader_time_index(shader_time_index),
     bld(fs_builder(this, dispatch_width).at_end())
{
   init();
}


void
fs_visitor::init()
{
   if (key)
      this->key_tex = &key->tex;
   else
      this->key_tex = NULL;

   this->max_dispatch_width = 32;
   this->prog_data = this->stage_prog_data;

   this->failed = false;
   this->fail_msg = NULL;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
   this->nir_system_values = NULL;

   memset(&this->payload, 0, sizeof(this->payload));
   this->source_depth_to_render_target = false;
   this->runtime_check_aads_emit = false;
   this->first_non_payload_grf = 0;
   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
   this->last_scratch = 0;
   this->pull_constant_loc = NULL;
   this->push_constant_loc = NULL;

   this->shader_stats.scheduler_mode = NULL;
   this->shader_stats.promoted_constants = 0;

   this->grf_used = 0;
   this->spilled_any_registers = false;
}

fs_visitor::~fs_visitor()
{
}