Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/intel/blorp/blorp_genX_exec.h
7227 views
1
/*
2
* Copyright © 2016 Intel Corporation
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining a
5
* copy of this software and associated documentation files (the "Software"),
6
* to deal in the Software without restriction, including without limitation
7
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
* and/or sell copies of the Software, and to permit persons to whom the
9
* Software is furnished to do so, subject to the following conditions:
10
*
11
* The above copyright notice and this permission notice (including the next
12
* paragraph) shall be included in all copies or substantial portions of the
13
* Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
* IN THE SOFTWARE.
22
*/
23
24
#ifndef BLORP_GENX_EXEC_H
25
#define BLORP_GENX_EXEC_H
26
27
#include "blorp_priv.h"
28
#include "dev/intel_device_info.h"
29
#include "common/intel_sample_positions.h"
30
#include "common/intel_l3_config.h"
31
#include "genxml/gen_macros.h"
32
33
/**
34
* This file provides the blorp pipeline setup and execution functionality.
35
* It defines the following function:
36
*
37
* static void
38
* blorp_exec(struct blorp_context *blorp, void *batch_data,
39
* const struct blorp_params *params);
40
*
41
* It is the job of whoever includes this header to wrap this in something
42
* to get an externally visible symbol.
43
*
44
* In order for the blorp_exec function to work, the driver must provide
45
* implementations of the following static helper functions.
46
*/
47
48
static void *
49
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);
50
51
static uint64_t
52
blorp_emit_reloc(struct blorp_batch *batch,
53
void *location, struct blorp_address address, uint32_t delta);
54
55
static void
56
blorp_measure_start(struct blorp_batch *batch,
57
const struct blorp_params *params);
58
59
static void *
60
blorp_alloc_dynamic_state(struct blorp_batch *batch,
61
uint32_t size,
62
uint32_t alignment,
63
uint32_t *offset);
64
static void *
65
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
66
struct blorp_address *addr);
67
static void
68
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
69
const struct blorp_address *addrs,
70
uint32_t *sizes,
71
unsigned num_vbs);
72
73
UNUSED static struct blorp_address
74
blorp_get_workaround_address(struct blorp_batch *batch);
75
76
static void
77
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
78
unsigned state_size, unsigned state_alignment,
79
uint32_t *bt_offset, uint32_t *surface_offsets,
80
void **surface_maps);
81
82
static void
83
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);
84
85
static void
86
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
87
struct blorp_address address, uint32_t delta);
88
89
static uint64_t
90
blorp_get_surface_address(struct blorp_batch *batch,
91
struct blorp_address address);
92
93
#if GFX_VER >= 7 && GFX_VER < 10
94
static struct blorp_address
95
blorp_get_surface_base_address(struct blorp_batch *batch);
96
#endif
97
98
#if GFX_VER >= 7
99
static const struct intel_l3_config *
100
blorp_get_l3_config(struct blorp_batch *batch);
101
# else
102
static void
103
blorp_emit_urb_config(struct blorp_batch *batch,
104
unsigned vs_entry_size, unsigned sf_entry_size);
105
#endif
106
107
static void
108
blorp_emit_pipeline(struct blorp_batch *batch,
109
const struct blorp_params *params);
110
111
/***** BEGIN blorp_exec implementation ******/
112
113
static uint64_t
114
_blorp_combine_address(struct blorp_batch *batch, void *location,
115
struct blorp_address address, uint32_t delta)
116
{
117
if (address.buffer == NULL) {
118
return address.offset + delta;
119
} else {
120
return blorp_emit_reloc(batch, location, address, delta);
121
}
122
}
123
124
/* Hook the genxml pack helpers up to blorp's address/batch abstractions. */
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address

#include "genxml/genX_pack.h"

/* Token-pasting helpers that map a genxml command name to its generated
 * length / header / pack symbols.
 */
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command.  Expands to a for-loop that runs its body
 * exactly once: the body fills in `name`, and the loop's increment expression
 * packs the struct into the batch.  If blorp_emit_dwords() failed (_dst is
 * NULL) the body is skipped entirely.
 */
#define blorp_emit(batch, cmd, name) \
   for (struct cmd name = { _blorp_cmd_header(cmd) }, \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1); \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name), \
        _dst = NULL)

/* Emit a variable-length command of `n` dwords.  Statement-expression that
 * yields a pointer to the command's payload (dword 1) for the caller to fill
 * in, or NULL if allocation failed.
 */
#define blorp_emitn(batch, cmd, n, ...) ({ \
      uint32_t *_dw = blorp_emit_dwords(batch, n); \
      if (_dw) { \
         struct cmd template = { \
            _blorp_cmd_header(cmd), \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__ \
         }; \
         _blorp_cmd_pack(cmd)(batch, _dw, &template); \
      } \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */ \
   })

/* Zero-initialized struct value usable as an initializer expression. */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Like blorp_emit(), but the packed state goes into dynamic state memory
 * (via blorp_alloc_dynamic_state) instead of the batch, returning its offset
 * through `offset`.  The written range is flushed so the GPU sees it.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset) \
   for (struct state name = STRUCT_ZERO(state), \
        *_dst = blorp_alloc_dynamic_state(batch, \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset); \
        __builtin_expect(_dst != NULL, 1); \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name), \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4), \
        _dst = NULL)
166
167
/* 3DSTATE_URB
168
* 3DSTATE_URB_VS
169
* 3DSTATE_URB_HS
170
* 3DSTATE_URB_DS
171
* 3DSTATE_URB_GS
172
*
173
 * Assign the entire URB to the VS. Even though the VS is disabled, URB space
174
* is still needed because the clipper loads the VUE's from the URB. From
175
* the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
176
* Dword 1.15:0 "VS Number of URB Entries":
177
* This field is always used (even if VS Function Enable is DISABLED).
178
*
179
* The warning below appears in the PRM (Section 3DSTATE_URB), but we can
180
* safely ignore it because this batch contains only one draw call.
181
* Because of URB corruption caused by allocating a previous GS unit
182
* URB entry to the VS unit, software is required to send a “GS NULL
183
* Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
184
* plus a dummy DRAW call before any case where VS will be taking over
185
* GS URB space.
186
*
187
 * If the 3DSTATE_URB_VS is emitted, then the others must be emitted as well.
188
* From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
189
*
190
* 3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
191
* programmed in order for the programming of this state to be
192
* valid.
193
*/
194
/* Program the URB allocation for the blorp pipeline.  All URB space is given
 * to the VS stage (see the comment block above).  On gfx7+ the allocation is
 * computed from the L3 configuration and emitted as the four 3DSTATE_URB_*
 * commands; on older gens the driver-provided blorp_emit_urb_config() hook
 * does the work.
 */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once vertex fetcher has written full VUE entries with complete
    * header the space requirement is as follows per vertex (in bytes):
    *
    *     Header    Position    Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |       n x 16      |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for number of varying inputs expressed as vec4s.
    */
   const unsigned num_varyings =
      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);

   /* Only consumed on the pre-gfx7 path; asserted zero on gfx7+. */
   ASSERTED const unsigned sf_entry_size =
      params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;

#if GFX_VER >= 7
   assert(sf_entry_size == 0);
   /* VS gets the computed size; HS/DS/GS get the minimum of 1. */
   const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 };

   unsigned entries[4], start[4];
   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, entry_size,
                        entries, start, deref_block_size, &constrained);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
    *    needs to be sent before any combination of VS associated 3DSTATE."
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.DepthStallEnable = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }
#endif

   /* 3DSTATE_URB_VS/HS/DS/GS share a layout and have consecutive subopcodes,
    * so emit all four by bumping the subopcode of the VS template.
    */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = start[i];
         urb.VSURBEntryAllocationSize = entry_size[i] - 1;
         urb.VSNumberofURBEntries = entries[i];
      }
   }
#else /* GFX_VER < 7 */
   blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
#endif
}
258
259
#if GFX_VER >= 7
260
static void
261
blorp_emit_memcpy(struct blorp_batch *batch,
262
struct blorp_address dst,
263
struct blorp_address src,
264
uint32_t size);
265
#endif
266
267
static void
268
blorp_emit_vertex_data(struct blorp_batch *batch,
269
const struct blorp_params *params,
270
struct blorp_address *addr,
271
uint32_t *size)
272
{
273
const float vertices[] = {
274
/* v0 */ (float)params->x1, (float)params->y1, params->z,
275
/* v1 */ (float)params->x0, (float)params->y1, params->z,
276
/* v2 */ (float)params->x0, (float)params->y0, params->z,
277
};
278
279
void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
280
memcpy(data, vertices, sizeof(vertices));
281
*size = sizeof(vertices);
282
blorp_flush_range(batch, data, *size);
283
}
284
285
/* Upload the instance-rate vertex buffer: a 16-byte VUE-header chunk
 * (params->vs_inputs) followed by one vec4 per varying input actually used
 * by the fragment program.  On clear-color-as-input operations, the clear
 * color slot is then overwritten on the GPU from the indirect clear-color
 * address.
 */
static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   /* Upper bound on slots: wm_inputs expressed as vec4s. */
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   const unsigned num_varyings =
      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;

   /* 16 bytes of VS inputs plus one vec4 per used varying. */
   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used by
       * the program and when necessary copy the values from the input storage
       * to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         /* urb_setup < 0 means the FS doesn't read this slot; skip it so the
          * buffer stays densely packed with only the used varyings.
          */
         const int input_index = params->wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
#if GFX_VER >= 7
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE. We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in the
       * loop above. Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      /* Pre-gfx10 the indirect clear color is a full surface-state clear
       * value; gfx10+ stores just the four 32-bit channels.
       */
      const unsigned clear_color_size =
         GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
#else
      unreachable("MCS partial resolve is not a thing on SNB and earlier");
#endif
   }
}
353
354
/* Fill slot `idx` of a VERTEX_BUFFER_STATE array.  The buffer-bounds fields
 * differ per generation: gfx8+ has an explicit size, gfx5-7 use an inclusive
 * end address, and gfx4 uses a max index derived from the stride.
 */
static void
blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
                               unsigned idx,
                               struct blorp_address addr, uint32_t size,
                               uint32_t stride)
{
   vb[idx].VertexBufferIndex = idx;
   vb[idx].BufferStartingAddress = addr;
   vb[idx].BufferPitch = stride;

#if GFX_VER >= 6
   vb[idx].MOCS = addr.mocs;
#endif

#if GFX_VER >= 7
   vb[idx].AddressModifyEnable = true;
#endif

#if GFX_VER >= 8
   vb[idx].BufferSize = size;
#elif GFX_VER >= 5
   /* Stride 0 marks an instance-rate buffer. */
   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
   /* EndAddress is inclusive, hence the -1. */
   vb[idx].EndAddress = vb[idx].BufferStartingAddress;
   vb[idx].EndAddress.offset += size - 1;
#elif GFX_VER == 4
   vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
   vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
#endif

#if GFX_VER >= 12
   vb[idx].L3BypassDisable = true;
#endif
}
387
388
static void
389
blorp_emit_vertex_buffers(struct blorp_batch *batch,
390
const struct blorp_params *params)
391
{
392
struct GENX(VERTEX_BUFFER_STATE) vb[3];
393
uint32_t num_vbs = 2;
394
memset(vb, 0, sizeof(vb));
395
396
struct blorp_address addrs[2] = {};
397
uint32_t sizes[2];
398
blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
399
blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
400
3 * sizeof(float));
401
402
blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
403
blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);
404
405
blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
406
407
const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
408
uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
409
if (!dw)
410
return;
411
412
for (unsigned i = 0; i < num_vbs; i++) {
413
GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
414
dw += GENX(VERTEX_BUFFER_STATE_length);
415
}
416
}
417
418
/* Emit 3DSTATE_VERTEX_ELEMENTS (plus VF_STATISTICS and, on gfx8+, the
 * VF_SGVS / VF_INSTANCING / VF_TOPOLOGY packets) describing how the VUE is
 * assembled from the two vertex buffers.  Slot order: VUE header, an NDC
 * position copy on gfx4-5 only, the position, then one vec4 per varying.
 */
static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   const unsigned num_varyings =
      params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
   /* gfx4-5 need an extra NDC position element (see below). */
   bool need_ndc = batch->blorp->compiler->devinfo->ver <= 5;
   const unsigned num_elements = 2 + need_ndc + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Setup VBO for the rectangle primitive..
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. Below vertex fetcher gets programmed
    *        to assign this with primitive instance identifier which will be
    *        used for layered clears. All other renders have only one instance
    *        and therefore the value will be effectively zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable, Z is fixed to
    * zero and W to one. Header words dw0,2,3 are zero. There is no need to
    * include the fixed values in the vertex buffer. Vertex fetcher can be
    * instructed to fill vertex elements with constant values of one and zero
    * instead of reading them from the buffer.
    * Flat inputs are program constants that are not interpolated. Moreover
    * their values will be the same between vertices.
    *
    * See the vertex element setup below.
    */
   unsigned slot = 0;

   /* Slot 0: VUE header.  Components 0/2/3 are forced to zero; component 1
    * (RT array index) is the instance ID on gfx5-7, handled by VF_SGVS on
    * gfx8+ instead.
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards hardware is no more instructed to overwrite
       * components using an element specifier. Instead one has separate
       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
       */
#if GFX_VER >= 8
      .Component1Control = VFCOMP_STORE_0,
#elif GFX_VER >= 5
      .Component1Control = VFCOMP_STORE_IID,
#else
      .Component1Control = VFCOMP_STORE_0,
#endif
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
#if GFX_VER <= 5
      .DestinationElementOffset = slot * 4,
#endif
   };
   slot++;

#if GFX_VER <= 5
   /* On Iron Lake and earlier, a native device coordinates version of the
    * position goes right after the normal VUE header and before position.
    * Since w == 1 for all of our coordinates, this is just a copy of the
    * position.
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
      .DestinationElementOffset = slot * 4,
   };
   slot++;
#endif

   /* Position: XYZ from buffer 0, W forced to 1.0. */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
#if GFX_VER <= 5
      .DestinationElementOffset = slot * 4,
#endif
   };
   slot++;

   /* One vec4 per flat varying, read from buffer 1 past the 16-byte header. */
   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
#if GFX_VER <= 5
         .DestinationElementOffset = slot * 4,
#endif
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

#if GFX_VER >= 8
   /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
    * primitive instance identifier. This is used for layered clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
#endif
}
593
594
/* 3DSTATE_VIEWPORT_STATE_POINTERS */
595
/* Emit a trivial CC_VIEWPORT (full [0, 1] depth range) into dynamic state
 * and point the pipeline at it.  Returns the dynamic-state offset of the
 * viewport.
 */
static uint32_t
blorp_emit_cc_viewport(struct blorp_batch *batch)
{
   uint32_t cc_vp_offset;
   blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
      vp.MinimumDepth = 0.0;
      vp.MaximumDepth = 1.0;
   }

   /* gfx7+ has a dedicated CC pointer packet; gfx6 uses the combined
    * viewport-state-pointers packet with a change flag.
    */
#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
      vsp.CCViewportPointer = cc_vp_offset;
   }
#elif GFX_VER == 6
   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
      vsp.CCViewportStateChange = true;
      vsp.PointertoCC_VIEWPORT = cc_vp_offset;
   }
#endif

   return cc_vp_offset;
}
617
618
/* Emit blorp's single SAMPLER_STATE: bilinear filtering, no mipmapping,
 * clamped addressing, and bind it to the PS sampler-state pointer.
 * Returns the dynamic-state offset of the sampler.
 */
static uint32_t
blorp_emit_sampler_state(struct blorp_batch *batch)
{
   uint32_t offset;
   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
#if GFX_VER > 6
      sampler.NonnormalizedCoordinateEnable = true;
#endif
   }

   /* Point the PS at the sampler; gfx6 uses the combined packet with
    * per-stage change flags.
    */
#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
      ssp.PointertoPSSamplerState = offset;
   }
#elif GFX_VER == 6
   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
      ssp.VSSamplerStateChange = true;
      ssp.GSSamplerStateChange = true;
      ssp.PSSamplerStateChange = true;
      ssp.PointertoPSSamplerState = offset;
   }
#endif

   return offset;
}
658
659
/* What follows is the code for setting up a "pipeline" on Sandy Bridge and
660
* later hardware. This file will be included by i965 for gfx4-5 as well, so
661
* this code is guarded by GFX_VER >= 6.
662
*/
663
#if GFX_VER >= 6
664
665
/* Emit 3DSTATE_VS.  Most blorp operations run with the VS disabled (the
 * packet is emitted with all-default fields); when params carries VS program
 * data the kernel, GRF start, and URB read parameters are programmed.
 */
static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
   /* gfx11+ only supports SIMD8 dispatch for the VS. */
   assert(!vs_prog_data || GFX_VER < 11 ||
          vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         /* Field is "threads - 1". */
         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

#if GFX_VER >= 8
         vs.SIMD8DispatchEnable =
            vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif
      }
   }
}
695
696
/* Emit the SF (strips-and-fans / setup) stage state, plus the attribute
 * setup (SBE on gfx7+, folded into 3DSTATE_SF on gfx6 and earlier) and, on
 * gfx8+, 3DSTATE_RASTER.  Viewport transform stays disabled and fill mode
 * solid, as RECTLIST rendering requires.
 */
static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *    RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
    *    use of screen-space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *    SOLID: Any triangle or rectangle object found to be front-facing
    *    is rendered as a solid object. This setting is required when
    *    rendering rectangle (RECTLIST) objects.
    */

#if GFX_VER >= 8

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   /* Read offset 1 skips the 8-dword VUE header so attribute 0 lands on the
    * first flat input.
    */
   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

#if GFX_VER >= 9
      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif
   }

#elif GFX_VER >= 7

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
      sf.FrontFaceFillMode = FILL_MODE_SOLID;
      sf.BackFaceFillMode = FILL_MODE_SOLID;

      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
         MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;

#if GFX_VER == 7
      sf.DepthBufferSurfaceFormat = params->depth_format;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
   }

#else /* GFX_VER <= 6 */

   /* On gfx6 and earlier the attribute setup lives in 3DSTATE_SF itself. */
   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
      sf.FrontFaceFillMode = FILL_MODE_SOLID;
      sf.BackFaceFillMode = FILL_MODE_SOLID;

      sf.MultisampleRasterizationMode = params->num_samples > 1 ?
         MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;

      sf.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sf.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sf.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sf.NumberofSFOutputAttributes = 0;
         sf.VertexURBEntryReadLength = 1;
      }
   }

#endif /* GFX_VER */
}
801
802
static void
803
blorp_emit_ps_config(struct blorp_batch *batch,
804
const struct blorp_params *params)
805
{
806
const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
807
808
/* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
809
* nonzero to prevent the GPU from hanging. While the documentation doesn't
810
* mention this explicitly, it notes that the valid range for the field is
811
* [1,39] = [2,40] threads, which excludes zero.
812
*
813
* To be safe (and to minimize extraneous code) we go ahead and fully
814
* configure the WM state whether or not there is a WM program.
815
*/
816
817
#if GFX_VER >= 8
818
819
blorp_emit(batch, GENX(3DSTATE_WM), wm);
820
821
blorp_emit(batch, GENX(3DSTATE_PS), ps) {
822
if (params->src.enabled) {
823
ps.SamplerCount = 1; /* Up to 4 samplers */
824
ps.BindingTableEntryCount = 2;
825
} else {
826
ps.BindingTableEntryCount = 1;
827
}
828
829
/* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
830
if (GFX_VER == 11)
831
ps.SamplerCount = 0;
832
833
if (prog_data) {
834
ps._8PixelDispatchEnable = prog_data->dispatch_8;
835
ps._16PixelDispatchEnable = prog_data->dispatch_16;
836
ps._32PixelDispatchEnable = prog_data->dispatch_32;
837
838
/* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
839
*
840
* "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
841
* Dispatch must not be enabled for PER_PIXEL dispatch mode."
842
*
843
* Since 16x MSAA is first introduced on SKL, we don't need to apply
844
* the workaround on any older hardware.
845
*/
846
if (GFX_VER >= 9 && !prog_data->persample_dispatch &&
847
params->num_samples == 16) {
848
assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
849
ps._32PixelDispatchEnable = false;
850
}
851
852
ps.DispatchGRFStartRegisterForConstantSetupData0 =
853
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
854
ps.DispatchGRFStartRegisterForConstantSetupData1 =
855
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
856
ps.DispatchGRFStartRegisterForConstantSetupData2 =
857
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
858
859
ps.KernelStartPointer0 = params->wm_prog_kernel +
860
brw_wm_prog_data_prog_offset(prog_data, ps, 0);
861
ps.KernelStartPointer1 = params->wm_prog_kernel +
862
brw_wm_prog_data_prog_offset(prog_data, ps, 1);
863
ps.KernelStartPointer2 = params->wm_prog_kernel +
864
brw_wm_prog_data_prog_offset(prog_data, ps, 2);
865
}
866
867
/* 3DSTATE_PS expects the number of threads per PSD, which is always 64
868
* for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
869
* k, it implies 2(k+1) threads. It implicitly scales for different GT
870
* levels (which have some # of PSDs).
871
*
872
* In Gfx8 the format is U8-2 whereas in Gfx9+ it is U9-1.
873
*/
874
if (GFX_VER >= 9)
875
ps.MaximumNumberofThreadsPerPSD = 64 - 1;
876
else
877
ps.MaximumNumberofThreadsPerPSD = 64 - 2;
878
879
switch (params->fast_clear_op) {
880
case ISL_AUX_OP_NONE:
881
break;
882
#if GFX_VER >= 10
883
case ISL_AUX_OP_AMBIGUATE:
884
ps.RenderTargetFastClearEnable = true;
885
ps.RenderTargetResolveType = FAST_CLEAR_0;
886
break;
887
#endif
888
#if GFX_VER >= 9
889
case ISL_AUX_OP_PARTIAL_RESOLVE:
890
ps.RenderTargetResolveType = RESOLVE_PARTIAL;
891
break;
892
case ISL_AUX_OP_FULL_RESOLVE:
893
ps.RenderTargetResolveType = RESOLVE_FULL;
894
break;
895
#else
896
case ISL_AUX_OP_FULL_RESOLVE:
897
ps.RenderTargetResolveEnable = true;
898
break;
899
#endif
900
case ISL_AUX_OP_FAST_CLEAR:
901
ps.RenderTargetFastClearEnable = true;
902
break;
903
default:
904
unreachable("Invalid fast clear op");
905
}
906
}
907
908
blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
909
if (prog_data) {
910
psx.PixelShaderValid = true;
911
psx.AttributeEnable = prog_data->num_varying_inputs > 0;
912
psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
913
psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
914
#if GFX_VER >= 9
915
psx.PixelShaderComputesStencil = prog_data->computed_stencil;
916
#endif
917
}
918
919
if (params->src.enabled)
920
psx.PixelShaderKillsPixel = true;
921
}
922
923
#elif GFX_VER >= 7
924
925
blorp_emit(batch, GENX(3DSTATE_WM), wm) {
926
switch (params->hiz_op) {
927
case ISL_AUX_OP_FAST_CLEAR:
928
wm.DepthBufferClear = true;
929
break;
930
case ISL_AUX_OP_FULL_RESOLVE:
931
wm.DepthBufferResolveEnable = true;
932
break;
933
case ISL_AUX_OP_AMBIGUATE:
934
wm.HierarchicalDepthBufferResolveEnable = true;
935
break;
936
case ISL_AUX_OP_NONE:
937
break;
938
default:
939
unreachable("not reached");
940
}
941
942
if (prog_data) {
943
wm.ThreadDispatchEnable = true;
944
wm.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
945
}
946
947
if (params->src.enabled)
948
wm.PixelShaderKillsPixel = true;
949
950
if (params->num_samples > 1) {
951
wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
952
wm.MultisampleDispatchMode =
953
(prog_data && prog_data->persample_dispatch) ?
954
MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
955
} else {
956
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
957
wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
958
}
959
}
960
961
blorp_emit(batch, GENX(3DSTATE_PS), ps) {
962
ps.MaximumNumberofThreads =
963
batch->blorp->isl_dev->info->max_wm_threads - 1;
964
965
#if GFX_VERx10 == 75
966
ps.SampleMask = 1;
967
#endif
968
969
if (prog_data) {
970
ps._8PixelDispatchEnable = prog_data->dispatch_8;
971
ps._16PixelDispatchEnable = prog_data->dispatch_16;
972
ps._32PixelDispatchEnable = prog_data->dispatch_32;
973
974
ps.DispatchGRFStartRegisterForConstantSetupData0 =
975
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
976
ps.DispatchGRFStartRegisterForConstantSetupData1 =
977
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
978
ps.DispatchGRFStartRegisterForConstantSetupData2 =
979
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
980
981
ps.KernelStartPointer0 = params->wm_prog_kernel +
982
brw_wm_prog_data_prog_offset(prog_data, ps, 0);
983
ps.KernelStartPointer1 = params->wm_prog_kernel +
984
brw_wm_prog_data_prog_offset(prog_data, ps, 1);
985
ps.KernelStartPointer2 = params->wm_prog_kernel +
986
brw_wm_prog_data_prog_offset(prog_data, ps, 2);
987
988
ps.AttributeEnable = prog_data->num_varying_inputs > 0;
989
} else {
990
/* Gfx7 hardware gets angry if we don't enable at least one dispatch
991
* mode, so just enable 16-pixel dispatch if we don't have a program.
992
*/
993
ps._16PixelDispatchEnable = true;
994
}
995
996
if (params->src.enabled)
997
ps.SamplerCount = 1; /* Up to 4 samplers */
998
999
switch (params->fast_clear_op) {
1000
case ISL_AUX_OP_NONE:
1001
break;
1002
case ISL_AUX_OP_FULL_RESOLVE:
1003
ps.RenderTargetResolveEnable = true;
1004
break;
1005
case ISL_AUX_OP_FAST_CLEAR:
1006
ps.RenderTargetFastClearEnable = true;
1007
break;
1008
default:
1009
unreachable("Invalid fast clear op");
1010
}
1011
}
1012
1013
#else /* GFX_VER <= 6 */
1014
1015
blorp_emit(batch, GENX(3DSTATE_WM), wm) {
1016
wm.MaximumNumberofThreads =
1017
batch->blorp->isl_dev->info->max_wm_threads - 1;
1018
1019
switch (params->hiz_op) {
1020
case ISL_AUX_OP_FAST_CLEAR:
1021
wm.DepthBufferClear = true;
1022
break;
1023
case ISL_AUX_OP_FULL_RESOLVE:
1024
wm.DepthBufferResolveEnable = true;
1025
break;
1026
case ISL_AUX_OP_AMBIGUATE:
1027
wm.HierarchicalDepthBufferResolveEnable = true;
1028
break;
1029
case ISL_AUX_OP_NONE:
1030
break;
1031
default:
1032
unreachable("not reached");
1033
}
1034
1035
if (prog_data) {
1036
wm.ThreadDispatchEnable = true;
1037
1038
wm._8PixelDispatchEnable = prog_data->dispatch_8;
1039
wm._16PixelDispatchEnable = prog_data->dispatch_16;
1040
wm._32PixelDispatchEnable = prog_data->dispatch_32;
1041
1042
wm.DispatchGRFStartRegisterForConstantSetupData0 =
1043
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
1044
wm.DispatchGRFStartRegisterForConstantSetupData1 =
1045
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
1046
wm.DispatchGRFStartRegisterForConstantSetupData2 =
1047
brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
1048
1049
wm.KernelStartPointer0 = params->wm_prog_kernel +
1050
brw_wm_prog_data_prog_offset(prog_data, wm, 0);
1051
wm.KernelStartPointer1 = params->wm_prog_kernel +
1052
brw_wm_prog_data_prog_offset(prog_data, wm, 1);
1053
wm.KernelStartPointer2 = params->wm_prog_kernel +
1054
brw_wm_prog_data_prog_offset(prog_data, wm, 2);
1055
1056
wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
1057
}
1058
1059
if (params->src.enabled) {
1060
wm.SamplerCount = 1; /* Up to 4 samplers */
1061
wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
1062
}
1063
1064
if (params->num_samples > 1) {
1065
wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1066
wm.MultisampleDispatchMode =
1067
(prog_data && prog_data->persample_dispatch) ?
1068
MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
1069
} else {
1070
wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1071
wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1072
}
1073
}
1074
1075
#endif /* GFX_VER */
1076
}
1077
1078
static uint32_t
blorp_emit_blend_state(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* Emit a BLEND_STATE table with one entry per draw buffer.  Blending is
    * left disabled (zero-initialized header/entries); only color clamping
    * and the per-channel write masks from the params are programmed.
    * Returns the dynamic-state offset of the table.
    */
   struct GENX(BLEND_STATE) blend = { };

   uint32_t offset;
   /* Table layout: one BLEND_STATE header followed by num_draw_buffers
    * BLEND_STATE_ENTRY structs, all in dwords (hence the * 4).
    */
   int size = GENX(BLEND_STATE_length) * 4;
   size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
   uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
   uint32_t *pos = state;

   /* Pack the (all-defaults) header first, then append the entries. */
   GENX(BLEND_STATE_pack)(NULL, pos, &blend);
   pos += GENX(BLEND_STATE_length);

   for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
      struct GENX(BLEND_STATE_ENTRY) entry = {
         .PreBlendColorClampEnable = true,
         .PostBlendColorClampEnable = true,
         .ColorClampRange = COLORCLAMP_RTFORMAT,

         /* Honor the caller's per-channel write-disable mask. */
         .WriteDisableRed = params->color_write_disable[0],
         .WriteDisableGreen = params->color_write_disable[1],
         .WriteDisableBlue = params->color_write_disable[2],
         .WriteDisableAlpha = params->color_write_disable[3],
      };
      GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
      pos += GENX(BLEND_STATE_ENTRY_length);
   }

   blorp_flush_range(batch, state, size);

#if GFX_VER >= 7
   /* On gfx7+ the blend state has its own pointer packet; gfx6 instead
    * points at it from 3DSTATE_CC_STATE_POINTERS in blorp_emit_pipeline().
    */
   blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
      sp.BlendStatePointer = offset;
#if GFX_VER >= 8
      sp.BlendStatePointerValid = true;
#endif
   }
#endif

#if GFX_VER >= 8
   blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }
#endif

   return offset;
}
1127
1128
static uint32_t
1129
blorp_emit_color_calc_state(struct blorp_batch *batch,
1130
UNUSED const struct blorp_params *params)
1131
{
1132
uint32_t offset;
1133
blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
1134
#if GFX_VER <= 8
1135
cc.StencilReferenceValue = params->stencil_ref;
1136
#endif
1137
}
1138
1139
#if GFX_VER >= 7
1140
blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
1141
sp.ColorCalcStatePointer = offset;
1142
#if GFX_VER >= 8
1143
sp.ColorCalcStatePointerValid = true;
1144
#endif
1145
}
1146
#endif
1147
1148
return offset;
1149
}
1150
1151
static uint32_t
blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   /* Emit depth/stencil state for the current BLORP operation.  On gfx8+
    * this is an inline 3DSTATE_WM_DEPTH_STENCIL packet; on earlier gens it
    * is a DEPTH_STENCIL_STATE dynamic-state object (pointed to by a
    * separate packet on gfx7, or by CC_STATE_POINTERS on gfx6).
    * Returns the dynamic-state offset (0 on gfx8+ where none is used).
    */
#if GFX_VER >= 8
   struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
      GENX(3DSTATE_WM_DEPTH_STENCIL_header),
   };
#else
   struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
#endif

   if (params->depth.enabled) {
      ds.DepthBufferWriteEnable = true;

      switch (params->hiz_op) {
      /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
       *   - 7.5.3.1 Depth Buffer Clear
       *   - 7.5.3.2 Depth Buffer Resolve
       *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
       */
      case ISL_AUX_OP_FULL_RESOLVE:
         /* A resolve requires depth testing enabled with a test that always
          * fails, per the PRM sections cited above.
          */
         ds.DepthTestEnable = true;
         ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
         break;

      case ISL_AUX_OP_NONE:
      case ISL_AUX_OP_FAST_CLEAR:
      case ISL_AUX_OP_AMBIGUATE:
         ds.DepthTestEnable = false;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
         unreachable("Invalid HIZ op");
      }
   }

   if (params->stencil.enabled) {
      /* Unconditionally replace stencil with the reference value; the
       * actual reference lives here on gfx9+ and in COLOR_CALC_STATE on
       * older gens.
       */
      ds.StencilBufferWriteEnable = true;
      ds.StencilTestEnable = true;
      ds.DoubleSidedStencilEnable = false;

      ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
      ds.StencilPassDepthPassOp = STENCILOP_REPLACE;

      ds.StencilWriteMask = params->stencil_mask;
#if GFX_VER >= 9
      ds.StencilReferenceValue = params->stencil_ref;
#endif
   }

#if GFX_VER >= 8
   /* gfx8+: pack the state directly into the command stream. */
   uint32_t offset = 0;
   uint32_t *dw = blorp_emit_dwords(batch,
                                    GENX(3DSTATE_WM_DEPTH_STENCIL_length));
   if (!dw)
      return 0;

   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
#else
   /* gfx7 and earlier: pack into dynamic state and flush it. */
   uint32_t offset;
   void *state = blorp_alloc_dynamic_state(batch,
                                           GENX(DEPTH_STENCIL_STATE_length) * 4,
                                           64, &offset);
   GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
   blorp_flush_range(batch, state, GENX(DEPTH_STENCIL_STATE_length) * 4);
#endif

#if GFX_VER == 7
   blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
      sp.PointertoDEPTH_STENCIL_STATE = offset;
   }
#endif

   return offset;
}
1226
1227
static void
blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   /* Emit 3DSTATE_MULTISAMPLE for params->num_samples.  On gfx8+ the
    * sample positions live in a separate packet, so only the sample count
    * is programmed here; gfx6/7 carry the standard position tables inline.
    */
   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      /* Field encodes log2(num_samples); num_samples is a power of two. */
      ms.NumberofMultisamples = __builtin_ffs(params->num_samples) - 1;

#if GFX_VER >= 8
      /* The PRM says that this bit is valid only for DX9:
       *
       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
       *    should not have any effect by setting or not setting this bit.
       */
      ms.PixelPositionOffsetEnable = false;
#elif GFX_VER >= 7

      switch (params->num_samples) {
      case 1:
         INTEL_SAMPLE_POS_1X(ms.Sample);
         break;
      case 2:
         INTEL_SAMPLE_POS_2X(ms.Sample);
         break;
      case 4:
         INTEL_SAMPLE_POS_4X(ms.Sample);
         break;
      case 8:
         INTEL_SAMPLE_POS_8X(ms.Sample);
         break;
      default:
         break;
      }
#else
      /* gfx6 only supports up to 4x MSAA; always use the 4x table. */
      INTEL_SAMPLE_POS_4X(ms.Sample);
#endif
      ms.PixelLocation = CENTER;
   }
}
1265
1266
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   /* Emit the full 3D pipeline state needed for a BLORP draw: URB config,
    * blend/CC/depth-stencil state, push constants (all empty), samplers,
    * multisample state, and every shader stage (all disabled except VS
    * passthrough and the optional WM/PS program).  The packet order below
    * is deliberate and must not be rearranged.
    */
   uint32_t blend_state_offset = 0;
   uint32_t color_calc_state_offset;
   uint32_t depth_stencil_state_offset;

   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   /* Blend state is only needed when a fragment program writes color. */
   if (params->wm_prog_data) {
      blend_state_offset = blorp_emit_blend_state(batch, params);
   }
   color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
   depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);

#if GFX_VER == 6
   /* 3DSTATE_CC_STATE_POINTERS
    *
    * The pointer offsets are relative to
    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
    *
    * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
    *
    * The dynamic state emit helpers emit their own STATE_POINTERS packets on
    * gfx7+.  However, on gfx6 and earlier, they're all lumpped together in
    * one CC_STATE_POINTERS packet so we have to emit that here.
    */
   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
      cc.BLEND_STATEChange = true;
      cc.ColorCalcStatePointerValid = true;
      cc.DEPTH_STENCIL_STATEChange = true;
      cc.PointertoBLEND_STATE = blend_state_offset;
      cc.ColorCalcStatePointer = color_calc_state_offset;
      cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
   }
#else
   (void)blend_state_offset;
   (void)color_calc_state_offset;
   (void)depth_stencil_state_offset;
#endif

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      pc.ShaderUpdateEnable = 0x1f;
   }
#else
   /* Emit empty push-constant packets for every stage. */
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
#if GFX_VER >= 7
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
#endif
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
#endif

   if (params->src.enabled)
      blorp_emit_sampler_state(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      /* Enable all samples for the current sample count. */
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a
    *   3DSTATE_VS command that causes the VS Function Enable to
    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *   command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
#if GFX_VER >= 7
   /* Tessellation and streamout stages: emitted empty (disabled). */
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
#endif
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif
}
1366
1367
/******** This is the end of the pipeline setup code ********/
1368
1369
#endif /* GFX_VER >= 6 */
1370
1371
#if GFX_VER >= 7
1372
static void
1373
blorp_emit_memcpy(struct blorp_batch *batch,
1374
struct blorp_address dst,
1375
struct blorp_address src,
1376
uint32_t size)
1377
{
1378
assert(size % 4 == 0);
1379
1380
for (unsigned dw = 0; dw < size; dw += 4) {
1381
#if GFX_VER >= 8
1382
blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
1383
cp.DestinationMemoryAddress = dst;
1384
cp.SourceMemoryAddress = src;
1385
}
1386
#else
1387
/* IVB does not have a general purpose register for command streamer
1388
* commands. Therefore, we use an alternate temporary register.
1389
*/
1390
#define BLORP_TEMP_REG 0x2440 /* GFX7_3DPRIM_BASE_VERTEX */
1391
blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), load) {
1392
load.RegisterAddress = BLORP_TEMP_REG;
1393
load.MemoryAddress = src;
1394
}
1395
blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), store) {
1396
store.RegisterAddress = BLORP_TEMP_REG;
1397
store.MemoryAddress = dst;
1398
}
1399
#undef BLORP_TEMP_REG
1400
#endif
1401
dst.offset += 4;
1402
src.offset += 4;
1403
}
1404
}
1405
#endif
1406
1407
static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct brw_blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         const bool color_write_disables[4],
                         bool is_render_target)
{
   /* Fill a RENDER_SURFACE_STATE for `surface` at `state`/`state_offset`,
    * record the relocations for its main, aux, and clear-color addresses,
    * and flush the written range.  `color_write_disables` is only consulted
    * for render targets on very old gens; `aux_op` is used to skip the
    * clear-color memcpy during fast clears on gfx7-9.
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   /* A 1D surface laid out like a 2D one can be described to the hardware
    * as a height-1 2D surface.
    */
   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   isl_channel_mask_t write_disable_mask = 0;
   if (is_render_target && GFX_VER <= 5) {
      if (color_write_disables[0])
         write_disable_mask |= ISL_CHANNEL_RED_BIT;
      if (color_write_disables[1])
         write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
      if (color_write_disables[2])
         write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
      if (color_write_disables[3])
         write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
   }

   /* gfx10+ can point the surface state at an indirect clear color. */
   const bool use_clear_address =
      GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .clear_address = !use_clear_address ? 0 :
                          blorp_get_surface_address(batch,
                                                    surface->clear_color_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .use_clear_address = use_clear_address,
                       .write_disables = write_disable_mask);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information.  This should be ok, however, because
       * surface buffer addresses are always 4K page alinged.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 10
      /* Indirect clear color: just relocate the pointer in the state. */
      assert((surface->clear_color_addr.offset & 0x3f) == 0);
      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
      blorp_surface_reloc(batch, state_offset +
                          isl_dev->ss.clear_color_state_offset,
                          surface->clear_color_addr, *clear_addr);
#elif GFX_VER >= 7
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything.  We can avoid the MI memcpy on that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#else
      unreachable("Fast clears are only supported on gfx7+");
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1505
1506
static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct brw_blorp_surface_info *surface,
                              uint32_t *state)
{
   /* Pack a SURFTYPE_NULL RENDER_SURFACE_STATE sized/viewed like `surface`.
    * Used as the render-target slot for depth/stencil-only operations so
    * the binding table still has a valid (but write-discarding) entry.
    */
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      /* Format is arbitrary for a null surface; pick a common one. */
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
#if GFX_VER >= 6
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
#endif

#if GFX_VER >= 7
      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
#endif

#if GFX_VER >= 8
      .TileMode = YMAJOR,
#else
      .TiledSurface = true,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1539
1540
static void
blorp_emit_surface_states(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   /* Set up the binding table for this BLORP op: slot 0 holds the render
    * target (or a null surface for depth/stencil-only ops) and slot 1 the
    * source texture if any, then point all shader stages' binding-table
    * pointers at it (only the PS slot actually matters).
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t bind_offset = 0, surface_offsets[2];
   void *surface_maps[2];

   UNUSED bool has_indirect_clear_color = false;
   if (params->use_pre_baked_binding_table) {
      /* The driver already baked a binding table; just point at it. */
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      blorp_alloc_binding_table(batch, num_surfaces,
                                isl_dev->ss.size, isl_dev->ss.align,
                                &bind_offset, surface_offsets, surface_maps);

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
         if (params->dst.clear_color_addr.buffer != NULL)
            has_indirect_clear_color = true;
      } else {
         /* No color destination: fill slot 0 with a null surface shaped
          * like whichever depth/stencil surface is in use.
          */
         assert(params->depth.enabled || params->stencil.enabled);
         const struct brw_blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  NULL, false);
         if (params->src.clear_color_addr.buffer != NULL)
            has_indirect_clear_color = true;
      }
   }

#if GFX_VER >= 7
   if (has_indirect_clear_color) {
      /* Updating a surface state object may require that the state cache be
       * invalidated.  From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
         pipe.StateCacheInvalidationEnable = true;
      }
   }
#endif

#if GFX_VER >= 7
   /* gfx7+: per-stage binding table pointer packets; only PS gets ours. */
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable = bind_offset;
   }
#elif GFX_VER >= 6
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
      bt.PSBindingTableChange = true;
      bt.PointertoPSBindingTable = bind_offset;
   }
#else
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
      bt.PointertoPSBindingTable = bind_offset;
   }
#endif
}
1621
1622
static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   /* Emit the depth/stencil/HiZ buffer packets via isl, wiring up the
    * relocations for the depth, HiZ, and stencil buffer addresses.  On
    * gfx6 the per-miplevel offsets for HiZ and stencil are applied in
    * software since the hardware doesn't support mipmapped HiZ/stencil.
    */
   const struct isl_device *isl_dev = batch->blorp->isl_dev;

   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   /* The view/mocs come from whichever of depth or stencil is enabled,
    * preferring depth.
    */
   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;
#if GFX_VER == 6
         /* Sandy bridge hardware does not technically support mipmapped HiZ.
          * However, we have a special layout that allows us to make it work
          * anyway by manually offsetting to the specified miplevel.
          */
         assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
         uint32_t offset_B;
         isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
                                             info.view->base_level, 0, 0,
                                             &offset_B, NULL, NULL);
         hiz_address.offset += offset_B;
#endif

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;
#if GFX_VER == 6
      /* Sandy bridge hardware does not technically support mipmapped stencil.
       * However, we have a special layout that allows us to make it work
       * anyway by manually offsetting to the specified miplevel.
       */
      assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
      uint32_t offset_B;
      isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
                                          info.view->base_level, 0, 0,
                                          &offset_B, NULL, NULL);
      stencil_address.offset += offset_B;
#endif

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);

#if GFX_VER >= 12
   /* Wa_1408224581
    *
    * Workaround: Gfx12LP Astep only An additional pipe control with
    * post-sync = store dword operation would be required.( w/a is to
    * have an additional pipe control after the stencil state whenever
    * the surface state bits of this state is changing).
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }
#endif
}
1714
1715
#if GFX_VER >= 8
1716
/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs.  The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 */
static void
blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer.
    */
   assert(params->depth.enabled || params->stencil.enabled);

   /* The stencil buffer should only be enabled if a fast clear operation is
    * requested.
    */
   if (params->stencil.enabled)
      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    *   3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
    *   the Number of Multisamples. This packet must not be used to change
    *   Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and always
    * emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* From the BDW PRM Volume 7, Depth Buffer Clear:
    *
    *   The clear value must be between the min and max depth values
    *   (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
    *   D32_FLOAT, then +/-DENORM values are also allowed.
    *
    * Set the bounds to match our hardware limits, [0.0, 1.0].
    */
   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
      assert(params->depth.clear_color.f32[0] >= 0.0f);
      assert(params->depth.clear_color.f32[0] <= 1.0f);
      blorp_emit_cc_viewport(batch);
   }

   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
    * disabled for HiZ ops and it appears that force-enabling it can lead to
    * GPU hangs on at least Skylake.  Since we don't know the current state of
    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail.  This is because the op requires that a
    * new config is emitted for each additional layer.
    */
   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
      assert(params->num_layers <= 1);
   } else {
      blorp_emit_depth_stencil_config(batch, params);
   }

   blorp_measure_start(batch, params);

   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hzp.StencilBufferClearEnable = params->stencil.enabled;
         hzp.DepthBufferClearEnable = params->depth.enabled;
         hzp.StencilClearValue = params->stencil_ref;
         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hzp.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hzp.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
      hzp.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hzp.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hzp.ClearRectangleXMin = params->x0;
      hzp.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hzp.ClearRectangleXMax = params->x1;
      hzp.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL w/ all bits clear except for “Post-Sync Operation” must set
    * to “Write Immediate Data” enabled.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }

   /* Emit an empty 3DSTATE_WM_HZ_OP to disable the HiZ op again. */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
}
1826
#endif
1827
1828
/* Write the (possibly converted) clear color into the surface's indirect
 * clear-color buffer.  Only acts on fast-clear operations for surfaces that
 * actually carry a clear_color_addr; otherwise this is a no-op.  The write
 * mechanism is generation-specific — see the preprocessor branches below.
 */
static void
blorp_update_clear_color(UNUSED struct blorp_batch *batch,
                         const struct brw_blorp_surface_info *info,
                         enum isl_aux_op op)
{
   if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
#if GFX_VER == 11
      /* Stall the command streamer before updating the clear color.
       * NOTE(review): presumably so prior work that reads the old clear
       * color completes first — confirm against the Gfx11 PRM.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
         pipe.CommandStreamerStallEnable = true;
      }

      /* 2 QWORDS */
      const unsigned inlinedata_dw = 2 * 2;
      const unsigned num_dwords = GENX(MI_ATOMIC_length) + inlinedata_dw;

      /* First MOVE8B atomic: channels 0 and 1, each stored as a
       * zero-extended qword of inline data.
       */
      struct blorp_address clear_addr = info->clear_color_addr;
      uint32_t *dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
                                 .DataSize = MI_ATOMIC_QWORD,
                                 .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
                                 .InlineData = true,
                                 .MemoryAddress = clear_addr);
      /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
      dw[2] = info->clear_color.u32[0];
      dw[3] = 0;
      dw[4] = info->clear_color.u32[1];
      dw[5] = 0;

      /* Second MOVE8B atomic: channels 2 and 3, 8 bytes past the first
       * write.  This one additionally sets CS STALL and Return Data
       * Control on the atomic itself.
       */
      clear_addr.offset += 8;
      dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
                       .DataSize = MI_ATOMIC_QWORD,
                       .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
                       .CSSTALL = true,
                       .ReturnDataControl = true,
                       .InlineData = true,
                       .MemoryAddress = clear_addr);
      /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
      dw[2] = info->clear_color.u32[2];
      dw[3] = 0;
      dw[4] = info->clear_color.u32[3];
      dw[5] = 0;

      /* Invalidate caches that may still hold the stale clear color. */
      blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
         pipe.StateCacheInvalidationEnable = true;
         pipe.TextureCacheInvalidationEnable = true;
      }
#elif GFX_VER >= 9

      /* According to Wa_2201730850, in the Clear Color Programming Note
       * under the Red channel, "Software shall write the converted Depth
       * Clear to this dword." The only depth formats listed under the red
       * channel are IEEE_FP and UNORM24_X8. These two requirements are
       * incompatible with the UNORM16 depth format, so just ignore that case
       * and simply perform the conversion for all depth formats.
       */
      union isl_color_value fixed_color = info->clear_color;
      if (GFX_VER == 12 && isl_surf_usage_is_depth(info->surf.usage)) {
         isl_color_value_pack(&info->clear_color, info->surf.format,
                              fixed_color.u32);
      }

      /* Store the four clear-color dwords one at a time.  On Gfx12 the
       * final store forces a write-completion check so the whole color is
       * visible before anything consumes it.
       */
      for (int i = 0; i < 4; i++) {
         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = info->clear_color_addr;
            sdi.Address.offset += i * 4;
            sdi.ImmediateData = fixed_color.u32[i];
#if GFX_VER >= 12
            if (i == 3)
               sdi.ForceWriteCompletionCheck = true;
#endif
         }
      }

/* The RENDER_SURFACE_STATE::ClearColor field states that software should
 * write the converted depth value 16B after the clear address:
 *
 *    3D Sampler will always fetch clear depth from the location 16-bytes
 *    above this address, where the clear depth, converted to native
 *    surface format by software, will be stored.
 *
 */
#if GFX_VER >= 12
      if (isl_surf_usage_is_depth(info->surf.usage)) {
         blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = info->clear_color_addr;
            sdi.Address.offset += 4 * 4;
            sdi.ImmediateData = fixed_color.u32[0];
            sdi.ForceWriteCompletionCheck = true;
         }
      }
#endif

#elif GFX_VER >= 7
      /* Gfx7.x: the clear color lives in a single dword combining shader
       * channel selects (bits 25-16) with one-bit 0/1 clear values per
       * channel (bits 31-28); only 0 or 1 clear colors are representable,
       * hence the asserts below.
       */
      blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = info->clear_color_addr;
         sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                             ISL_CHANNEL_SELECT_GREEN << 22 |
                             ISL_CHANNEL_SELECT_BLUE  << 19 |
                             ISL_CHANNEL_SELECT_ALPHA << 16;
         if (isl_format_has_int_channel(info->view.format)) {
            for (unsigned i = 0; i < 4; i++) {
               assert(info->clear_color.u32[i] == 0 ||
                      info->clear_color.u32[i] == 1);
            }
            sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
            sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
            sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
            sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
         } else {
            for (unsigned i = 0; i < 4; i++) {
               assert(info->clear_color.f32[i] == 0.0f ||
                      info->clear_color.f32[i] == 1.0f);
            }
            sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
            sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
            sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
            sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
         }
      }
#endif
   }
}
1949
1950
/**
 * \brief Execute a blit or render pass operation.
 *
 * To execute the operation, this function manually constructs and emits a
 * batch to draw a rectangle primitive.  The batchbuffer is flushed before
 * constructing and after emitting the batch.
 *
 * This function alters no GL state.
 */
1959
static void
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
{
   /* Refresh the indirect clear colors for the destination and depth
    * surfaces unless the driver asked us not to.
    */
   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
   }

#if GFX_VER >= 8
   /* On Gfx8+, HiZ operations are handled by a dedicated emitter instead
    * of the rectangle draw below, so we return early.
    */
   if (params->hiz_op != ISL_AUX_OP_NONE) {
      blorp_emit_gfx8_hiz_op(batch, params);
      return;
   }
#endif

   /* Emit vertex data, the 3D pipeline, surface states, and (optionally)
    * depth/stencil configuration for the rectangle draw.  The emission
    * order is part of the hardware programming sequence — do not reorder.
    */
   blorp_emit_vertex_buffers(batch, params);
   blorp_emit_vertex_elements(batch, params);

   blorp_emit_pipeline(batch, params);

   blorp_emit_surface_states(batch, params);

   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
      blorp_emit_depth_stencil_config(batch, params);

   blorp_measure_start(batch, params);

   /* Draw one 3-vertex RECTLIST instance per destination layer. */
   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
#if GFX_VER >= 7
      /* Lets the driver make the whole operation conditional (e.g. for
       * predicated/conditional rendering).
       */
      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
#endif
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
}
1996
1997
#endif /* BLORP_GENX_EXEC_H */
1998
1999