/* Source: src/intel/vulkan/genX_pipeline.c (Mesa, 21.2-virgl branch) */
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"

/* Pick the VFCOMP_* control for component `comp` (0=R .. 3=A) of `format`:
 * store the source when the format actually has that channel, otherwise pad
 * with 0/1 (or NOSTORE for 64-bit passthru formats) per hardware rules.
 */
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From Broadwell spec, command reference structures, page 586:
    *
    *    "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *     64-bit components are stored in the URB without any conversion. In
    *     this case, vertex elements must be written as 128 or 256 bits, with
    *     VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *     R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *     Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *     set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element,
    *     or Components 1-3 must be specified as VFCOMP_STORE_0 in order to
    *     output a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU
    *     requires Component 3 to be specified as VFCOMP_STORE_0 in order to
    *     output a 256-bit vertex element."
    */
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
       * (see comment above) */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

/* Emit 3DSTATE_VERTEX_ELEMENTS (plus VF_INSTANCING/VF_SGVS on Gfx8+) for the
 * pipeline's vertex-input state, including the internal SVGS (base
 * vertex/instance) and draw-id elements when the VS uses them.
 */
static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineVertexInputStateCreateInfo *info)
{
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   /* Pull inputs_read out of the VS prog data */
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read &
      inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
                                vs_prog_data->uses_instanceid ||
                                vs_prog_data->uses_firstvertex ||
                                vs_prog_data->uses_baseinstance;

   /* A double (64-bit) attribute sets two bits in `elements` but occupies a
    * single vertex-element slot, hence the division by 2.
    */
   uint32_t elem_count = __builtin_popcount(elements) -
      __builtin_popcount(elements_double) / 2;

   const uint32_t total_elems =
      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);

   uint32_t *p;

   const uint32_t num_dwords = 1 + total_elems * 2;
   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
                       GENX(3DSTATE_VERTEX_ELEMENTS));
   if (!p)
      return;

   for (uint32_t i = 0; i < total_elems; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zero.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
   }

   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *desc =
         &info->pVertexAttributeDescriptions[i];
      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
                                                  desc->format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);

      assert(desc->binding < MAX_VBS);

      if ((elements & (1 << desc->location)) == 0)
         continue; /* Binding unused */

      /* Slot index: count used elements below this location, with doubles
       * collapsing two element bits into one slot.
       */
      uint32_t slot =
         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << desc->location) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = desc->binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = desc->offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);

#if GFX_VER >= 8
      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
         vfi.VertexElementIndex = slot;
         vfi.InstanceDataStepRate =
            pipeline->vb[desc->binding].instance_divisor;
      }
#endif
   }

   const uint32_t id_slot = elem_count;
   if (needs_svgs_elem) {
      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
       *    Control field is set to something other than VFCOMP_STORE_SRC,
       *    no higher-numbered Component Control fields may be set to
       *    VFCOMP_STORE_SRC"
       *
       * This means, that if we have BaseInstance, we need BaseVertex as
       * well.  Just do all or nothing.
       */
      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                            vs_prog_data->uses_baseinstance) ?
                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = base_ctrl,
         .Component1Control = base_ctrl,
#if GFX_VER >= 8
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#else
         /* Pre-Gfx8 has no VF_SGVS packet; vertex/instance IDs are stored
          * directly in components 2/3 of this element.
          */
         .Component2Control = VFCOMP_STORE_VID,
         .Component3Control = VFCOMP_STORE_IID,
#endif
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
#endif
   }

#if GFX_VER >= 8
   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber = 2;
      sgvs.VertexIDElementOffset = id_slot;
      sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber = 3;
      sgvs.InstanceIDElementOffset = id_slot;
   }
#endif

   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &p[1 + drawid_slot * 2],
                                      &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
#endif
   }
}

/* Program the URB partitioning (3DSTATE_URB_{VS,HS,DS,GS}) for the given L3
 * configuration and active stages; also reports the deref block size used by
 * 3DSTATE_SF on Gfx12+.
 */
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const unsigned entry_size[4],
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = &device->info;

   unsigned entries[4];
   unsigned start[4];
   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        entry_size, entries, start, deref_block_size,
                        &constrained);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *     needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *     3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *     3DSTATE_SAMPLER_STATE_POINTER_VS command.
 Only one PIPE_CONTROL
    *     needs to be sent before any combination of VS associated 3DSTATE."
    */
   anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
      pc.DepthStallEnable = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = device->workaround_address;
   }
#endif

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         /* 3DSTATE_URB_VS/HS/DS/GS have consecutive sub-opcodes */
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = start[i];
         urb.VSURBEntryAllocationSize = entry_size[i] - 1;
         urb.VSNumberofURBEntries = entries[i];
      }
   }
}

/* Gather per-stage URB entry sizes from the compiled shaders (1 for unused
 * stages) and forward to genX(emit_urb_setup).
 */
static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
   unsigned entry_size[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;

      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
                        pipeline->base.l3_config,
                        pipeline->active_stages, entry_size,
                        deref_block_size);
}

/* Emit 3DSTATE_SBE (and 3DSTATE_SBE_SWIZ on Gfx8+), mapping the last VUE
 * stage's outputs to the fragment shader's inputs.
 */
static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
#endif
      return;
   }

   const struct brw_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   struct GENX(3DSTATE_SBE) sbe = {
      GENX(3DSTATE_SBE_header),
      .AttributeSwizzleEnable = true,
      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

#if GFX_VER >= 9
   for (unsigned i = 0; i < 32; i++)
      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif

#if GFX_VER >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
      GENX(3DSTATE_SBE_SWIZ_header),
   };
#else
#  define swiz sbe
#endif

   int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                        fs_input_map);
   assert(first_slot % 2 == 0);
   unsigned urb_entry_read_offset = first_slot / 2;
   int max_source_attr = 0;
   for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
      uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
      int input_index = wm_prog_data->urb_setup[attr];

      assert(0 <= input_index);

      /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
       * VUE header
       */
      if (attr == VARYING_SLOT_VIEWPORT ||
          attr == VARYING_SLOT_LAYER ||
          attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
         continue;
      }

      if (attr == VARYING_SLOT_PNTC) {
         sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
         continue;
      }

      const int slot = fs_input_map->varying_to_slot[attr];

      if (slot == -1) {
         /* This attribute does not exist in the VUE--that means that the
          * vertex shader did not write to it.  It could be that it's a
          * regular varying read by the fragment shader but not written by
          * the vertex shader or it's gl_PrimitiveID.  In the first case the
          * value is undefined, in the second it needs to be
          * gl_PrimitiveID.
          */
         swiz.Attribute[input_index].ConstantSource = PRIM_ID;
         swiz.Attribute[input_index].ComponentOverrideX = true;
         swiz.Attribute[input_index].ComponentOverrideY = true;
         swiz.Attribute[input_index].ComponentOverrideZ = true;
         swiz.Attribute[input_index].ComponentOverrideW = true;
         continue;
      }

      /* We have to subtract two slots to account for the URB entry output
       * read offset in the VS and GS stages.
       */
      const int source_attr = slot - 2 * urb_entry_read_offset;
      assert(source_attr >= 0 && source_attr < 32);
      max_source_attr = MAX2(max_source_attr, source_attr);
      /* The hardware can only do overrides on 16 overrides at a time, and the
       * other up to 16 have to be lined up so that the input index = the
       * output index.  We'll need to do some tweaking to make sure that's the
       * case.
       */
      if (input_index < 16)
         swiz.Attribute[input_index].SourceAttribute = source_attr;
      else
         assert(source_attr == input_index);
   }

   sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
   sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
#if GFX_VER >= 8
   sbe.ForceVertexURBEntryReadOffset = true;
   sbe.ForceVertexURBEntryReadLength = true;
#endif

   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
                                        GENX(3DSTATE_SBE_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);

#if GFX_VER >= 8
   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
#endif
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
VkPolygonMode
genX(raster_polygon_mode)(struct anv_graphics_pipeline
                          *pipeline,
                          VkPrimitiveTopology primitive_topology)
{
   /* The last geometry-producing stage wins: GS output topology, then TES
    * output topology, then the input-assembly topology.
    */
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported TCS output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return pipeline->polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

/* Gfx7-only: choose the MSRASTMODE_* value for 3DSTATE_SF based on line mode
 * and sample count.
 */
uint32_t
genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
                            VkPolygonMode raster_mode)
{
#if GFX_VER <= 7
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      switch (pipeline->line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         return MSRASTMODE_ON_PATTERN;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         return MSRASTMODE_OFF_PIXEL;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      return pipeline->rasterization_samples > 1 ?
             MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
   }
#else
   unreachable("Only on gen7");
#endif
}

/* Pull the provoking-vertex mode out of the rasterization pNext chain,
 * defaulting to first-vertex per the spec.
 */
static VkProvokingVertexModeEXT
vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);

   return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
                               rs_pv_info->provokingVertexMode;
}

/* Vulkan-enum to hardware-enum translation tables. */
const uint32_t genX(vk_to_intel_cullmode)[] = {
   [VK_CULL_MODE_NONE]           = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]      = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]       = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH
};

const uint32_t genX(vk_to_intel_fillmode)[] = {
   [VK_POLYGON_MODE_FILL]  = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]  = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
};

const uint32_t genX(vk_to_intel_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
   [VK_FRONT_FACE_CLOCKWISE]         = 0
};

#if GFX_VER >= 9
/* Pull the conservative-rasterization mode out of the rasterization pNext
 * chain, defaulting to disabled.
 */
static VkConservativeRasterizationModeEXT
vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);

   return cr ? cr->conservativeRasterizationMode :
               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
}
#endif

void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeEXT line_mode,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
#if GFX_VER >= 8
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX100;
         *msaa_rasterization_enable = true;
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX100;
      *msaa_rasterization_enable = true;
   }
#else
   unreachable("Invalid call");
#endif
}

/* Emit the pipeline's static 3DSTATE_SF/3DSTATE_RASTER state, honoring the
 * dynamic-state bits in `dynamic_states` by leaving those fields zeroed.
 */
static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineInputAssemblyStateCreateInfo *ia_info,
              const VkPipelineRasterizationStateCreateInfo *rs_info,
              const VkPipelineMultisampleStateCreateInfo *ms_info,
              const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   struct GENX(3DSTATE_SF) sf =
   {
      GENX(3DSTATE_SF_header),
   };

   sf.ViewportTransformEnable = true;
   sf.StatisticsEnable = true;
   sf.VertexSubPixelPrecisionSelect = _8Bit;
   sf.AALineDistanceMode = true;

   switch (vk_provoking_vertex_mode(rs_info)) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 0;
      sf.LineStripListProvokingVertexSelect = 0;
      sf.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 2;
      sf.LineStripListProvokingVertexSelect = 1;
      sf.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

#if GFX_VERx10 == 75
   sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
#endif

#if GFX_VER >= 12
   sf.DerefBlockSize = urb_deref_block_size;
#endif

   const struct brw_vue_prog_data *last_vue_prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);

   if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
      sf.PointWidthSource = Vertex;
   } else {
      sf.PointWidthSource = State;
      sf.PointWidth = 1.0;
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_RASTER) raster = {
      GENX(3DSTATE_RASTER_header),
   };
#else
   /* Pre-Gfx8, the RASTER fields live in 3DSTATE_SF */
#  define raster sf
#endif

   VkPolygonMode raster_mode =
      genX(raster_polygon_mode)(pipeline, ia_info->topology);
   bool dynamic_primitive_topology =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;

   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
    * "Multisample Modes State".
    */
#if GFX_VER >= 8
   if (!dynamic_primitive_topology)
      genX(rasterization_mode)(raster_mode, pipeline->line_mode,
                               &raster.APIMode,
                               &raster.DXMultisampleRasterizationEnable);

   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
    * computations.  If we ever set this bit to a different value, they will
    * need to be updated accordingly.
    */
   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
   raster.ForceMultisampling = false;
#else
   uint32_t ms_rast_mode = 0;

   if (!dynamic_primitive_topology)
      ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);

   raster.MultisampleRasterizationMode = ms_rast_mode;
#endif

   raster.AntialiasingEnable =
      dynamic_primitive_topology ? 0 :
      anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);

   raster.FrontWinding =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
      0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
   raster.CullMode =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
      0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];

   raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.ScissorRectangleEnable = true;

#if GFX_VER >= 9
   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
#elif GFX_VER >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#endif

#if GFX_VER >= 9
   raster.ConservativeRasterizationEnable =
      vk_conservative_rasterization_mode(rs_info) !=
      VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
#endif

   bool depth_bias_enable =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
      0 : rs_info->depthBiasEnable;

   raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
   raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
   raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;

#if GFX_VER == 7
   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
    */
   if (subpass->depth_stencil_attachment) {
      VkFormat vk_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      assert(vk_format_is_depth_or_stencil(vk_format));
      if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
         enum isl_format isl_format =
            anv_get_isl_format(&pipeline->base.device->info, vk_format,
                               VK_IMAGE_ASPECT_DEPTH_BIT,
                               VK_IMAGE_TILING_OPTIMAL);
         sf.DepthBufferSurfaceFormat =
            isl_format_get_depth_format(isl_format, false);
      }
   }
#endif

#if GFX_VER >= 8
   GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
#else
#  undef raster
   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
#endif
}

/* Emit multisample state: sample pattern/locations, 3DSTATE_SAMPLE_MASK and,
 * on Gfx11+, the fragment shading rate (CPS) state.
 */
static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineMultisampleStateCreateInfo *info,
              uint32_t dynamic_states)
{
   /* Only lookup locations if the extension is active, otherwise the default
    * ones will be used either at device initialization time or through
    * 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
    */
   if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
      /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
       * will be emitted dynamically, so skip it here.  On Gfx8+
       * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
       */
      if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
#if GFX_VER >= 8
         genX(emit_sample_pattern)(&pipeline->base.batch,
                                   pipeline->dynamic_state.sample_locations.samples,
                                   pipeline->dynamic_state.sample_locations.locations);
#endif
      }

      genX(emit_multisample)(&pipeline->base.batch,
                             pipeline->dynamic_state.sample_locations.samples,
                             pipeline->dynamic_state.sample_locations.locations);
   } else {
      /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to
       * modify for sample locations, so we don't have to emit it dynamically.
       */
#if GFX_VER >= 8
      genX(emit_multisample)(&pipeline->base.batch,
                             info ?
                                info->rasterizationSamples : 1,
                             NULL);
#endif
   }

   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
    */
#if GFX_VER >= 8
   uint32_t sample_mask = 0xffff;
#else
   uint32_t sample_mask = 0xff;
#endif

   if (info && info->pSampleMask)
      sample_mask &= info->pSampleMask[0];

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = sample_mask;
   }

   pipeline->cps_state = ANV_STATE_NULL;
#if GFX_VER >= 11
   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
       pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
#if GFX_VER >= 12
      /* Gfx12+ programs shading rate through per-viewport CPS_STATE
       * structures allocated from the dynamic state pool.
       */
      struct anv_device *device = pipeline->base.device;
      const uint32_t num_dwords =
         GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
      pipeline->cps_state =
         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
#endif

      genX(emit_shading_rate)(&pipeline->base.batch,
                              pipeline,
                              pipeline->cps_state,
                              &pipeline->dynamic_state);
   }
#endif
}

/* Vulkan-enum to hardware-enum translation tables. */
const uint32_t genX(vk_to_intel_logic_op)[] = {
   [VK_LOGIC_OP_COPY]          = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]         = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]           = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]   = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]  = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]         = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]           = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]            = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]           = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]    = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]        = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]    = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]   = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]          = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]           = LOGICOP_SET,
};

static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO]                     = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                      = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]                = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]      = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]                = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]      = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]                = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]      = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]                = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]      = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]           = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR] = BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]           = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA] = BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]       = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]               = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]     = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]               = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]     = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD]              = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]         = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]              = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]              = BLENDFUNCTION_MAX,
};

const uint32_t genX(vk_to_intel_compare_op)[] = {
   [VK_COMPARE_OP_NEVER]            = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS]             = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL]            = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]    = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]          = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL]        = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS]           = PREFILTEROP_ALWAYS,
};

const uint32_t genX(vk_to_intel_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP]                = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]             = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]              = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]  = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]  = STENCILOP_DECR,
};

const uint32_t genX(vk_to_intel_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                   = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                    = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                   = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]               = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                 = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]     = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]    = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

/* This function sanitizes the VkStencilOpState by looking at the compare ops
 * and trying to determine whether or not a given stencil op can ever actually
 * occur.  Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
 * This function returns true if, after sanitation, any of the stencil ops are
 * set to something other than VK_STENCIL_OP_KEEP.
 */
static bool
sanitize_stencil_face(VkStencilOpState *face,
                      VkCompareOp depthCompareOp)
{
   /* If compareOp is ALWAYS then the stencil test will never fail and failOp
    * will never happen.
Set failOp to KEEP in this case.950*/951if (face->compareOp == VK_COMPARE_OP_ALWAYS)952face->failOp = VK_STENCIL_OP_KEEP;953954/* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth955* or stencil tests will fail and passOp will never happen.956*/957if (face->compareOp == VK_COMPARE_OP_NEVER ||958depthCompareOp == VK_COMPARE_OP_NEVER)959face->passOp = VK_STENCIL_OP_KEEP;960961/* If compareOp is NEVER or depthCompareOp is ALWAYS then either the962* stencil test will fail or the depth test will pass. In either case,963* depthFailOp will never happen.964*/965if (face->compareOp == VK_COMPARE_OP_NEVER ||966depthCompareOp == VK_COMPARE_OP_ALWAYS)967face->depthFailOp = VK_STENCIL_OP_KEEP;968969return face->failOp != VK_STENCIL_OP_KEEP ||970face->depthFailOp != VK_STENCIL_OP_KEEP ||971face->passOp != VK_STENCIL_OP_KEEP;972}973974/* Intel hardware is fairly sensitive to whether or not depth/stencil writes975* are enabled. In the presence of discards, it's fairly easy to get into the976* non-promoted case which means a fairly big performance hit. From the Iron977* Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":978*979* "Non-promoted depth (N) is active whenever the depth test can be done980* early but it cannot determine whether or not to write source depth to981* the depth buffer, therefore the depth write must be performed post pixel982* shader. This includes cases where the pixel shader can kill pixels,983* including via sampler chroma key, as well as cases where the alpha test984* function is enabled, which kills pixels based on a programmable alpha985* test. In this case, even if the depth test fails, the pixel cannot be986* killed if a stencil write is indicated. Whether or not the stencil write987* happens depends on whether or not the pixel is killed later. In these988* cases if stencil test fails and stencil writes are off, the pixels can989* also be killed early. 
If stencil writes are enabled, the pixels must be990* treated as Computed depth (described above)."991*992* The same thing as mentioned in the stencil case can happen in the depth993* case as well if it thinks it writes depth but, thanks to the depth test994* being GL_EQUAL, the write doesn't actually matter. A little extra work995* up-front to try and disable depth and stencil writes can make a big996* difference.997*998* Unfortunately, the way depth and stencil testing is specified, there are999* many case where, regardless of depth/stencil writes being enabled, nothing1000* actually gets written due to some other bit of state being set. This1001* function attempts to "sanitize" the depth stencil state and disable writes1002* and sometimes even testing whenever possible.1003*/1004static void1005sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,1006bool *stencilWriteEnable,1007VkImageAspectFlags ds_aspects)1008{1009*stencilWriteEnable = state->stencilTestEnable;10101011/* If the depth test is disabled, we won't be writing anything. Make sure we1012* treat the test as always passing later on as well.1013*1014* Also, the Vulkan spec requires that if either depth or stencil is not1015* present, the pipeline is to act as if the test silently passes. 
In that1016* case we won't write either.1017*/1018if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {1019state->depthWriteEnable = false;1020state->depthCompareOp = VK_COMPARE_OP_ALWAYS;1021}10221023if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {1024*stencilWriteEnable = false;1025state->front.compareOp = VK_COMPARE_OP_ALWAYS;1026state->back.compareOp = VK_COMPARE_OP_ALWAYS;1027}10281029/* If the stencil test is enabled and always fails, then we will never get1030* to the depth test so we can just disable the depth test entirely.1031*/1032if (state->stencilTestEnable &&1033state->front.compareOp == VK_COMPARE_OP_NEVER &&1034state->back.compareOp == VK_COMPARE_OP_NEVER) {1035state->depthTestEnable = false;1036state->depthWriteEnable = false;1037}10381039/* If depthCompareOp is EQUAL then the value we would be writing to the1040* depth buffer is the same as the value that's already there so there's no1041* point in writing it.1042*/1043if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)1044state->depthWriteEnable = false;10451046/* If the stencil ops are such that we don't actually ever modify the1047* stencil buffer, we should disable writes.1048*/1049if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&1050!sanitize_stencil_face(&state->back, state->depthCompareOp))1051*stencilWriteEnable = false;10521053/* If the depth test always passes and we never write out depth, that's the1054* same as if the depth test is disabled entirely.1055*/1056if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&1057!state->depthWriteEnable)1058state->depthTestEnable = false;10591060/* If the stencil test always passes and we never write out stencil, that's1061* the same as if the stencil test is disabled entirely.1062*/1063if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&1064state->back.compareOp == VK_COMPARE_OP_ALWAYS &&1065!*stencilWriteEnable)1066state->stencilTestEnable = false;1067}10681069static void1070emit_ds_state(struct 
anv_graphics_pipeline *pipeline,1071const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,1072const uint32_t dynamic_states,1073const struct anv_render_pass *pass,1074const struct anv_subpass *subpass)1075{1076#if GFX_VER == 71077# define depth_stencil_dw pipeline->gfx7.depth_stencil_state1078#elif GFX_VER == 81079# define depth_stencil_dw pipeline->gfx8.wm_depth_stencil1080#else1081# define depth_stencil_dw pipeline->gfx9.wm_depth_stencil1082#endif10831084if (pCreateInfo == NULL) {1085/* We're going to OR this together with the dynamic state. We need1086* to make sure it's initialized to something useful.1087*/1088pipeline->writes_stencil = false;1089pipeline->stencil_test_enable = false;1090pipeline->writes_depth = false;1091pipeline->depth_test_enable = false;1092pipeline->depth_bounds_test_enable = false;1093memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));1094return;1095}10961097VkImageAspectFlags ds_aspects = 0;1098if (subpass->depth_stencil_attachment) {1099VkFormat depth_stencil_format =1100pass->attachments[subpass->depth_stencil_attachment->attachment].format;1101ds_aspects = vk_format_aspects(depth_stencil_format);1102}11031104VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;1105sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);1106pipeline->stencil_test_enable = info.stencilTestEnable;1107pipeline->writes_depth = info.depthWriteEnable;1108pipeline->depth_test_enable = info.depthTestEnable;1109pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;11101111bool dynamic_stencil_op =1112dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;11131114#if GFX_VER <= 71115struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {1116#else1117struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {1118#endif1119.DepthTestEnable =1120dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?11210 : info.depthTestEnable,11221123.DepthBufferWriteEnable =1124dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?11250 : 
info.depthWriteEnable,11261127.DepthTestFunction =1128dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?11290 : genX(vk_to_intel_compare_op)[info.depthCompareOp],11301131.DoubleSidedStencilEnable = true,11321133.StencilTestEnable =1134dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?11350 : info.stencilTestEnable,11361137.StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],1138.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],1139.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],1140.StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],1141.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],1142.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],1143.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],1144.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],1145};11461147if (dynamic_stencil_op) {1148depth_stencil.StencilFailOp = 0;1149depth_stencil.StencilPassDepthPassOp = 0;1150depth_stencil.StencilPassDepthFailOp = 0;1151depth_stencil.StencilTestFunction = 0;1152depth_stencil.BackfaceStencilFailOp = 0;1153depth_stencil.BackfaceStencilPassDepthPassOp = 0;1154depth_stencil.BackfaceStencilPassDepthFailOp = 0;1155depth_stencil.BackfaceStencilTestFunction = 0;1156}11571158#if GFX_VER <= 71159GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);1160#else1161GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);1162#endif1163}11641165static bool1166is_dual_src_blend_factor(VkBlendFactor factor)1167{1168return factor == VK_BLEND_FACTOR_SRC1_COLOR ||1169factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||1170factor == VK_BLEND_FACTOR_SRC1_ALPHA ||1171factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;1172}11731174static inline uint32_t *1175write_disabled_blend(uint32_t *state)1176{1177struct GENX(BLEND_STATE_ENTRY) entry = 
{1178.WriteDisableAlpha = true,1179.WriteDisableRed = true,1180.WriteDisableGreen = true,1181.WriteDisableBlue = true,1182};1183GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);1184return state + GENX(BLEND_STATE_ENTRY_length);1185}11861187static void1188emit_cb_state(struct anv_graphics_pipeline *pipeline,1189const VkPipelineColorBlendStateCreateInfo *info,1190const VkPipelineMultisampleStateCreateInfo *ms_info,1191uint32_t dynamic_states)1192{1193struct anv_device *device = pipeline->base.device;1194const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);11951196struct GENX(BLEND_STATE) blend_state = {1197#if GFX_VER >= 81198.AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,1199.AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,1200#endif1201};12021203uint32_t surface_count = 0;1204struct anv_pipeline_bind_map *map;1205if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {1206map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;1207surface_count = map->surface_count;1208}12091210const uint32_t num_dwords = GENX(BLEND_STATE_length) +1211GENX(BLEND_STATE_ENTRY_length) * surface_count;1212uint32_t *blend_state_start, *state_pos;12131214if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1215ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {1216const struct intel_device_info *devinfo = &pipeline->base.device->info;1217blend_state_start = devinfo->ver >= 8 ?1218pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;1219pipeline->blend_state = ANV_STATE_NULL;1220} else {1221pipeline->blend_state =1222anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);1223blend_state_start = pipeline->blend_state.map;1224}1225state_pos = blend_state_start;12261227bool has_writeable_rt = false;1228state_pos += GENX(BLEND_STATE_length);1229#if GFX_VER >= 81230struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };1231#endif1232for (unsigned i = 0; i < surface_count; i++) {1233struct anv_pipeline_binding *binding = 
&map->surface_to_descriptor[i];12341235/* All color attachments are at the beginning of the binding table */1236if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)1237break;12381239/* We can have at most 8 attachments */1240assert(i < MAX_RTS);12411242if (info == NULL || binding->index >= info->attachmentCount) {1243state_pos = write_disabled_blend(state_pos);1244continue;1245}12461247if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {1248state_pos = write_disabled_blend(state_pos);1249continue;1250}12511252const VkPipelineColorBlendAttachmentState *a =1253&info->pAttachments[binding->index];12541255struct GENX(BLEND_STATE_ENTRY) entry = {1256#if GFX_VER < 81257.AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,1258.AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,1259#endif1260.LogicOpEnable = info->logicOpEnable,1261.LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?12620: genX(vk_to_intel_logic_op)[info->logicOp],12631264/* Vulkan specification 1.2.168, VkLogicOp:1265*1266* "Logical operations are controlled by the logicOpEnable and1267* logicOp members of VkPipelineColorBlendStateCreateInfo. 
If1268* logicOpEnable is VK_TRUE, then a logical operation selected by1269* logicOp is applied between each color attachment and the1270* fragment’s corresponding output value, and blending of all1271* attachments is treated as if it were disabled."1272*1273* From the Broadwell PRM Volume 2d: Command Reference: Structures:1274* BLEND_STATE_ENTRY:1275*1276* "Enabling LogicOp and Color Buffer Blending at the same time is1277* UNDEFINED"1278*/1279.ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,1280.ColorClampRange = COLORCLAMP_RTFORMAT,1281.PreBlendColorClampEnable = true,1282.PostBlendColorClampEnable = true,1283.SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],1284.DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],1285.ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],1286.SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],1287.DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],1288.AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],1289.WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),1290.WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),1291.WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),1292.WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),1293};12941295if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||1296a->dstColorBlendFactor != a->dstAlphaBlendFactor ||1297a->colorBlendOp != a->alphaBlendOp) {1298#if GFX_VER >= 81299blend_state.IndependentAlphaBlendEnable = true;1300#else1301entry.IndependentAlphaBlendEnable = true;1302#endif1303}13041305/* The Dual Source Blending documentation says:1306*1307* "If SRC1 is included in a src/dst blend factor and1308* a DualSource RT Write message is not used, results1309* are UNDEFINED. 
(This reflects the same restriction in DX APIs,1310* where undefined results are produced if “o1” is not written1311* by a PS – there are no default values defined)."1312*1313* There is no way to gracefully fix this undefined situation1314* so we just disable the blending to prevent possible issues.1315*/1316if (!wm_prog_data->dual_src_blend &&1317(is_dual_src_blend_factor(a->srcColorBlendFactor) ||1318is_dual_src_blend_factor(a->dstColorBlendFactor) ||1319is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||1320is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {1321vk_debug_report(&device->physical->instance->vk,1322VK_DEBUG_REPORT_WARNING_BIT_EXT,1323&device->vk.base, 0, 0, "anv",1324"Enabled dual-src blend factors without writing both targets "1325"in the shader. Disabling blending to avoid GPU hangs.");1326entry.ColorBufferBlendEnable = false;1327}13281329if (a->colorWriteMask != 0)1330has_writeable_rt = true;13311332/* Our hardware applies the blend factor prior to the blend function1333* regardless of what function is used. Technically, this means the1334* hardware can do MORE than GL or Vulkan specify. 
However, it also1335* means that, for MIN and MAX, we have to stomp the blend factor to1336* ONE to make it a no-op.1337*/1338if (a->colorBlendOp == VK_BLEND_OP_MIN ||1339a->colorBlendOp == VK_BLEND_OP_MAX) {1340entry.SourceBlendFactor = BLENDFACTOR_ONE;1341entry.DestinationBlendFactor = BLENDFACTOR_ONE;1342}1343if (a->alphaBlendOp == VK_BLEND_OP_MIN ||1344a->alphaBlendOp == VK_BLEND_OP_MAX) {1345entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;1346entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;1347}1348GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);1349state_pos += GENX(BLEND_STATE_ENTRY_length);1350#if GFX_VER >= 81351if (i == 0)1352bs0 = entry;1353#endif1354}13551356#if GFX_VER >= 81357struct GENX(3DSTATE_PS_BLEND) blend = {1358GENX(3DSTATE_PS_BLEND_header),1359};1360blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;1361blend.HasWriteableRT = has_writeable_rt;1362blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;1363blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;1364blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;1365blend.SourceBlendFactor = bs0.SourceBlendFactor;1366blend.DestinationBlendFactor = bs0.DestinationBlendFactor;1367blend.AlphaTestEnable = false;1368blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;13691370if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1371ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {1372GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);1373} else {1374anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)1375_blend = blend;1376}1377#else1378(void)has_writeable_rt;1379#endif13801381GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);13821383if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1384ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {1385anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {1386bsp.BlendStatePointer = pipeline->blend_state.offset;1387#if GFX_VER >= 
81388bsp.BlendStatePointerValid = true;1389#endif1390}1391}1392}13931394static void1395emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,1396const VkPipelineInputAssemblyStateCreateInfo *ia_info,1397const VkPipelineViewportStateCreateInfo *vp_info,1398const VkPipelineRasterizationStateCreateInfo *rs_info,1399const uint32_t dynamic_states)1400{1401const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);1402(void) wm_prog_data;14031404struct GENX(3DSTATE_CLIP) clip = {1405GENX(3DSTATE_CLIP_header),1406};14071408clip.ClipEnable = true;1409clip.StatisticsEnable = true;1410clip.EarlyCullEnable = true;1411clip.APIMode = APIMODE_D3D;1412clip.GuardbandClipTestEnable = true;14131414/* Only enable the XY clip test when the final polygon rasterization1415* mode is VK_POLYGON_MODE_FILL. We want to leave it disabled for1416* points and lines so we get "pop-free" clipping.1417*/1418VkPolygonMode raster_mode =1419genX(raster_polygon_mode)(pipeline, ia_info->topology);1420clip.ViewportXYClipTestEnable =1421dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?14220 : (raster_mode == VK_POLYGON_MODE_FILL);14231424#if GFX_VER >= 81425clip.VertexSubPixelPrecisionSelect = _8Bit;1426#endif1427clip.ClipMode = CLIPMODE_NORMAL;14281429switch (vk_provoking_vertex_mode(rs_info)) {1430case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:1431clip.TriangleStripListProvokingVertexSelect = 0;1432clip.LineStripListProvokingVertexSelect = 0;1433clip.TriangleFanProvokingVertexSelect = 1;1434break;14351436case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:1437clip.TriangleStripListProvokingVertexSelect = 2;1438clip.LineStripListProvokingVertexSelect = 1;1439clip.TriangleFanProvokingVertexSelect = 2;1440break;14411442default:1443unreachable("Invalid provoking vertex mode");1444}14451446clip.MinimumPointWidth = 0.125;1447clip.MaximumPointWidth = 255.875;14481449const struct brw_vue_prog_data *last =1450anv_pipeline_get_last_vue_prog_data(pipeline);14511452/* From the Vulkan 1.0.45 
spec:1453*1454* "If the last active vertex processing stage shader entry point's1455* interface does not include a variable decorated with1456* ViewportIndex, then the first viewport is used."1457*/1458if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {1459clip.MaximumVPIndex = vp_info->viewportCount > 0 ?1460vp_info->viewportCount - 1 : 0;1461} else {1462clip.MaximumVPIndex = 0;1463}14641465/* From the Vulkan 1.0.45 spec:1466*1467* "If the last active vertex processing stage shader entry point's1468* interface does not include a variable decorated with Layer, then1469* the first layer is used."1470*/1471clip.ForceZeroRTAIndexEnable =1472!(last->vue_map.slots_valid & VARYING_BIT_LAYER);14731474#if GFX_VER == 71475clip.FrontWinding = genX(vk_to_intel_front_face)[rs_info->frontFace];1476clip.CullMode = genX(vk_to_intel_cullmode)[rs_info->cullMode];1477clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;1478clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;1479clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;1480#else1481clip.NonPerspectiveBarycentricEnable = wm_prog_data ?1482(wm_prog_data->barycentric_interp_modes &1483BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;1484#endif14851486GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);1487}14881489static void1490emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,1491const VkPipelineRasterizationStateCreateInfo *rs_info,1492const uint32_t dynamic_states)1493{1494const struct brw_vue_prog_data *prog_data =1495anv_pipeline_get_last_vue_prog_data(pipeline);1496const struct brw_vue_map *vue_map = &prog_data->vue_map;14971498nir_xfb_info *xfb_info;1499if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))1500xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;1501else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))1502xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;1503else1504xfb_info = 
pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;15051506#if GFX_VER == 71507# define streamout_state_dw pipeline->gfx7.streamout_state1508#else1509# define streamout_state_dw pipeline->gfx8.streamout_state1510#endif15111512struct GENX(3DSTATE_STREAMOUT) so = {1513GENX(3DSTATE_STREAMOUT_header),1514.RenderingDisable =1515(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?15160 : rs_info->rasterizerDiscardEnable,1517};15181519if (xfb_info) {1520so.SOFunctionEnable = true;1521so.SOStatisticsEnable = true;15221523switch (vk_provoking_vertex_mode(rs_info)) {1524case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:1525so.ReorderMode = LEADING;1526break;15271528case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:1529so.ReorderMode = TRAILING;1530break;15311532default:1533unreachable("Invalid provoking vertex mode");1534}15351536const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =1537vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);1538so.RenderStreamSelect = stream_info ?1539stream_info->rasterizationStream : 0;15401541#if GFX_VER >= 81542so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;1543so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;1544so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;1545so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;1546#else1547pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;1548pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;1549pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;1550pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;15511552/* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which1553* is a bit inconvenient because we don't know what buffers will1554* actually be enabled until draw time. 
We do our best here by1555* setting them based on buffers_written and we disable them1556* as-needed at draw time by setting EndAddress = BaseAddress.1557*/1558so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);1559so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);1560so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);1561so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);1562#endif15631564int urb_entry_read_offset = 0;1565int urb_entry_read_length =1566(prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;15671568/* We always read the whole vertex. This could be reduced at some1569* point by reading less and offsetting the register index in the1570* SO_DECLs.1571*/1572so.Stream0VertexReadOffset = urb_entry_read_offset;1573so.Stream0VertexReadLength = urb_entry_read_length - 1;1574so.Stream1VertexReadOffset = urb_entry_read_offset;1575so.Stream1VertexReadLength = urb_entry_read_length - 1;1576so.Stream2VertexReadOffset = urb_entry_read_offset;1577so.Stream2VertexReadLength = urb_entry_read_length - 1;1578so.Stream3VertexReadOffset = urb_entry_read_offset;1579so.Stream3VertexReadLength = urb_entry_read_length - 1;1580}15811582if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {1583GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);1584} else {1585anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)1586_so = so;1587}15881589if (xfb_info) {1590struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];1591int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};1592int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};15931594memset(so_decl, 0, sizeof(so_decl));15951596for (unsigned i = 0; i < xfb_info->output_count; i++) {1597const nir_xfb_output_info *output = &xfb_info->outputs[i];1598unsigned buffer = output->buffer;1599unsigned stream = xfb_info->buffer_to_stream[buffer];16001601/* Our hardware is unusual in that it requires us to program SO_DECLs1602* for fake "hole" components, rather than simply taking the 
offset1603* for each real varying. Each hole can have size 1, 2, 3, or 4; we1604* program as many size = 4 holes as we can, then a final hole to1605* accommodate the final 1, 2, or 3 remaining.1606*/1607int hole_dwords = (output->offset - next_offset[buffer]) / 4;1608while (hole_dwords > 0) {1609so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1610.HoleFlag = 1,1611.OutputBufferSlot = buffer,1612.ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,1613};1614hole_dwords -= 4;1615}16161617int varying = output->location;1618uint8_t component_mask = output->component_mask;1619/* VARYING_SLOT_PSIZ contains four scalar fields packed together:1620* - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x1621* - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y1622* - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z1623* - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w1624*/1625if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {1626varying = VARYING_SLOT_PSIZ;1627component_mask = 1 << 0; // SO_DECL_COMPMASK_X1628} else if (varying == VARYING_SLOT_LAYER) {1629varying = VARYING_SLOT_PSIZ;1630component_mask = 1 << 1; // SO_DECL_COMPMASK_Y1631} else if (varying == VARYING_SLOT_VIEWPORT) {1632varying = VARYING_SLOT_PSIZ;1633component_mask = 1 << 2; // SO_DECL_COMPMASK_Z1634} else if (varying == VARYING_SLOT_PSIZ) {1635component_mask = 1 << 3; // SO_DECL_COMPMASK_W1636}16371638next_offset[buffer] = output->offset +1639__builtin_popcount(component_mask) * 4;16401641const int slot = vue_map->varying_to_slot[varying];1642if (slot < 0) {1643/* This can happen if the shader never writes to the varying.1644* Insert a hole instead of actual varying data.1645*/1646so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1647.HoleFlag = true,1648.OutputBufferSlot = buffer,1649.ComponentMask = component_mask,1650};1651} else {1652so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1653.OutputBufferSlot = buffer,1654.RegisterIndex = slot,1655.ComponentMask = 
component_mask,1656};1657}1658}16591660int max_decls = 0;1661for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)1662max_decls = MAX2(max_decls, decls[s]);16631664uint8_t sbs[MAX_XFB_STREAMS] = { };1665for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {1666if (xfb_info->buffers_written & (1 << b))1667sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;1668}16691670uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,1671GENX(3DSTATE_SO_DECL_LIST),1672.StreamtoBufferSelects0 = sbs[0],1673.StreamtoBufferSelects1 = sbs[1],1674.StreamtoBufferSelects2 = sbs[2],1675.StreamtoBufferSelects3 = sbs[3],1676.NumEntries0 = decls[0],1677.NumEntries1 = decls[1],1678.NumEntries2 = decls[2],1679.NumEntries3 = decls[3]);16801681for (int i = 0; i < max_decls; i++) {1682GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,1683&(struct GENX(SO_DECL_ENTRY)) {1684.Stream0Decl = so_decl[0][i],1685.Stream1Decl = so_decl[1][i],1686.Stream2Decl = so_decl[2][i],1687.Stream3Decl = so_decl[3][i],1688});1689}1690}1691}16921693static uint32_t1694get_sampler_count(const struct anv_shader_bin *bin)1695{1696uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);16971698/* We can potentially have way more than 32 samplers and that's ok.1699* However, the 3DSTATE_XS packets only have 3 bits to specify how1700* many to pre-fetch and all values above 4 are marked reserved.1701*/1702return MIN2(count_by_4, 4);1703}17041705static UNUSED struct anv_address1706get_scratch_address(struct anv_pipeline *pipeline,1707gl_shader_stage stage,1708const struct anv_shader_bin *bin)1709{1710return (struct anv_address) {1711.bo = anv_scratch_pool_alloc(pipeline->device,1712&pipeline->device->scratch_pool,1713stage, bin->prog_data->total_scratch),1714.offset = 0,1715};1716}17171718static UNUSED uint32_t1719get_scratch_space(const struct anv_shader_bin *bin)1720{1721return ffs(bin->prog_data->total_scratch / 2048);1722}17231724static UNUSED uint32_t1725get_scratch_surf(struct anv_pipeline *pipeline,1726const 
/* Emit 3DSTATE_VS for the pipeline's vertex shader.
 *
 * The VS stage is mandatory, so unlike the other emit_3dstate_* helpers
 * there is no disabled-stage path here.  Gen-specific fields are gated
 * with GFX_VER / GFX_VERx10 preprocessor checks.
 */
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const struct anv_shader_bin *vs_bin =
      pipeline->shaders[MESA_SHADER_VERTEX];

   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
      vs.Enable               = true;
      vs.StatisticsEnable     = true;
      vs.KernelStartPointer   = vs_bin->kernel.offset;
#if GFX_VER >= 8
      vs.SIMD8DispatchEnable  =
         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif

      assert(!vs_prog_data->base.base.use_alt_mode);
#if GFX_VER < 11
      vs.SingleVertexDispatch       = false;
#endif
      vs.VectorMaskEnable           = false;
      /* Wa_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
      vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
      vs.FloatingPointMode          = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable    = false;
      /* Field is a 0-based max-thread index, hence the - 1. */
      vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;

      if (GFX_VER == 9 && devinfo->gt == 4 &&
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         /* On Sky Lake GT4, we have experienced some hangs related to the VS
          * cache and tessellation.  It is unknown exactly what is happening
          * but the Haswell docs for the "VS Reference Count Full Force Miss
          * Enable" field of the "Thread Mode" register refer to a HSW bug in
          * which the VUE handle reference count would overflow resulting in
          * internal reference counting bugs.  My (Jason's) best guess is that
          * this bug cropped back up on SKL GT4 when we suddenly had more
          * threads in play than any previous gfx9 hardware.
          *
          * What we do know for sure is that setting this bit when
          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
          * Disabling the vertex cache with tessellation shaders should only
          * have a minor performance impact as the tessellation shaders are
          * likely generating and processing far more geometry than the vertex
          * stage.
          */
         vs.VertexCacheDisable = true;
      }

      vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset = 0;
      vs.DispatchGRFStartRegisterForURBData =
         vs_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER >= 8
      vs.UserClipDistanceClipTestEnableBitmask =
         vs_prog_data->base.clip_distance_mask;
      vs.UserClipDistanceCullTestEnableBitmask =
         vs_prog_data->base.cull_distance_mask;
#endif

      /* Xe-HP+ uses a scratch surface; older gens take a base address plus
       * a per-thread size encoding.
       */
#if GFX_VERx10 >= 125
      vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, vs_bin);
#else
      vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
#endif
   }
}
/* Emit 3DSTATE_HS, 3DSTATE_TE and 3DSTATE_DS for the tessellation stages.
 *
 * If the pipeline has no tessellation evaluation shader, all three packets
 * are emitted zeroed (stage disabled) and we return early.  tess_info may
 * be NULL; it is only consulted for the optional domain-origin extension
 * struct.
 */
static void
emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
                      const VkPipelineTessellationStateCreateInfo *tess_info)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      /* Zeroed packets disable the stages. */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
      return;
   }

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct anv_shader_bin *tcs_bin =
      pipeline->shaders[MESA_SHADER_TESS_CTRL];
   const struct anv_shader_bin *tes_bin =
      pipeline->shaders[MESA_SHADER_TESS_EVAL];

   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* Wa_1606682166 */
      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;

#if GFX_VER >= 12
      /* Wa_1604578095:
       *
       * Hang occurs when the number of max threads is less than 2 times
       * the number of instance count. The number of max threads must be
       * more than 2 times the number of instance count.
       */
      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
#endif

      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      hs.IncludeVertexHandles = true;
      hs.InstanceCount = tcs_prog_data->instances - 1;

      hs.VertexURBEntryReadLength = 0;
      hs.VertexURBEntryReadOffset = 0;
      /* The GRF start register is split across two fields on gfx12+:
       * low 5 bits here, remaining bits in ...URBData5 below.
       */
      hs.DispatchGRFStartRegisterForURBData =
         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
#if GFX_VER >= 12
      hs.DispatchGRFStartRegisterForURBData5 =
         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
#endif

#if GFX_VERx10 >= 125
      hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tcs_bin);
#else
      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
      hs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
#endif

#if GFX_VER == 12
      /* Patch Count threshold specifies the maximum number of patches that
       * will be accumulated before a thread dispatch is forced.
       */
      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
#endif

#if GFX_VER >= 9
      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
#endif
   }

   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
      tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;

   VkTessellationDomainOrigin uv_origin =
      domain_origin_state ? domain_origin_state->domainOrigin :
                            VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
      te.Partitioning = tes_prog_data->partitioning;

      if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
         te.OutputTopology = tes_prog_data->output_topology;
      } else {
         /* When the origin is upper-left, we have to flip the winding order */
         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
            te.OutputTopology = OUTPUT_TRI_CW;
         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
            te.OutputTopology = OUTPUT_TRI_CCW;
         } else {
            /* Points and lines have no winding to flip. */
            te.OutputTopology = tes_prog_data->output_topology;
         }
      }

      te.TEDomain = tes_prog_data->domain;
      te.TEEnable = true;
      te.MaximumTessellationFactorOdd = 63.0;
      te.MaximumTessellationFactorNotOdd = 64.0;
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
      ds.Enable = true;
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = tes_bin->kernel.offset;
      /* Wa_1606682166 */
      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

      /* Triangle domains need the W barycentric coordinate computed. */
      ds.ComputeWCoordinateEnable =
         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
      ds.PatchURBEntryReadOffset = 0;
      ds.DispatchGRFStartRegisterForURBData =
         tes_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER >= 8
#if GFX_VER < 11
      ds.DispatchMode =
         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
            DISPATCH_MODE_SIMD4X2;
#else
      /* SIMD4x2 is gone on gfx11+; the compiler only emits SIMD8. */
      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif

      ds.UserClipDistanceClipTestEnableBitmask =
         tes_prog_data->base.clip_distance_mask;
      ds.UserClipDistanceCullTestEnableBitmask =
         tes_prog_data->base.cull_distance_mask;
#endif

#if GFX_VERx10 >= 125
      ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);
#else
      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
      ds.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
#endif
   }
}
0 : get_sampler_count(tes_bin);1920ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;1921ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;19221923ds.ComputeWCoordinateEnable =1924tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;19251926ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;1927ds.PatchURBEntryReadOffset = 0;1928ds.DispatchGRFStartRegisterForURBData =1929tes_prog_data->base.base.dispatch_grf_start_reg;19301931#if GFX_VER >= 81932#if GFX_VER < 111933ds.DispatchMode =1934tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?1935DISPATCH_MODE_SIMD8_SINGLE_PATCH :1936DISPATCH_MODE_SIMD4X2;1937#else1938assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);1939ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;1940#endif19411942ds.UserClipDistanceClipTestEnableBitmask =1943tes_prog_data->base.clip_distance_mask;1944ds.UserClipDistanceCullTestEnableBitmask =1945tes_prog_data->base.cull_distance_mask;1946#endif19471948#if GFX_VERx10 >= 1251949ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);1950#else1951ds.PerThreadScratchSpace = get_scratch_space(tes_bin);1952ds.ScratchSpaceBasePointer =1953get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);1954#endif1955}1956}19571958static void1959emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)1960{1961const struct intel_device_info *devinfo = &pipeline->base.device->info;1962const struct anv_shader_bin *gs_bin =1963pipeline->shaders[MESA_SHADER_GEOMETRY];19641965if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {1966anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);1967return;1968}19691970const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);19711972anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {1973gs.Enable = true;1974gs.StatisticsEnable = true;1975gs.KernelStartPointer = gs_bin->kernel.offset;1976gs.DispatchMode = gs_prog_data->base.dispatch_mode;19771978gs.SingleProgramFlow = 
false;1979gs.VectorMaskEnable = false;1980/* Wa_1606682166 */1981gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);1982gs.BindingTableEntryCount = gs_bin->bind_map.surface_count;1983gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;1984gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;19851986if (GFX_VER == 8) {1987/* Broadwell is weird. It needs us to divide by 2. */1988gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;1989} else {1990gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;1991}19921993gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;1994gs.OutputTopology = gs_prog_data->output_topology;1995gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;1996gs.ControlDataFormat = gs_prog_data->control_data_format;1997gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;1998gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;1999gs.ReorderMode = TRAILING;20002001#if GFX_VER >= 82002gs.ExpectedVertexCount = gs_prog_data->vertices_in;2003gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;2004gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?2005gs_prog_data->static_vertex_count : 0;2006#endif20072008gs.VertexURBEntryReadOffset = 0;2009gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;2010gs.DispatchGRFStartRegisterForURBData =2011gs_prog_data->base.base.dispatch_grf_start_reg;20122013#if GFX_VER >= 82014gs.UserClipDistanceClipTestEnableBitmask =2015gs_prog_data->base.clip_distance_mask;2016gs.UserClipDistanceCullTestEnableBitmask =2017gs_prog_data->base.cull_distance_mask;2018#endif20192020#if GFX_VERx10 >= 1252021gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, gs_bin);2022#else2023gs.PerThreadScratchSpace = get_scratch_space(gs_bin);2024gs.ScratchSpaceBasePointer =2025get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);2026#endif2027}2028}20292030static 
bool2031has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,2032const VkPipelineColorBlendStateCreateInfo *blend)2033{2034const struct anv_shader_bin *shader_bin =2035pipeline->shaders[MESA_SHADER_FRAGMENT];2036if (!shader_bin)2037return false;20382039if (!pipeline->dynamic_state.color_writes)2040return false;20412042const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;2043for (int i = 0; i < bind_map->surface_count; i++) {2044struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];20452046if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)2047continue;20482049if (binding->index == UINT32_MAX)2050continue;20512052if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)2053return true;2054}20552056return false;2057}20582059static void2060emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,2061const VkPipelineInputAssemblyStateCreateInfo *ia,2062const VkPipelineRasterizationStateCreateInfo *raster,2063const VkPipelineColorBlendStateCreateInfo *blend,2064const VkPipelineMultisampleStateCreateInfo *multisample,2065const VkPipelineRasterizationLineStateCreateInfoEXT *line,2066const uint32_t dynamic_states)2067{2068const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);20692070struct GENX(3DSTATE_WM) wm = {2071GENX(3DSTATE_WM_header),2072};2073wm.StatisticsEnable = true;2074wm.LineEndCapAntialiasingRegionWidth = _05pixels;2075wm.LineAntialiasingRegionWidth = _10pixels;2076wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;20772078if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2079if (wm_prog_data->early_fragment_tests) {2080wm.EarlyDepthStencilControl = EDSC_PREPS;2081} else if (wm_prog_data->has_side_effects) {2082wm.EarlyDepthStencilControl = EDSC_PSEXEC;2083} else {2084wm.EarlyDepthStencilControl = EDSC_NORMAL;2085}20862087#if GFX_VER >= 82088/* Gen8 hardware tries to compute ThreadDispatchEnable for us but2089* doesn't take into 
/* Emit (or pack for later dynamic emission) 3DSTATE_WM.
 *
 * Also computes pipeline->force_fragment_thread_dispatch as a side
 * effect.  If any of the states this packet depends on are dynamic, the
 * packed DWORDs are stored in pipeline->gfx8.wm / gfx7.wm so the command
 * buffer can patch and emit them at draw time; otherwise the packet is
 * emitted into the pipeline batch directly.
 */
static void
emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
                const VkPipelineInputAssemblyStateCreateInfo *ia,
                const VkPipelineRasterizationStateCreateInfo *raster,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample,
                const VkPipelineRasterizationLineStateCreateInfoEXT *line,
                const uint32_t dynamic_states)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(3DSTATE_WM) wm = {
      GENX(3DSTATE_WM_header),
   };
   wm.StatisticsEnable = true;
   wm.LineEndCapAntialiasingRegionWidth = _05pixels;
   wm.LineAntialiasingRegionWidth = _10pixels;
   wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      /* Early depth/stencil: forced-early when the shader opted in,
       * forced-after-PS when the shader has side effects, normal otherwise.
       */
      if (wm_prog_data->early_fragment_tests) {
         wm.EarlyDepthStencilControl = EDSC_PREPS;
      } else if (wm_prog_data->has_side_effects) {
         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
      } else {
         wm.EarlyDepthStencilControl = EDSC_NORMAL;
      }

#if GFX_VER >= 8
      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
       * doesn't take into account KillPixels when no depth or stencil
       * writes are enabled.  In order for occlusion queries to work
       * correctly with no attachments, we need to force-enable PS thread
       * dispatch.
       *
       * The BDW docs are pretty clear that that this bit isn't validated
       * and probably shouldn't be used in production:
       *
       *    "This must always be set to Normal. This field should not be
       *    tested for functional validation."
       *
       * Unfortunately, however, the other mechanism we have for doing this
       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
       * Given two bad options, we choose the one which works.
       */
      pipeline->force_fragment_thread_dispatch =
         wm_prog_data->has_side_effects ||
         wm_prog_data->uses_kill;

      if (pipeline->force_fragment_thread_dispatch ||
          !has_color_buffer_write_enabled(pipeline, blend)) {
         /* Only set this value in non dynamic mode. */
         wm.ForceThreadDispatchEnable =
            !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
      }
#endif

      wm.BarycentricInterpolationMode =
         wm_prog_data->barycentric_interp_modes;

#if GFX_VER < 8
      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;

      /* If the subpass has a depth or stencil self-dependency, then we
       * need to force the hardware to do the depth/stencil write *after*
       * fragment shader execution.  Otherwise, the writes may hit memory
       * before we get around to fetching from the input attachment and we
       * may get the depth or stencil value from the current draw rather
       * than the previous one.
       */
      wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
                                 wm_prog_data->uses_kill;

      pipeline->force_fragment_thread_dispatch =
         wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
         wm_prog_data->has_side_effects ||
         wm.PixelShaderKillsPixel;

      if (pipeline->force_fragment_thread_dispatch ||
          has_color_buffer_write_enabled(pipeline, blend)) {
         /* Only set this value in non dynamic mode. */
         wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
      }

      if (multisample && multisample->rasterizationSamples > 1) {
         if (wm_prog_data->persample_dispatch) {
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         } else {
            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
         }
      } else {
         /* Single-sampled: per-sample and per-pixel are equivalent. */
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }

      VkPolygonMode raster_mode =
         genX(raster_polygon_mode)(pipeline, ia->topology);

      /* Left at 0 when topology is dynamic; filled in at draw time. */
      wm.MultisampleRasterizationMode =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
         genX(ms_rasterization_mode)(pipeline, raster_mode);
#endif

      wm.LineStippleEnable = line && line->stippledLineEnable;
   }

   /* States that, when dynamic, require re-packing this packet at draw
    * time instead of emitting it into the pipeline batch now.
    */
   uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;

#if GFX_VER < 8
   dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
#endif

   if (dynamic_states & dynamic_wm_states) {
      const struct intel_device_info *devinfo = &pipeline->base.device->info;
      uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
      GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
   } else {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
         _wm = wm;
   }
}
/* Emit 3DSTATE_PS for the pipeline's fragment shader.
 *
 * With no fragment stage, a mostly-zeroed packet is emitted (gfx7 still
 * needs MaximumNumberofThreads programmed to avoid a hang).  Otherwise
 * this programs the SIMD8/16/32 kernel pointers and dispatch state from
 * brw_wm_prog_data.
 */
static void
emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample)
{
   UNUSED const struct intel_device_info *devinfo =
      &pipeline->base.device->info;
   const struct anv_shader_bin *fs_bin =
      pipeline->shaders[MESA_SHADER_FRAGMENT];

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
#if GFX_VER == 7
         /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
          * we don't at least set the maximum number of threads.
          */
         ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
#endif
      }
      return;
   }

   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

#if GFX_VER < 8
   /* The hardware wedges if you have this bit set but don't turn on any dual
    * source blend factors.
    */
   bool dual_src_blend = false;
   if (wm_prog_data->dual_src_blend && blend) {
      for (uint32_t i = 0; i < blend->attachmentCount; i++) {
         const VkPipelineColorBlendAttachmentState *bstate =
            &blend->pAttachments[i];

         if (bstate->blendEnable &&
             (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
              is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
              is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
              is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
            dual_src_blend = true;
            break;
         }
      }
   }
#endif

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
      ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;

      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
       *
       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
       *
       * Since 16x MSAA is first introduced on SKL, we don't need to apply
       * the workaround on any older hardware.
       */
      if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
          multisample && multisample->rasterizationSamples == 16) {
         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
         ps._32PixelDispatchEnable = false;
      }

      /* Kernel pointers are assigned per enabled dispatch width; the
       * brw_wm_prog_data_prog_offset macro reads the ps struct's enable
       * bits, so these must come after the dispatch enables are final.
       */
      ps.KernelStartPointer0 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
      ps.KernelStartPointer1 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
      ps.KernelStartPointer2 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);

      ps.SingleProgramFlow = false;
      ps.VectorMaskEnable = GFX_VER >= 8;
      /* Wa_1606682166 */
      ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
      ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
      ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
                              wm_prog_data->base.ubo_ranges[0].length;
      ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
                                  POSOFFSET_SAMPLE : POSOFFSET_NONE;
#if GFX_VER < 8
      ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
      ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      ps.DualSourceBlendEnable = dual_src_blend;
#endif

#if GFX_VERx10 == 75
      /* Haswell requires the sample mask to be set in this packet as well
       * as in 3DSTATE_SAMPLE_MASK; the values should match.
       */
      ps.SampleMask = 0xff;
#endif

#if GFX_VER >= 9
      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
#elif GFX_VER >= 8
      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
#endif

      ps.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
      ps.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
      ps.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

#if GFX_VERx10 >= 125
      ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);
#else
      ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
      ps.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
#endif
   }
}
0);2287ps.DispatchGRFStartRegisterForConstantSetupData1 =2288brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);2289ps.DispatchGRFStartRegisterForConstantSetupData2 =2290brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);22912292#if GFX_VERx10 >= 1252293ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);2294#else2295ps.PerThreadScratchSpace = get_scratch_space(fs_bin);2296ps.ScratchSpaceBasePointer =2297get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);2298#endif2299}2300}23012302#if GFX_VER >= 82303static void2304emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,2305struct anv_subpass *subpass,2306const VkPipelineRasterizationStateCreateInfo *rs_info)2307{2308const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);23092310if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2311anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);2312return;2313}23142315anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {2316ps.PixelShaderValid = true;2317ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;2318ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;2319ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;2320ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;2321ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;2322ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;23232324/* If the subpass has a depth or stencil self-dependency, then we need2325* to force the hardware to do the depth/stencil write *after* fragment2326* shader execution. 
Otherwise, the writes may hit memory before we get2327* around to fetching from the input attachment and we may get the depth2328* or stencil value from the current draw rather than the previous one.2329*/2330ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||2331wm_prog_data->uses_kill;23322333#if GFX_VER >= 92334ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;2335ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;23362337ps.InputCoverageMaskState = ICMS_NONE;2338assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */2339if (!wm_prog_data->uses_sample_mask)2340ps.InputCoverageMaskState = ICMS_NONE;2341else if (wm_prog_data->per_coarse_pixel_dispatch)2342ps.InputCoverageMaskState = ICMS_NORMAL;2343else if (wm_prog_data->post_depth_coverage)2344ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;2345else2346ps.InputCoverageMaskState = ICMS_NORMAL;2347#else2348ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;2349#endif23502351#if GFX_VER >= 112352ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =2353wm_prog_data->uses_depth_w_coefficients;2354ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;2355#endif2356}2357}23582359static void2360emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)2361{2362anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {2363vft.PrimitiveTopologyType = pipeline->topology;2364}2365}2366#endif23672368static void2369emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)2370{2371anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {2372vfs.StatisticsEnable = true;2373}2374}23752376static void2377compute_kill_pixel(struct anv_graphics_pipeline *pipeline,2378const VkPipelineMultisampleStateCreateInfo *ms_info,2379const struct anv_subpass *subpass)2380{2381if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2382pipeline->kill_pixel = false;2383return;2384}23852386const struct brw_wm_prog_data 
#if GFX_VER == 12
/* Emit 3DSTATE_PRIMITIVE_REPLICATION for multiview via primitive
 * replication (gfx12 only).  When replication is not in use, a zeroed
 * packet disables it.  Each set bit in the subpass view mask becomes one
 * replica whose RTAIOffset selects the corresponding render-target array
 * layer.
 */
static void
emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
{
   if (!pipeline->use_primitive_replication) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
      return;
   }

   const uint32_t view_mask = pipeline->subpass->view_mask;
   const int num_views = util_bitcount(view_mask);
   assert(num_views > 1 && num_views <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
      pr.ReplicaMask = (1 << num_views) - 1;
      pr.ReplicationCount = num_views - 1;

      int slot = 0;
      u_foreach_bit(view_index, view_mask)
         pr.RTAIOffset[slot++] = view_index;
   }
}
#endif
/* Create a graphics pipeline: allocate and initialize the anv pipeline
 * object, then emit all static 3DSTATE_* packets into its batch.  Returns
 * the batch status so any batch-emission error surfaces to the caller.
 */
static VkResult
genX(graphics_pipeline_create)(
    VkDevice                                    _device,
    struct anv_pipeline_cache *                 cache,
    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipeline)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   struct anv_graphics_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);

   /* Use the default pipeline cache if none is specified */
   if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
      cache = &device->default_pipeline_cache;

   pipeline = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_graphics_pipeline_init(pipeline, device, cache,
                                       pCreateInfo, pAllocator);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, pAllocator, pipeline);
      /* VK_EXT_pipeline_creation_cache_control: a compile-required result
       * is not a hard failure, but the handle must be null.
       */
      if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
         *pPipeline = VK_NULL_HANDLE;
      return result;
   }

   /* Information on which states are considered dynamic. */
   const VkPipelineDynamicStateCreateInfo *dyn_info =
      pCreateInfo->pDynamicState;
   uint32_t dynamic_states = 0;
   if (dyn_info) {
      for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
         dynamic_states |=
            anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
   }

   /* If rasterization is not enabled, various CreateInfo structs must be
    * ignored.
    */
   const bool raster_enabled =
      !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
      (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);

   const VkPipelineViewportStateCreateInfo *vp_info =
      raster_enabled ? pCreateInfo->pViewportState : NULL;

   const VkPipelineMultisampleStateCreateInfo *ms_info =
      raster_enabled ? pCreateInfo->pMultisampleState : NULL;

   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      raster_enabled ? pCreateInfo->pDepthStencilState : NULL;

   const VkPipelineColorBlendStateCreateInfo *cb_info =
      raster_enabled ? pCreateInfo->pColorBlendState : NULL;

   const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
                           PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);

   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_setup(pipeline, &urb_deref_block_size);

   assert(pCreateInfo->pVertexInputState);
   emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
   assert(pCreateInfo->pRasterizationState);
   emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
                 pCreateInfo->pRasterizationState,
                 ms_info, line_info, dynamic_states, pass, subpass,
                 urb_deref_block_size);
   emit_ms_state(pipeline, ms_info, dynamic_states);
   emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
   emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
   compute_kill_pixel(pipeline, ms_info, subpass);

   emit_3dstate_clip(pipeline,
                     pCreateInfo->pInputAssemblyState,
                     vp_info,
                     pCreateInfo->pRasterizationState,
                     dynamic_states);
   emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
                          dynamic_states);

#if GFX_VER == 12
   emit_3dstate_primitive_replication(pipeline);
#endif

#if 0
   /* From gfx7_vs_state.c */

   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *    "Note: Because of corruption in IVB:GT2, software needs to flush the
    *    whole fixed function pipeline when the GS enable changes value in
    *    the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (!device->info.is_haswell && !device->info.is_baytrail)
      gfx7_emit_vs_workaround_flush(brw);
#endif

   /* Per-stage shader state, in pipeline order. */
   emit_3dstate_vs(pipeline);
   emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
   emit_3dstate_gs(pipeline);
   emit_3dstate_sbe(pipeline);
   emit_3dstate_wm(pipeline, subpass,
                   pCreateInfo->pInputAssemblyState,
                   pCreateInfo->pRasterizationState,
                   cb_info, ms_info, line_info, dynamic_states);
   emit_3dstate_ps(pipeline, cb_info, ms_info);
#if GFX_VER >= 8
   emit_3dstate_ps_extra(pipeline, subpass,
                         pCreateInfo->pRasterizationState);

   /* When topology is dynamic, 3DSTATE_VF_TOPOLOGY is emitted at draw
    * time instead.
    */
   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
      emit_3dstate_vf_topology(pipeline);
#endif
   emit_3dstate_vf_statistics(pipeline);

   *pPipeline = anv_pipeline_to_handle(&pipeline->base);

   return pipeline->base.batch.status;
}
#if GFX_VERx10 >= 125

/* Emit compute pipeline state for Xe-HP+: just CFE_STATE.  The per-kernel
 * dispatch parameters are programmed at dispatch time on these platforms.
 */
static void
emit_compute_state(struct anv_compute_pipeline *pipeline,
                   const struct anv_device *device)
{
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);

   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
   const struct intel_device_info *devinfo = &device->info;

   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
      /* 0-based max-thread index across all subslices. */
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * subslices - 1;
      cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
   }
}

#else /* #if GFX_VERx10 >= 125 */

/* Emit compute pipeline state for pre-Xe-HP: MEDIA_VFE_STATE into the
 * pipeline batch plus a packed INTERFACE_DESCRIPTOR_DATA stored on the
 * pipeline for later MEDIA_INTERFACE_DESCRIPTOR_LOAD.
 */
static void
emit_compute_state(struct anv_compute_pipeline *pipeline,
                   const struct anv_device *device)
{
   const struct intel_device_info *devinfo = &device->info;
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);

   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
   /* CURBE allocation covers per-thread plus cross-thread push constants,
    * aligned to an even number of registers.
    */
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
            cs_prog_data->push.cross_thread.regs, 2);

   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);

   const struct anv_shader_bin *cs_bin = pipeline->cs;

   anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
#if GFX_VER > 7
      vfe.StackSize              = 0;
#else
      vfe.GPGPUMode              = true;
#endif
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * subslices - 1;
      vfe.NumberofURBEntries     = GFX_VER <= 7 ? 0 : 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer      = true;
#endif
#if GFX_VER <= 8
      vfe.BypassGatewayControl   = true;
#endif
      vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
      vfe.CURBEAllocationSize    = vfe_curbe_allocation;

      if (cs_bin->prog_data->total_scratch) {
         /* The per-thread scratch size encoding differs per generation. */
         if (GFX_VER >= 8) {
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 11;
         } else if (GFX_VERx10 == 75) {
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 12;
         } else {
            /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace =
               cs_bin->prog_data->total_scratch / 1024 - 1;
         }
         vfe.ScratchSpaceBasePointer =
            get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
      }
   }

   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer =
         cs_bin->kernel.offset +
         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),

      /* Wa_1606682166 */
      .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
      /* We add 1 because the CS indirect parameters buffer isn't accounted
       * for in bind_map.surface_count.
       */
      .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize =
         encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),

#if GFX_VERx10 != 75
      .ConstantURBEntryReadOffset = 0,
#endif
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
#if GFX_VERx10 >= 75
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
#if GFX_VER >= 12
      /* TODO: Check if we are missing workarounds and enable mid-thread
       * preemption.
       *
       * We still have issues with mid-thread preemption (it was already
       * disabled by the kernel on gfx11, due to missing workarounds). It's
       * possible that we are just missing some workarounds, and could enable
       * it later, but for now let's disable it to fix a GPU in compute in Car
       * Chase (and possibly more).
       */
      .ThreadPreemptionDisable = true,
#endif

      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
   };
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
                                        pipeline->interface_descriptor_data,
                                        &desc);
}

#endif /* #if GFX_VERx10 >= 125 */
0 : get_sampler_count(cs_bin),2663/* We add 1 because the CS indirect parameters buffer isn't accounted2664* for in bind_map.surface_count.2665*/2666.BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),2667.BarrierEnable = cs_prog_data->uses_barrier,2668.SharedLocalMemorySize =2669encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),26702671#if GFX_VERx10 != 752672.ConstantURBEntryReadOffset = 0,2673#endif2674.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,2675#if GFX_VERx10 >= 752676.CrossThreadConstantDataReadLength =2677cs_prog_data->push.cross_thread.regs,2678#endif2679#if GFX_VER >= 122680/* TODO: Check if we are missing workarounds and enable mid-thread2681* preemption.2682*2683* We still have issues with mid-thread preemption (it was already2684* disabled by the kernel on gfx11, due to missing workarounds). It's2685* possible that we are just missing some workarounds, and could enable2686* it later, but for now let's disable it to fix a GPU in compute in Car2687* Chase (and possibly more).2688*/2689.ThreadPreemptionDisable = true,2690#endif26912692.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,2693};2694GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,2695pipeline->interface_descriptor_data,2696&desc);2697}26982699#endif /* #if GFX_VERx10 >= 125 */27002701static VkResult2702compute_pipeline_create(2703VkDevice _device,2704struct anv_pipeline_cache * cache,2705const VkComputePipelineCreateInfo* pCreateInfo,2706const VkAllocationCallbacks* pAllocator,2707VkPipeline* pPipeline)2708{2709ANV_FROM_HANDLE(anv_device, device, _device);2710struct anv_compute_pipeline *pipeline;2711VkResult result;27122713assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);27142715/* Use the default pipeline cache if none is specified */2716if (cache == NULL && device->physical->instance->pipeline_cache_enabled)2717cache = &device->default_pipeline_cache;27182719pipeline = vk_alloc2(&device->vk.alloc, pAllocator, 
sizeof(*pipeline), 8,2720VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);2721if (pipeline == NULL)2722return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);27232724result = anv_pipeline_init(&pipeline->base, device,2725ANV_PIPELINE_COMPUTE, pCreateInfo->flags,2726pAllocator);2727if (result != VK_SUCCESS) {2728vk_free2(&device->vk.alloc, pAllocator, pipeline);2729return result;2730}27312732anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,2733pipeline->batch_data, sizeof(pipeline->batch_data));27342735pipeline->cs = NULL;27362737assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);2738VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);2739result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,2740pCreateInfo->stage.pName,2741pCreateInfo->stage.pSpecializationInfo);2742if (result != VK_SUCCESS) {2743anv_pipeline_finish(&pipeline->base, device, pAllocator);2744vk_free2(&device->vk.alloc, pAllocator, pipeline);2745if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)2746*pPipeline = VK_NULL_HANDLE;2747return result;2748}27492750emit_compute_state(pipeline, device);27512752*pPipeline = anv_pipeline_to_handle(&pipeline->base);27532754return pipeline->base.batch.status;2755}27562757VkResult genX(CreateGraphicsPipelines)(2758VkDevice _device,2759VkPipelineCache pipelineCache,2760uint32_t count,2761const VkGraphicsPipelineCreateInfo* pCreateInfos,2762const VkAllocationCallbacks* pAllocator,2763VkPipeline* pPipelines)2764{2765ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);27662767VkResult result = VK_SUCCESS;27682769unsigned i;2770for (i = 0; i < count; i++) {2771VkResult res = genX(graphics_pipeline_create)(_device,2772pipeline_cache,2773&pCreateInfos[i],2774pAllocator, &pPipelines[i]);27752776if (res == VK_SUCCESS)2777continue;27782779/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it2780* is not obvious what error should be report upon 2 different failures.2781* */2782result = res;2783if (res != 
VK_PIPELINE_COMPILE_REQUIRED_EXT)2784break;27852786if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)2787break;2788}27892790for (; i < count; i++)2791pPipelines[i] = VK_NULL_HANDLE;27922793return result;2794}27952796VkResult genX(CreateComputePipelines)(2797VkDevice _device,2798VkPipelineCache pipelineCache,2799uint32_t count,2800const VkComputePipelineCreateInfo* pCreateInfos,2801const VkAllocationCallbacks* pAllocator,2802VkPipeline* pPipelines)2803{2804ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);28052806VkResult result = VK_SUCCESS;28072808unsigned i;2809for (i = 0; i < count; i++) {2810VkResult res = compute_pipeline_create(_device, pipeline_cache,2811&pCreateInfos[i],2812pAllocator, &pPipelines[i]);28132814if (res == VK_SUCCESS)2815continue;28162817/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it2818* is not obvious what error should be report upon 2 different failures.2819* */2820result = res;2821if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)2822break;28232824if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)2825break;2826}28272828for (; i < count; i++)2829pPipelines[i] = VK_NULL_HANDLE;28302831return result;2832}28332834#if GFX_VERx10 >= 12528352836static void2837assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,2838uint32_t stage_idx,2839VkShaderStageFlags valid_stages)2840{2841if (stage_idx == VK_SHADER_UNUSED_KHR)2842return;28432844assert(stage_idx <= pCreateInfo->stageCount);2845assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);2846assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);2847}28482849static VkResult2850ray_tracing_pipeline_create(2851VkDevice _device,2852struct anv_pipeline_cache * cache,2853const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,2854const VkAllocationCallbacks* pAllocator,2855VkPipeline* pPipeline)2856{2857ANV_FROM_HANDLE(anv_device, device, _device);2858VkResult 
result;28592860assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);28612862/* Use the default pipeline cache if none is specified */2863if (cache == NULL && device->physical->instance->pipeline_cache_enabled)2864cache = &device->default_pipeline_cache;28652866VK_MULTIALLOC(ma);2867VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);2868VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);2869if (!vk_multialloc_alloc2(&ma, &device->vk.alloc, pAllocator,2870VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))2871return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);28722873result = anv_pipeline_init(&pipeline->base, device,2874ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,2875pAllocator);2876if (result != VK_SUCCESS) {2877vk_free2(&device->vk.alloc, pAllocator, pipeline);2878return result;2879}28802881pipeline->group_count = pCreateInfo->groupCount;2882pipeline->groups = groups;28832884ASSERTED const VkShaderStageFlags ray_tracing_stages =2885VK_SHADER_STAGE_RAYGEN_BIT_KHR |2886VK_SHADER_STAGE_ANY_HIT_BIT_KHR |2887VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |2888VK_SHADER_STAGE_MISS_BIT_KHR |2889VK_SHADER_STAGE_INTERSECTION_BIT_KHR |2890VK_SHADER_STAGE_CALLABLE_BIT_KHR;28912892for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)2893assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);28942895for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {2896const VkRayTracingShaderGroupCreateInfoKHR *ginfo =2897&pCreateInfo->pGroups[i];2898assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,2899VK_SHADER_STAGE_RAYGEN_BIT_KHR |2900VK_SHADER_STAGE_MISS_BIT_KHR |2901VK_SHADER_STAGE_CALLABLE_BIT_KHR);2902assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,2903VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);2904assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,2905VK_SHADER_STAGE_ANY_HIT_BIT_KHR);2906assert_rt_stage_index_valid(pCreateInfo, 
ginfo->intersectionShader,2907VK_SHADER_STAGE_INTERSECTION_BIT_KHR);2908switch (ginfo->type) {2909case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:2910assert(ginfo->generalShader < pCreateInfo->stageCount);2911assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);2912assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);2913assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);2914break;29152916case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:2917assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);2918assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);2919break;29202921case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:2922assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);2923break;29242925default:2926unreachable("Invalid ray-tracing shader group type");2927}2928}29292930result = anv_ray_tracing_pipeline_init(pipeline, device, cache,2931pCreateInfo, pAllocator);2932if (result != VK_SUCCESS) {2933anv_pipeline_finish(&pipeline->base, device, pAllocator);2934vk_free2(&device->vk.alloc, pAllocator, pipeline);2935return result;2936}29372938for (uint32_t i = 0; i < pipeline->group_count; i++) {2939struct anv_rt_shader_group *group = &pipeline->groups[i];29402941switch (group->type) {2942case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {2943struct GFX_RT_GENERAL_SBT_HANDLE sh = {};2944sh.General = anv_shader_bin_get_bsr(group->general, 32);2945GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);2946break;2947}29482949case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {2950struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};2951if (group->closest_hit)2952sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);2953if (group->any_hit)2954sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);2955GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);2956break;2957}29582959case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {2960struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};2961if 
(group->closest_hit)2962sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);2963sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);2964GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);2965break;2966}29672968default:2969unreachable("Invalid shader group type");2970}2971}29722973*pPipeline = anv_pipeline_to_handle(&pipeline->base);29742975return pipeline->base.batch.status;2976}29772978VkResult2979genX(CreateRayTracingPipelinesKHR)(2980VkDevice _device,2981VkDeferredOperationKHR deferredOperation,2982VkPipelineCache pipelineCache,2983uint32_t createInfoCount,2984const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,2985const VkAllocationCallbacks* pAllocator,2986VkPipeline* pPipelines)2987{2988ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);29892990VkResult result = VK_SUCCESS;29912992unsigned i;2993for (i = 0; i < createInfoCount; i++) {2994VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,2995&pCreateInfos[i],2996pAllocator, &pPipelines[i]);29972998if (res == VK_SUCCESS)2999continue;30003001/* Bail out on the first error as it is not obvious what error should be3002* report upon 2 different failures. */3003result = res;3004if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)3005break;30063007if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)3008break;3009}30103011for (; i < createInfoCount; i++)3012pPipelines[i] = VK_NULL_HANDLE;30133014return result;3015}3016#endif /* GFX_VERx10 >= 125 */301730183019