/* Source: src/intel/vulkan/genX_pipeline.c (Mesa, 21.2-virgl branch) */
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"

/* Pick the VFCOMP_* control for component `comp` (0=R .. 3=A) of `format`:
 * store the source when the format actually has that channel, otherwise pad
 * with 0/1 (or NOSTORE for 64-bit passthru formats) per hardware rules.
 */
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From Broadwell spec, command reference structures, page 586:
    *
    *    "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *     64-bit components are stored in the URB without any conversion. In
    *     this case, vertex elements must be written as 128 or 256 bits, with
    *     VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *     R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *     Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *     set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element,
    *     or Components 1-3 must be specified as VFCOMP_STORE_0 in order to
    *     output a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU
    *     requires Component 3 to be specified as VFCOMP_STORE_0 in order to
    *     output a 256-bit vertex element."
    */
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
       * (see comment above) */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

/* Emit 3DSTATE_VERTEX_ELEMENTS (plus VF_INSTANCING/VF_SGVS on Gfx8+) for the
 * pipeline's vertex-input state, including the internal SVGS (base
 * vertex/instance) and draw-id elements when the VS uses them.
 */
static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineVertexInputStateCreateInfo *info)
{
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   /* Pull inputs_read out of the VS prog data */
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read &
      inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
                                vs_prog_data->uses_instanceid ||
                                vs_prog_data->uses_firstvertex ||
                                vs_prog_data->uses_baseinstance;

   /* A double (64-bit) attribute sets two bits in `elements` but occupies a
    * single vertex-element slot, hence the division by 2.
    */
   uint32_t elem_count = __builtin_popcount(elements) -
      __builtin_popcount(elements_double) / 2;

   const uint32_t total_elems =
      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);

   uint32_t *p;

   const uint32_t num_dwords = 1 + total_elems * 2;
   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
                       GENX(3DSTATE_VERTEX_ELEMENTS));
   if (!p)
      return;

   for (uint32_t i = 0; i < total_elems; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zero.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
   }

   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *desc =
         &info->pVertexAttributeDescriptions[i];
      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
                                                  desc->format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);

      assert(desc->binding < MAX_VBS);

      if ((elements & (1 << desc->location)) == 0)
         continue; /* Binding unused */

      /* Slot index: count used elements below this location, with doubles
       * collapsing two element bits into one slot.
       */
      uint32_t slot =
         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << desc->location) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = desc->binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = desc->offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);

#if GFX_VER >= 8
      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
         vfi.VertexElementIndex = slot;
         vfi.InstanceDataStepRate =
            pipeline->vb[desc->binding].instance_divisor;
      }
#endif
   }

   const uint32_t id_slot = elem_count;
   if (needs_svgs_elem) {
      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
       *    Control field is set to something other than VFCOMP_STORE_SRC,
       *    no higher-numbered Component Control fields may be set to
       *    VFCOMP_STORE_SRC"
       *
       * This means, that if we have BaseInstance, we need BaseVertex as
       * well.  Just do all or nothing.
       */
      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                            vs_prog_data->uses_baseinstance) ?
                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = base_ctrl,
         .Component1Control = base_ctrl,
#if GFX_VER >= 8
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#else
         /* Pre-Gfx8 has no VF_SGVS packet; vertex/instance IDs are stored
          * directly in components 2/3 of this element.
          */
         .Component2Control = VFCOMP_STORE_VID,
         .Component3Control = VFCOMP_STORE_IID,
#endif
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
#endif
   }

#if GFX_VER >= 8
   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber = 2;
      sgvs.VertexIDElementOffset = id_slot;
      sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber = 3;
      sgvs.InstanceIDElementOffset = id_slot;
   }
#endif

   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &p[1 + drawid_slot * 2],
                                      &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
#endif
   }
}

/* Program the URB partitioning (3DSTATE_URB_{VS,HS,DS,GS}) for the given L3
 * configuration and active stages; also reports the deref block size used by
 * 3DSTATE_SF on Gfx12+.
 */
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const unsigned entry_size[4],
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = &device->info;

   unsigned entries[4];
   unsigned start[4];
   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        entry_size, entries, start, deref_block_size,
                        &constrained);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *     needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *     3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *     3DSTATE_SAMPLER_STATE_POINTER_VS command.
 Only one PIPE_CONTROL
    *     needs to be sent before any combination of VS associated 3DSTATE."
    */
   anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
      pc.DepthStallEnable = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = device->workaround_address;
   }
#endif

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         /* 3DSTATE_URB_VS/HS/DS/GS have consecutive sub-opcodes */
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = start[i];
         urb.VSURBEntryAllocationSize = entry_size[i] - 1;
         urb.VSNumberofURBEntries = entries[i];
      }
   }
}

/* Gather per-stage URB entry sizes from the compiled shaders (1 for unused
 * stages) and forward to genX(emit_urb_setup).
 */
static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
   unsigned entry_size[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;

      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
                        pipeline->base.l3_config,
                        pipeline->active_stages, entry_size,
                        deref_block_size);
}

/* Emit 3DSTATE_SBE (and 3DSTATE_SBE_SWIZ on Gfx8+), mapping the last VUE
 * stage's outputs to the fragment shader's inputs.
 */
static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
#endif
      return;
   }

   const struct brw_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   struct GENX(3DSTATE_SBE) sbe = {
      GENX(3DSTATE_SBE_header),
      .AttributeSwizzleEnable = true,
      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

#if GFX_VER >= 9
   for (unsigned i = 0; i < 32; i++)
      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif

#if GFX_VER >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
      GENX(3DSTATE_SBE_SWIZ_header),
   };
#else
#  define swiz sbe
#endif

   int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                        fs_input_map);
   assert(first_slot % 2 == 0);
   unsigned urb_entry_read_offset = first_slot / 2;
   int max_source_attr = 0;
   for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
      uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
      int input_index = wm_prog_data->urb_setup[attr];

      assert(0 <= input_index);

      /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
       * VUE header
       */
      if (attr == VARYING_SLOT_VIEWPORT ||
          attr == VARYING_SLOT_LAYER ||
          attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
         continue;
      }

      if (attr == VARYING_SLOT_PNTC) {
         sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
         continue;
      }

      const int slot = fs_input_map->varying_to_slot[attr];

      if (slot == -1) {
         /* This attribute does not exist in the VUE--that means that the
          * vertex shader did not write to it.  It could be that it's a
          * regular varying read by the fragment shader but not written by
          * the vertex shader or it's gl_PrimitiveID.  In the first case the
          * value is undefined, in the second it needs to be
          * gl_PrimitiveID.
          */
         swiz.Attribute[input_index].ConstantSource = PRIM_ID;
         swiz.Attribute[input_index].ComponentOverrideX = true;
         swiz.Attribute[input_index].ComponentOverrideY = true;
         swiz.Attribute[input_index].ComponentOverrideZ = true;
         swiz.Attribute[input_index].ComponentOverrideW = true;
         continue;
      }

      /* We have to subtract two slots to account for the URB entry output
       * read offset in the VS and GS stages.
       */
      const int source_attr = slot - 2 * urb_entry_read_offset;
      assert(source_attr >= 0 && source_attr < 32);
      max_source_attr = MAX2(max_source_attr, source_attr);
      /* The hardware can only do overrides on 16 overrides at a time, and the
       * other up to 16 have to be lined up so that the input index = the
       * output index.  We'll need to do some tweaking to make sure that's the
       * case.
       */
      if (input_index < 16)
         swiz.Attribute[input_index].SourceAttribute = source_attr;
      else
         assert(source_attr == input_index);
   }

   sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
   sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
#if GFX_VER >= 8
   sbe.ForceVertexURBEntryReadOffset = true;
   sbe.ForceVertexURBEntryReadLength = true;
#endif

   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
                                        GENX(3DSTATE_SBE_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);

#if GFX_VER >= 8
   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
#endif
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
VkPolygonMode
genX(raster_polygon_mode)(struct anv_graphics_pipeline
                          *pipeline,
                          VkPrimitiveTopology primitive_topology)
{
   /* The last geometry-producing stage wins: GS output topology, then TES
    * output topology, then the input-assembly topology.
    */
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported TCS output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return pipeline->polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

/* Gfx7-only: choose the MSRASTMODE_* value for 3DSTATE_SF based on line mode
 * and sample count.
 */
uint32_t
genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
                            VkPolygonMode raster_mode)
{
#if GFX_VER <= 7
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      switch (pipeline->line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         return MSRASTMODE_ON_PATTERN;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         return MSRASTMODE_OFF_PIXEL;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      return pipeline->rasterization_samples > 1 ?
             MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
   }
#else
   unreachable("Only on gen7");
#endif
}

/* Pull the provoking-vertex mode out of the rasterization pNext chain,
 * defaulting to first-vertex per the spec.
 */
static VkProvokingVertexModeEXT
vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);

   return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
                               rs_pv_info->provokingVertexMode;
}

/* Vulkan-enum to hardware-enum translation tables. */
const uint32_t genX(vk_to_intel_cullmode)[] = {
   [VK_CULL_MODE_NONE]           = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]      = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]       = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH
};

const uint32_t genX(vk_to_intel_fillmode)[] = {
   [VK_POLYGON_MODE_FILL]  = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]  = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
};

const uint32_t genX(vk_to_intel_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
   [VK_FRONT_FACE_CLOCKWISE]         = 0
};

#if GFX_VER >= 9
/* Pull the conservative-rasterization mode out of the rasterization pNext
 * chain, defaulting to disabled.
 */
static VkConservativeRasterizationModeEXT
vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);

   return cr ? cr->conservativeRasterizationMode :
               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
}
#endif

void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeEXT line_mode,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
#if GFX_VER >= 8
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX100;
         *msaa_rasterization_enable = true;
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX100;
      *msaa_rasterization_enable = true;
   }
#else
   unreachable("Invalid call");
#endif
}

/* Emit the pipeline's static 3DSTATE_SF/3DSTATE_RASTER state, honoring the
 * dynamic-state bits in `dynamic_states` by leaving those fields zeroed.
 */
static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineInputAssemblyStateCreateInfo *ia_info,
              const VkPipelineRasterizationStateCreateInfo *rs_info,
              const VkPipelineMultisampleStateCreateInfo *ms_info,
              const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   struct GENX(3DSTATE_SF) sf =
   {
      GENX(3DSTATE_SF_header),
   };

   sf.ViewportTransformEnable = true;
   sf.StatisticsEnable = true;
   sf.VertexSubPixelPrecisionSelect = _8Bit;
   sf.AALineDistanceMode = true;

   switch (vk_provoking_vertex_mode(rs_info)) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 0;
      sf.LineStripListProvokingVertexSelect = 0;
      sf.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 2;
      sf.LineStripListProvokingVertexSelect = 1;
      sf.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

#if GFX_VERx10 == 75
   sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
#endif

#if GFX_VER >= 12
   sf.DerefBlockSize = urb_deref_block_size;
#endif

   const struct brw_vue_prog_data *last_vue_prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);

   if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
      sf.PointWidthSource = Vertex;
   } else {
      sf.PointWidthSource = State;
      sf.PointWidth = 1.0;
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_RASTER) raster = {
      GENX(3DSTATE_RASTER_header),
   };
#else
   /* Pre-Gfx8, the RASTER fields live in 3DSTATE_SF */
#  define raster sf
#endif

   VkPolygonMode raster_mode =
      genX(raster_polygon_mode)(pipeline, ia_info->topology);
   bool dynamic_primitive_topology =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;

   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
    * "Multisample Modes State".
    */
#if GFX_VER >= 8
   if (!dynamic_primitive_topology)
      genX(rasterization_mode)(raster_mode, pipeline->line_mode,
                               &raster.APIMode,
                               &raster.DXMultisampleRasterizationEnable);

   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
    * computations.  If we ever set this bit to a different value, they will
    * need to be updated accordingly.
    */
   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
   raster.ForceMultisampling = false;
#else
   uint32_t ms_rast_mode = 0;

   if (!dynamic_primitive_topology)
      ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);

   raster.MultisampleRasterizationMode = ms_rast_mode;
#endif

   raster.AntialiasingEnable =
      dynamic_primitive_topology ? 0 :
      anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);

   raster.FrontWinding =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
      0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
   raster.CullMode =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
      0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];

   raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.ScissorRectangleEnable = true;

#if GFX_VER >= 9
   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
#elif GFX_VER >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#endif

#if GFX_VER >= 9
   raster.ConservativeRasterizationEnable =
      vk_conservative_rasterization_mode(rs_info) !=
      VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
#endif

   bool depth_bias_enable =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
      0 : rs_info->depthBiasEnable;

   raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
   raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
   raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;

#if GFX_VER == 7
   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
    */
   if (subpass->depth_stencil_attachment) {
      VkFormat vk_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      assert(vk_format_is_depth_or_stencil(vk_format));
      if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
         enum isl_format isl_format =
            anv_get_isl_format(&pipeline->base.device->info, vk_format,
                               VK_IMAGE_ASPECT_DEPTH_BIT,
                               VK_IMAGE_TILING_OPTIMAL);
         sf.DepthBufferSurfaceFormat =
            isl_format_get_depth_format(isl_format, false);
      }
   }
#endif

#if GFX_VER >= 8
   GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
#else
#  undef raster
   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
#endif
}

/* Emit multisample state: sample pattern/locations, 3DSTATE_SAMPLE_MASK and,
 * on Gfx11+, the fragment shading rate (CPS) state.
 */
static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineMultisampleStateCreateInfo *info,
              uint32_t dynamic_states)
{
   /* Only lookup locations if the extension is active, otherwise the default
    * ones will be used either at device initialization time or through
    * 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
    */
   if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
      /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
       * will be emitted dynamically, so skip it here.  On Gfx8+
       * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
       */
      if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
#if GFX_VER >= 8
         genX(emit_sample_pattern)(&pipeline->base.batch,
                                   pipeline->dynamic_state.sample_locations.samples,
                                   pipeline->dynamic_state.sample_locations.locations);
#endif
      }

      genX(emit_multisample)(&pipeline->base.batch,
                             pipeline->dynamic_state.sample_locations.samples,
                             pipeline->dynamic_state.sample_locations.locations);
   } else {
      /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to
       * modify for sample locations, so we don't have to emit it dynamically.
       */
#if GFX_VER >= 8
      genX(emit_multisample)(&pipeline->base.batch,
                             info ?
                                info->rasterizationSamples : 1,
                             NULL);
#endif
   }

   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
    */
#if GFX_VER >= 8
   uint32_t sample_mask = 0xffff;
#else
   uint32_t sample_mask = 0xff;
#endif

   if (info && info->pSampleMask)
      sample_mask &= info->pSampleMask[0];

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = sample_mask;
   }

   pipeline->cps_state = ANV_STATE_NULL;
#if GFX_VER >= 11
   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
       pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
#if GFX_VER >= 12
      /* Gfx12+ programs shading rate through per-viewport CPS_STATE
       * structures allocated from the dynamic state pool.
       */
      struct anv_device *device = pipeline->base.device;
      const uint32_t num_dwords =
         GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
      pipeline->cps_state =
         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
#endif

      genX(emit_shading_rate)(&pipeline->base.batch,
                              pipeline,
                              pipeline->cps_state,
                              &pipeline->dynamic_state);
   }
#endif
}

/* Vulkan-enum to hardware-enum translation tables. */
const uint32_t genX(vk_to_intel_logic_op)[] = {
   [VK_LOGIC_OP_COPY]          = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]         = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]           = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]   = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]  = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]         = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]           = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]            = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]           = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]    = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]        = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]    = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]   = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]          = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]           = LOGICOP_SET,
};

static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO]                     = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                      = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]                = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]      = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]                = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]      = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]                = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]      = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]                = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]      = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]           = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR] = BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]           = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA] = BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]       = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]               = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]     = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]               = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]     = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD]              = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]         = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]              = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]              = BLENDFUNCTION_MAX,
};

const uint32_t genX(vk_to_intel_compare_op)[] = {
   [VK_COMPARE_OP_NEVER]            = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS]             = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL]            = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]    = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]          = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL]        = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS]           = PREFILTEROP_ALWAYS,
};

const uint32_t genX(vk_to_intel_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP]                = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]             = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]              = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]  = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]  = STENCILOP_DECR,
};

const uint32_t genX(vk_to_intel_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                   = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                    = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                   = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]               = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                 = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]     = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]    = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

/* This function sanitizes the VkStencilOpState by looking at the compare ops
 * and trying to determine whether or not a given stencil op can ever actually
 * occur.  Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
 * This function returns true if, after sanitation, any of the stencil ops are
 * set to something other than VK_STENCIL_OP_KEEP.
 */
static bool
sanitize_stencil_face(VkStencilOpState *face,
                      VkCompareOp depthCompareOp)
{
   /* If compareOp is ALWAYS then the stencil test will never fail and failOp
    * will never happen.
Set failOp to KEEP in this case.950*/951if (face->compareOp == VK_COMPARE_OP_ALWAYS)952face->failOp = VK_STENCIL_OP_KEEP;953954/* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth955* or stencil tests will fail and passOp will never happen.956*/957if (face->compareOp == VK_COMPARE_OP_NEVER ||958depthCompareOp == VK_COMPARE_OP_NEVER)959face->passOp = VK_STENCIL_OP_KEEP;960961/* If compareOp is NEVER or depthCompareOp is ALWAYS then either the962* stencil test will fail or the depth test will pass. In either case,963* depthFailOp will never happen.964*/965if (face->compareOp == VK_COMPARE_OP_NEVER ||966depthCompareOp == VK_COMPARE_OP_ALWAYS)967face->depthFailOp = VK_STENCIL_OP_KEEP;968969return face->failOp != VK_STENCIL_OP_KEEP ||970face->depthFailOp != VK_STENCIL_OP_KEEP ||971face->passOp != VK_STENCIL_OP_KEEP;972}973974/* Intel hardware is fairly sensitive to whether or not depth/stencil writes975* are enabled. In the presence of discards, it's fairly easy to get into the976* non-promoted case which means a fairly big performance hit. From the Iron977* Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":978*979* "Non-promoted depth (N) is active whenever the depth test can be done980* early but it cannot determine whether or not to write source depth to981* the depth buffer, therefore the depth write must be performed post pixel982* shader. This includes cases where the pixel shader can kill pixels,983* including via sampler chroma key, as well as cases where the alpha test984* function is enabled, which kills pixels based on a programmable alpha985* test. In this case, even if the depth test fails, the pixel cannot be986* killed if a stencil write is indicated. Whether or not the stencil write987* happens depends on whether or not the pixel is killed later. In these988* cases if stencil test fails and stencil writes are off, the pixels can989* also be killed early. 
If stencil writes are enabled, the pixels must be990* treated as Computed depth (described above)."991*992* The same thing as mentioned in the stencil case can happen in the depth993* case as well if it thinks it writes depth but, thanks to the depth test994* being GL_EQUAL, the write doesn't actually matter. A little extra work995* up-front to try and disable depth and stencil writes can make a big996* difference.997*998* Unfortunately, the way depth and stencil testing is specified, there are999* many case where, regardless of depth/stencil writes being enabled, nothing1000* actually gets written due to some other bit of state being set. This1001* function attempts to "sanitize" the depth stencil state and disable writes1002* and sometimes even testing whenever possible.1003*/1004static void1005sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,1006bool *stencilWriteEnable,1007VkImageAspectFlags ds_aspects)1008{1009*stencilWriteEnable = state->stencilTestEnable;10101011/* If the depth test is disabled, we won't be writing anything. Make sure we1012* treat the test as always passing later on as well.1013*1014* Also, the Vulkan spec requires that if either depth or stencil is not1015* present, the pipeline is to act as if the test silently passes. 
In that1016* case we won't write either.1017*/1018if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {1019state->depthWriteEnable = false;1020state->depthCompareOp = VK_COMPARE_OP_ALWAYS;1021}10221023if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {1024*stencilWriteEnable = false;1025state->front.compareOp = VK_COMPARE_OP_ALWAYS;1026state->back.compareOp = VK_COMPARE_OP_ALWAYS;1027}10281029/* If the stencil test is enabled and always fails, then we will never get1030* to the depth test so we can just disable the depth test entirely.1031*/1032if (state->stencilTestEnable &&1033state->front.compareOp == VK_COMPARE_OP_NEVER &&1034state->back.compareOp == VK_COMPARE_OP_NEVER) {1035state->depthTestEnable = false;1036state->depthWriteEnable = false;1037}10381039/* If depthCompareOp is EQUAL then the value we would be writing to the1040* depth buffer is the same as the value that's already there so there's no1041* point in writing it.1042*/1043if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)1044state->depthWriteEnable = false;10451046/* If the stencil ops are such that we don't actually ever modify the1047* stencil buffer, we should disable writes.1048*/1049if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&1050!sanitize_stencil_face(&state->back, state->depthCompareOp))1051*stencilWriteEnable = false;10521053/* If the depth test always passes and we never write out depth, that's the1054* same as if the depth test is disabled entirely.1055*/1056if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&1057!state->depthWriteEnable)1058state->depthTestEnable = false;10591060/* If the stencil test always passes and we never write out stencil, that's1061* the same as if the stencil test is disabled entirely.1062*/1063if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&1064state->back.compareOp == VK_COMPARE_OP_ALWAYS &&1065!*stencilWriteEnable)1066state->stencilTestEnable = false;1067}10681069static void1070emit_ds_state(struct 
anv_graphics_pipeline *pipeline,1071const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,1072const uint32_t dynamic_states,1073const struct anv_render_pass *pass,1074const struct anv_subpass *subpass)1075{1076#if GFX_VER == 71077# define depth_stencil_dw pipeline->gfx7.depth_stencil_state1078#elif GFX_VER == 81079# define depth_stencil_dw pipeline->gfx8.wm_depth_stencil1080#else1081# define depth_stencil_dw pipeline->gfx9.wm_depth_stencil1082#endif10831084if (pCreateInfo == NULL) {1085/* We're going to OR this together with the dynamic state. We need1086* to make sure it's initialized to something useful.1087*/1088pipeline->writes_stencil = false;1089pipeline->stencil_test_enable = false;1090pipeline->writes_depth = false;1091pipeline->depth_test_enable = false;1092pipeline->depth_bounds_test_enable = false;1093memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));1094return;1095}10961097VkImageAspectFlags ds_aspects = 0;1098if (subpass->depth_stencil_attachment) {1099VkFormat depth_stencil_format =1100pass->attachments[subpass->depth_stencil_attachment->attachment].format;1101ds_aspects = vk_format_aspects(depth_stencil_format);1102}11031104VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;1105sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);1106pipeline->stencil_test_enable = info.stencilTestEnable;1107pipeline->writes_depth = info.depthWriteEnable;1108pipeline->depth_test_enable = info.depthTestEnable;1109pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;11101111bool dynamic_stencil_op =1112dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;11131114#if GFX_VER <= 71115struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {1116#else1117struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {1118#endif1119.DepthTestEnable =1120dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?11210 : info.depthTestEnable,11221123.DepthBufferWriteEnable =1124dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?11250 : 
info.depthWriteEnable,11261127.DepthTestFunction =1128dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?11290 : genX(vk_to_intel_compare_op)[info.depthCompareOp],11301131.DoubleSidedStencilEnable = true,11321133.StencilTestEnable =1134dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?11350 : info.stencilTestEnable,11361137.StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],1138.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],1139.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],1140.StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],1141.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],1142.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],1143.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],1144.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],1145};11461147if (dynamic_stencil_op) {1148depth_stencil.StencilFailOp = 0;1149depth_stencil.StencilPassDepthPassOp = 0;1150depth_stencil.StencilPassDepthFailOp = 0;1151depth_stencil.StencilTestFunction = 0;1152depth_stencil.BackfaceStencilFailOp = 0;1153depth_stencil.BackfaceStencilPassDepthPassOp = 0;1154depth_stencil.BackfaceStencilPassDepthFailOp = 0;1155depth_stencil.BackfaceStencilTestFunction = 0;1156}11571158#if GFX_VER <= 71159GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);1160#else1161GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);1162#endif1163}11641165static bool1166is_dual_src_blend_factor(VkBlendFactor factor)1167{1168return factor == VK_BLEND_FACTOR_SRC1_COLOR ||1169factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||1170factor == VK_BLEND_FACTOR_SRC1_ALPHA ||1171factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;1172}11731174static inline uint32_t *1175write_disabled_blend(uint32_t *state)1176{1177struct GENX(BLEND_STATE_ENTRY) entry = 
{1178.WriteDisableAlpha = true,1179.WriteDisableRed = true,1180.WriteDisableGreen = true,1181.WriteDisableBlue = true,1182};1183GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);1184return state + GENX(BLEND_STATE_ENTRY_length);1185}11861187static void1188emit_cb_state(struct anv_graphics_pipeline *pipeline,1189const VkPipelineColorBlendStateCreateInfo *info,1190const VkPipelineMultisampleStateCreateInfo *ms_info,1191uint32_t dynamic_states)1192{1193struct anv_device *device = pipeline->base.device;1194const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);11951196struct GENX(BLEND_STATE) blend_state = {1197#if GFX_VER >= 81198.AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,1199.AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,1200#endif1201};12021203uint32_t surface_count = 0;1204struct anv_pipeline_bind_map *map;1205if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {1206map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;1207surface_count = map->surface_count;1208}12091210const uint32_t num_dwords = GENX(BLEND_STATE_length) +1211GENX(BLEND_STATE_ENTRY_length) * surface_count;1212uint32_t *blend_state_start, *state_pos;12131214if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1215ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {1216const struct intel_device_info *devinfo = &pipeline->base.device->info;1217blend_state_start = devinfo->ver >= 8 ?1218pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;1219pipeline->blend_state = ANV_STATE_NULL;1220} else {1221pipeline->blend_state =1222anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);1223blend_state_start = pipeline->blend_state.map;1224}1225state_pos = blend_state_start;12261227bool has_writeable_rt = false;1228state_pos += GENX(BLEND_STATE_length);1229#if GFX_VER >= 81230struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };1231#endif1232for (unsigned i = 0; i < surface_count; i++) {1233struct anv_pipeline_binding *binding = 
&map->surface_to_descriptor[i];12341235/* All color attachments are at the beginning of the binding table */1236if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)1237break;12381239/* We can have at most 8 attachments */1240assert(i < MAX_RTS);12411242if (info == NULL || binding->index >= info->attachmentCount) {1243state_pos = write_disabled_blend(state_pos);1244continue;1245}12461247if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {1248state_pos = write_disabled_blend(state_pos);1249continue;1250}12511252const VkPipelineColorBlendAttachmentState *a =1253&info->pAttachments[binding->index];12541255struct GENX(BLEND_STATE_ENTRY) entry = {1256#if GFX_VER < 81257.AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,1258.AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,1259#endif1260.LogicOpEnable = info->logicOpEnable,1261.LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?12620: genX(vk_to_intel_logic_op)[info->logicOp],12631264/* Vulkan specification 1.2.168, VkLogicOp:1265*1266* "Logical operations are controlled by the logicOpEnable and1267* logicOp members of VkPipelineColorBlendStateCreateInfo. 
If1268* logicOpEnable is VK_TRUE, then a logical operation selected by1269* logicOp is applied between each color attachment and the1270* fragment’s corresponding output value, and blending of all1271* attachments is treated as if it were disabled."1272*1273* From the Broadwell PRM Volume 2d: Command Reference: Structures:1274* BLEND_STATE_ENTRY:1275*1276* "Enabling LogicOp and Color Buffer Blending at the same time is1277* UNDEFINED"1278*/1279.ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,1280.ColorClampRange = COLORCLAMP_RTFORMAT,1281.PreBlendColorClampEnable = true,1282.PostBlendColorClampEnable = true,1283.SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],1284.DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],1285.ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],1286.SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],1287.DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],1288.AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],1289.WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),1290.WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),1291.WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),1292.WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),1293};12941295if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||1296a->dstColorBlendFactor != a->dstAlphaBlendFactor ||1297a->colorBlendOp != a->alphaBlendOp) {1298#if GFX_VER >= 81299blend_state.IndependentAlphaBlendEnable = true;1300#else1301entry.IndependentAlphaBlendEnable = true;1302#endif1303}13041305/* The Dual Source Blending documentation says:1306*1307* "If SRC1 is included in a src/dst blend factor and1308* a DualSource RT Write message is not used, results1309* are UNDEFINED. 
(This reflects the same restriction in DX APIs,1310* where undefined results are produced if “o1” is not written1311* by a PS – there are no default values defined)."1312*1313* There is no way to gracefully fix this undefined situation1314* so we just disable the blending to prevent possible issues.1315*/1316if (!wm_prog_data->dual_src_blend &&1317(is_dual_src_blend_factor(a->srcColorBlendFactor) ||1318is_dual_src_blend_factor(a->dstColorBlendFactor) ||1319is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||1320is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {1321vk_debug_report(&device->physical->instance->vk,1322VK_DEBUG_REPORT_WARNING_BIT_EXT,1323&device->vk.base, 0, 0, "anv",1324"Enabled dual-src blend factors without writing both targets "1325"in the shader. Disabling blending to avoid GPU hangs.");1326entry.ColorBufferBlendEnable = false;1327}13281329if (a->colorWriteMask != 0)1330has_writeable_rt = true;13311332/* Our hardware applies the blend factor prior to the blend function1333* regardless of what function is used. Technically, this means the1334* hardware can do MORE than GL or Vulkan specify. 
However, it also1335* means that, for MIN and MAX, we have to stomp the blend factor to1336* ONE to make it a no-op.1337*/1338if (a->colorBlendOp == VK_BLEND_OP_MIN ||1339a->colorBlendOp == VK_BLEND_OP_MAX) {1340entry.SourceBlendFactor = BLENDFACTOR_ONE;1341entry.DestinationBlendFactor = BLENDFACTOR_ONE;1342}1343if (a->alphaBlendOp == VK_BLEND_OP_MIN ||1344a->alphaBlendOp == VK_BLEND_OP_MAX) {1345entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;1346entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;1347}1348GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);1349state_pos += GENX(BLEND_STATE_ENTRY_length);1350#if GFX_VER >= 81351if (i == 0)1352bs0 = entry;1353#endif1354}13551356#if GFX_VER >= 81357struct GENX(3DSTATE_PS_BLEND) blend = {1358GENX(3DSTATE_PS_BLEND_header),1359};1360blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;1361blend.HasWriteableRT = has_writeable_rt;1362blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;1363blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;1364blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;1365blend.SourceBlendFactor = bs0.SourceBlendFactor;1366blend.DestinationBlendFactor = bs0.DestinationBlendFactor;1367blend.AlphaTestEnable = false;1368blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;13691370if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1371ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {1372GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);1373} else {1374anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)1375_blend = blend;1376}1377#else1378(void)has_writeable_rt;1379#endif13801381GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);13821383if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |1384ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {1385anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {1386bsp.BlendStatePointer = pipeline->blend_state.offset;1387#if GFX_VER >= 
81388bsp.BlendStatePointerValid = true;1389#endif1390}1391}1392}13931394static void1395emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,1396const VkPipelineInputAssemblyStateCreateInfo *ia_info,1397const VkPipelineViewportStateCreateInfo *vp_info,1398const VkPipelineRasterizationStateCreateInfo *rs_info,1399const uint32_t dynamic_states)1400{1401const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);1402(void) wm_prog_data;14031404struct GENX(3DSTATE_CLIP) clip = {1405GENX(3DSTATE_CLIP_header),1406};14071408clip.ClipEnable = true;1409clip.StatisticsEnable = true;1410clip.EarlyCullEnable = true;1411clip.APIMode = APIMODE_D3D;1412clip.GuardbandClipTestEnable = true;14131414/* Only enable the XY clip test when the final polygon rasterization1415* mode is VK_POLYGON_MODE_FILL. We want to leave it disabled for1416* points and lines so we get "pop-free" clipping.1417*/1418VkPolygonMode raster_mode =1419genX(raster_polygon_mode)(pipeline, ia_info->topology);1420clip.ViewportXYClipTestEnable =1421dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?14220 : (raster_mode == VK_POLYGON_MODE_FILL);14231424#if GFX_VER >= 81425clip.VertexSubPixelPrecisionSelect = _8Bit;1426#endif1427clip.ClipMode = CLIPMODE_NORMAL;14281429switch (vk_provoking_vertex_mode(rs_info)) {1430case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:1431clip.TriangleStripListProvokingVertexSelect = 0;1432clip.LineStripListProvokingVertexSelect = 0;1433clip.TriangleFanProvokingVertexSelect = 1;1434break;14351436case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:1437clip.TriangleStripListProvokingVertexSelect = 2;1438clip.LineStripListProvokingVertexSelect = 1;1439clip.TriangleFanProvokingVertexSelect = 2;1440break;14411442default:1443unreachable("Invalid provoking vertex mode");1444}14451446clip.MinimumPointWidth = 0.125;1447clip.MaximumPointWidth = 255.875;14481449const struct brw_vue_prog_data *last =1450anv_pipeline_get_last_vue_prog_data(pipeline);14511452/* From the Vulkan 1.0.45 
spec:1453*1454* "If the last active vertex processing stage shader entry point's1455* interface does not include a variable decorated with1456* ViewportIndex, then the first viewport is used."1457*/1458if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {1459clip.MaximumVPIndex = vp_info->viewportCount > 0 ?1460vp_info->viewportCount - 1 : 0;1461} else {1462clip.MaximumVPIndex = 0;1463}14641465/* From the Vulkan 1.0.45 spec:1466*1467* "If the last active vertex processing stage shader entry point's1468* interface does not include a variable decorated with Layer, then1469* the first layer is used."1470*/1471clip.ForceZeroRTAIndexEnable =1472!(last->vue_map.slots_valid & VARYING_BIT_LAYER);14731474#if GFX_VER == 71475clip.FrontWinding = genX(vk_to_intel_front_face)[rs_info->frontFace];1476clip.CullMode = genX(vk_to_intel_cullmode)[rs_info->cullMode];1477clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;1478clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;1479clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;1480#else1481clip.NonPerspectiveBarycentricEnable = wm_prog_data ?1482(wm_prog_data->barycentric_interp_modes &1483BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;1484#endif14851486GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);1487}14881489static void1490emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,1491const VkPipelineRasterizationStateCreateInfo *rs_info,1492const uint32_t dynamic_states)1493{1494const struct brw_vue_prog_data *prog_data =1495anv_pipeline_get_last_vue_prog_data(pipeline);1496const struct brw_vue_map *vue_map = &prog_data->vue_map;14971498nir_xfb_info *xfb_info;1499if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))1500xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;1501else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))1502xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;1503else1504xfb_info = 
pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;15051506#if GFX_VER == 71507# define streamout_state_dw pipeline->gfx7.streamout_state1508#else1509# define streamout_state_dw pipeline->gfx8.streamout_state1510#endif15111512struct GENX(3DSTATE_STREAMOUT) so = {1513GENX(3DSTATE_STREAMOUT_header),1514.RenderingDisable =1515(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?15160 : rs_info->rasterizerDiscardEnable,1517};15181519if (xfb_info) {1520so.SOFunctionEnable = true;1521so.SOStatisticsEnable = true;15221523switch (vk_provoking_vertex_mode(rs_info)) {1524case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:1525so.ReorderMode = LEADING;1526break;15271528case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:1529so.ReorderMode = TRAILING;1530break;15311532default:1533unreachable("Invalid provoking vertex mode");1534}15351536const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =1537vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);1538so.RenderStreamSelect = stream_info ?1539stream_info->rasterizationStream : 0;15401541#if GFX_VER >= 81542so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;1543so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;1544so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;1545so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;1546#else1547pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;1548pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;1549pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;1550pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;15511552/* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which1553* is a bit inconvenient because we don't know what buffers will1554* actually be enabled until draw time. 
We do our best here by1555* setting them based on buffers_written and we disable them1556* as-needed at draw time by setting EndAddress = BaseAddress.1557*/1558so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);1559so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);1560so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);1561so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);1562#endif15631564int urb_entry_read_offset = 0;1565int urb_entry_read_length =1566(prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;15671568/* We always read the whole vertex. This could be reduced at some1569* point by reading less and offsetting the register index in the1570* SO_DECLs.1571*/1572so.Stream0VertexReadOffset = urb_entry_read_offset;1573so.Stream0VertexReadLength = urb_entry_read_length - 1;1574so.Stream1VertexReadOffset = urb_entry_read_offset;1575so.Stream1VertexReadLength = urb_entry_read_length - 1;1576so.Stream2VertexReadOffset = urb_entry_read_offset;1577so.Stream2VertexReadLength = urb_entry_read_length - 1;1578so.Stream3VertexReadOffset = urb_entry_read_offset;1579so.Stream3VertexReadLength = urb_entry_read_length - 1;1580}15811582if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {1583GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);1584} else {1585anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)1586_so = so;1587}15881589if (xfb_info) {1590struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];1591int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};1592int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};15931594memset(so_decl, 0, sizeof(so_decl));15951596for (unsigned i = 0; i < xfb_info->output_count; i++) {1597const nir_xfb_output_info *output = &xfb_info->outputs[i];1598unsigned buffer = output->buffer;1599unsigned stream = xfb_info->buffer_to_stream[buffer];16001601/* Our hardware is unusual in that it requires us to program SO_DECLs1602* for fake "hole" components, rather than simply taking the 
offset1603* for each real varying. Each hole can have size 1, 2, 3, or 4; we1604* program as many size = 4 holes as we can, then a final hole to1605* accommodate the final 1, 2, or 3 remaining.1606*/1607int hole_dwords = (output->offset - next_offset[buffer]) / 4;1608while (hole_dwords > 0) {1609so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1610.HoleFlag = 1,1611.OutputBufferSlot = buffer,1612.ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,1613};1614hole_dwords -= 4;1615}16161617int varying = output->location;1618uint8_t component_mask = output->component_mask;1619/* VARYING_SLOT_PSIZ contains four scalar fields packed together:1620* - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x1621* - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y1622* - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z1623* - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w1624*/1625if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {1626varying = VARYING_SLOT_PSIZ;1627component_mask = 1 << 0; // SO_DECL_COMPMASK_X1628} else if (varying == VARYING_SLOT_LAYER) {1629varying = VARYING_SLOT_PSIZ;1630component_mask = 1 << 1; // SO_DECL_COMPMASK_Y1631} else if (varying == VARYING_SLOT_VIEWPORT) {1632varying = VARYING_SLOT_PSIZ;1633component_mask = 1 << 2; // SO_DECL_COMPMASK_Z1634} else if (varying == VARYING_SLOT_PSIZ) {1635component_mask = 1 << 3; // SO_DECL_COMPMASK_W1636}16371638next_offset[buffer] = output->offset +1639__builtin_popcount(component_mask) * 4;16401641const int slot = vue_map->varying_to_slot[varying];1642if (slot < 0) {1643/* This can happen if the shader never writes to the varying.1644* Insert a hole instead of actual varying data.1645*/1646so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1647.HoleFlag = true,1648.OutputBufferSlot = buffer,1649.ComponentMask = component_mask,1650};1651} else {1652so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {1653.OutputBufferSlot = buffer,1654.RegisterIndex = slot,1655.ComponentMask = 
component_mask,1656};1657}1658}16591660int max_decls = 0;1661for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)1662max_decls = MAX2(max_decls, decls[s]);16631664uint8_t sbs[MAX_XFB_STREAMS] = { };1665for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {1666if (xfb_info->buffers_written & (1 << b))1667sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;1668}16691670uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,1671GENX(3DSTATE_SO_DECL_LIST),1672.StreamtoBufferSelects0 = sbs[0],1673.StreamtoBufferSelects1 = sbs[1],1674.StreamtoBufferSelects2 = sbs[2],1675.StreamtoBufferSelects3 = sbs[3],1676.NumEntries0 = decls[0],1677.NumEntries1 = decls[1],1678.NumEntries2 = decls[2],1679.NumEntries3 = decls[3]);16801681for (int i = 0; i < max_decls; i++) {1682GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,1683&(struct GENX(SO_DECL_ENTRY)) {1684.Stream0Decl = so_decl[0][i],1685.Stream1Decl = so_decl[1][i],1686.Stream2Decl = so_decl[2][i],1687.Stream3Decl = so_decl[3][i],1688});1689}1690}1691}16921693static uint32_t1694get_sampler_count(const struct anv_shader_bin *bin)1695{1696uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);16971698/* We can potentially have way more than 32 samplers and that's ok.1699* However, the 3DSTATE_XS packets only have 3 bits to specify how1700* many to pre-fetch and all values above 4 are marked reserved.1701*/1702return MIN2(count_by_4, 4);1703}17041705static UNUSED struct anv_address1706get_scratch_address(struct anv_pipeline *pipeline,1707gl_shader_stage stage,1708const struct anv_shader_bin *bin)1709{1710return (struct anv_address) {1711.bo = anv_scratch_pool_alloc(pipeline->device,1712&pipeline->device->scratch_pool,1713stage, bin->prog_data->total_scratch),1714.offset = 0,1715};1716}17171718static UNUSED uint32_t1719get_scratch_space(const struct anv_shader_bin *bin)1720{1721return ffs(bin->prog_data->total_scratch / 2048);1722}17231724static UNUSED uint32_t1725get_scratch_surf(struct anv_pipeline *pipeline,1726const 
/* Emit 3DSTATE_VS for the pipeline's vertex shader.
 *
 * The VS stage is mandatory, so unlike the other emit_3dstate_* helpers
 * there is no disabled-stage path here.  Gen-specific fields are gated
 * with GFX_VER / GFX_VERx10 preprocessor checks.
 */
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const struct anv_shader_bin *vs_bin =
      pipeline->shaders[MESA_SHADER_VERTEX];

   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
      vs.Enable               = true;
      vs.StatisticsEnable     = true;
      vs.KernelStartPointer   = vs_bin->kernel.offset;
#if GFX_VER >= 8
      vs.SIMD8DispatchEnable  =
         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif

      assert(!vs_prog_data->base.base.use_alt_mode);
#if GFX_VER < 11
      vs.SingleVertexDispatch       = false;
#endif
      vs.VectorMaskEnable           = false;
      /* Wa_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
      vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
      vs.FloatingPointMode          = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable    = false;
      /* Field is a 0-based max-thread index, hence the - 1. */
      vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;

      if (GFX_VER == 9 && devinfo->gt == 4 &&
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         /* On Sky Lake GT4, we have experienced some hangs related to the VS
          * cache and tessellation.  It is unknown exactly what is happening
          * but the Haswell docs for the "VS Reference Count Full Force Miss
          * Enable" field of the "Thread Mode" register refer to a HSW bug in
          * which the VUE handle reference count would overflow resulting in
          * internal reference counting bugs.  My (Jason's) best guess is that
          * this bug cropped back up on SKL GT4 when we suddenly had more
          * threads in play than any previous gfx9 hardware.
          *
          * What we do know for sure is that setting this bit when
          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
          * Disabling the vertex cache with tessellation shaders should only
          * have a minor performance impact as the tessellation shaders are
          * likely generating and processing far more geometry than the vertex
          * stage.
          */
         vs.VertexCacheDisable = true;
      }

      vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset = 0;
      vs.DispatchGRFStartRegisterForURBData =
         vs_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER >= 8
      vs.UserClipDistanceClipTestEnableBitmask =
         vs_prog_data->base.clip_distance_mask;
      vs.UserClipDistanceCullTestEnableBitmask =
         vs_prog_data->base.cull_distance_mask;
#endif

      /* Xe-HP+ uses a scratch surface; older gens take a base address plus
       * a per-thread size encoding.
       */
#if GFX_VERx10 >= 125
      vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, vs_bin);
#else
      vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
#endif
   }
}
/* Emit 3DSTATE_HS, 3DSTATE_TE and 3DSTATE_DS for the tessellation stages.
 *
 * If the pipeline has no tessellation evaluation shader, all three packets
 * are emitted zeroed (stage disabled) and we return early.  tess_info may
 * be NULL; it is only consulted for the optional domain-origin extension
 * struct.
 */
static void
emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
                      const VkPipelineTessellationStateCreateInfo *tess_info)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      /* Zeroed packets disable the stages. */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
      return;
   }

   const struct intel_device_info *devinfo = &pipeline->base.device->info;
   const struct anv_shader_bin *tcs_bin =
      pipeline->shaders[MESA_SHADER_TESS_CTRL];
   const struct anv_shader_bin *tes_bin =
      pipeline->shaders[MESA_SHADER_TESS_EVAL];

   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* Wa_1606682166 */
      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;

#if GFX_VER >= 12
      /* Wa_1604578095:
       *
       * Hang occurs when the number of max threads is less than 2 times
       * the number of instance count. The number of max threads must be
       * more than 2 times the number of instance count.
       */
      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
#endif

      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      hs.IncludeVertexHandles = true;
      hs.InstanceCount = tcs_prog_data->instances - 1;

      hs.VertexURBEntryReadLength = 0;
      hs.VertexURBEntryReadOffset = 0;
      /* The GRF start register is split across two fields on gfx12+:
       * low 5 bits here, remaining bits in ...URBData5 below.
       */
      hs.DispatchGRFStartRegisterForURBData =
         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
#if GFX_VER >= 12
      hs.DispatchGRFStartRegisterForURBData5 =
         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
#endif

#if GFX_VERx10 >= 125
      hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tcs_bin);
#else
      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
      hs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
#endif

#if GFX_VER == 12
      /* Patch Count threshold specifies the maximum number of patches that
       * will be accumulated before a thread dispatch is forced.
       */
      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
#endif

#if GFX_VER >= 9
      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
#endif
   }

   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
      tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;

   VkTessellationDomainOrigin uv_origin =
      domain_origin_state ? domain_origin_state->domainOrigin :
                            VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
      te.Partitioning = tes_prog_data->partitioning;

      if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
         te.OutputTopology = tes_prog_data->output_topology;
      } else {
         /* When the origin is upper-left, we have to flip the winding order */
         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
            te.OutputTopology = OUTPUT_TRI_CW;
         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
            te.OutputTopology = OUTPUT_TRI_CCW;
         } else {
            /* Points and lines have no winding to flip. */
            te.OutputTopology = tes_prog_data->output_topology;
         }
      }

      te.TEDomain = tes_prog_data->domain;
      te.TEEnable = true;
      te.MaximumTessellationFactorOdd = 63.0;
      te.MaximumTessellationFactorNotOdd = 64.0;
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
      ds.Enable = true;
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = tes_bin->kernel.offset;
      /* Wa_1606682166 */
      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

      /* Triangle domains need the W barycentric coordinate computed. */
      ds.ComputeWCoordinateEnable =
         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
      ds.PatchURBEntryReadOffset = 0;
      ds.DispatchGRFStartRegisterForURBData =
         tes_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER >= 8
#if GFX_VER < 11
      ds.DispatchMode =
         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
            DISPATCH_MODE_SIMD4X2;
#else
      /* SIMD4x2 is gone on gfx11+; the compiler only emits SIMD8. */
      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif

      ds.UserClipDistanceClipTestEnableBitmask =
         tes_prog_data->base.clip_distance_mask;
      ds.UserClipDistanceCullTestEnableBitmask =
         tes_prog_data->base.cull_distance_mask;
#endif

#if GFX_VERx10 >= 125
      ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);
#else
      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
      ds.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
#endif
   }
}
0 : get_sampler_count(tes_bin);1920ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;1921ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;19221923ds.ComputeWCoordinateEnable =1924tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;19251926ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;1927ds.PatchURBEntryReadOffset = 0;1928ds.DispatchGRFStartRegisterForURBData =1929tes_prog_data->base.base.dispatch_grf_start_reg;19301931#if GFX_VER >= 81932#if GFX_VER < 111933ds.DispatchMode =1934tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?1935DISPATCH_MODE_SIMD8_SINGLE_PATCH :1936DISPATCH_MODE_SIMD4X2;1937#else1938assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);1939ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;1940#endif19411942ds.UserClipDistanceClipTestEnableBitmask =1943tes_prog_data->base.clip_distance_mask;1944ds.UserClipDistanceCullTestEnableBitmask =1945tes_prog_data->base.cull_distance_mask;1946#endif19471948#if GFX_VERx10 >= 1251949ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);1950#else1951ds.PerThreadScratchSpace = get_scratch_space(tes_bin);1952ds.ScratchSpaceBasePointer =1953get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);1954#endif1955}1956}19571958static void1959emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)1960{1961const struct intel_device_info *devinfo = &pipeline->base.device->info;1962const struct anv_shader_bin *gs_bin =1963pipeline->shaders[MESA_SHADER_GEOMETRY];19641965if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {1966anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);1967return;1968}19691970const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);19711972anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {1973gs.Enable = true;1974gs.StatisticsEnable = true;1975gs.KernelStartPointer = gs_bin->kernel.offset;1976gs.DispatchMode = gs_prog_data->base.dispatch_mode;19771978gs.SingleProgramFlow = 
false;1979gs.VectorMaskEnable = false;1980/* Wa_1606682166 */1981gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);1982gs.BindingTableEntryCount = gs_bin->bind_map.surface_count;1983gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;1984gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;19851986if (GFX_VER == 8) {1987/* Broadwell is weird. It needs us to divide by 2. */1988gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;1989} else {1990gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;1991}19921993gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;1994gs.OutputTopology = gs_prog_data->output_topology;1995gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;1996gs.ControlDataFormat = gs_prog_data->control_data_format;1997gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;1998gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;1999gs.ReorderMode = TRAILING;20002001#if GFX_VER >= 82002gs.ExpectedVertexCount = gs_prog_data->vertices_in;2003gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;2004gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?2005gs_prog_data->static_vertex_count : 0;2006#endif20072008gs.VertexURBEntryReadOffset = 0;2009gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;2010gs.DispatchGRFStartRegisterForURBData =2011gs_prog_data->base.base.dispatch_grf_start_reg;20122013#if GFX_VER >= 82014gs.UserClipDistanceClipTestEnableBitmask =2015gs_prog_data->base.clip_distance_mask;2016gs.UserClipDistanceCullTestEnableBitmask =2017gs_prog_data->base.cull_distance_mask;2018#endif20192020#if GFX_VERx10 >= 1252021gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, gs_bin);2022#else2023gs.PerThreadScratchSpace = get_scratch_space(gs_bin);2024gs.ScratchSpaceBasePointer =2025get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);2026#endif2027}2028}20292030static 
bool2031has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,2032const VkPipelineColorBlendStateCreateInfo *blend)2033{2034const struct anv_shader_bin *shader_bin =2035pipeline->shaders[MESA_SHADER_FRAGMENT];2036if (!shader_bin)2037return false;20382039if (!pipeline->dynamic_state.color_writes)2040return false;20412042const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;2043for (int i = 0; i < bind_map->surface_count; i++) {2044struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];20452046if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)2047continue;20482049if (binding->index == UINT32_MAX)2050continue;20512052if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)2053return true;2054}20552056return false;2057}20582059static void2060emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,2061const VkPipelineInputAssemblyStateCreateInfo *ia,2062const VkPipelineRasterizationStateCreateInfo *raster,2063const VkPipelineColorBlendStateCreateInfo *blend,2064const VkPipelineMultisampleStateCreateInfo *multisample,2065const VkPipelineRasterizationLineStateCreateInfoEXT *line,2066const uint32_t dynamic_states)2067{2068const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);20692070struct GENX(3DSTATE_WM) wm = {2071GENX(3DSTATE_WM_header),2072};2073wm.StatisticsEnable = true;2074wm.LineEndCapAntialiasingRegionWidth = _05pixels;2075wm.LineAntialiasingRegionWidth = _10pixels;2076wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;20772078if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2079if (wm_prog_data->early_fragment_tests) {2080wm.EarlyDepthStencilControl = EDSC_PREPS;2081} else if (wm_prog_data->has_side_effects) {2082wm.EarlyDepthStencilControl = EDSC_PSEXEC;2083} else {2084wm.EarlyDepthStencilControl = EDSC_NORMAL;2085}20862087#if GFX_VER >= 82088/* Gen8 hardware tries to compute ThreadDispatchEnable for us but2089* doesn't take into 
/* Emit (or pack for later dynamic emission) 3DSTATE_WM.
 *
 * Also computes pipeline->force_fragment_thread_dispatch as a side
 * effect.  If any of the states this packet depends on are dynamic, the
 * packed DWORDs are stored in pipeline->gfx8.wm / gfx7.wm so the command
 * buffer can patch and emit them at draw time; otherwise the packet is
 * emitted into the pipeline batch directly.
 */
static void
emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
                const VkPipelineInputAssemblyStateCreateInfo *ia,
                const VkPipelineRasterizationStateCreateInfo *raster,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample,
                const VkPipelineRasterizationLineStateCreateInfoEXT *line,
                const uint32_t dynamic_states)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(3DSTATE_WM) wm = {
      GENX(3DSTATE_WM_header),
   };
   wm.StatisticsEnable = true;
   wm.LineEndCapAntialiasingRegionWidth = _05pixels;
   wm.LineAntialiasingRegionWidth = _10pixels;
   wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      /* Early depth/stencil: forced-early when the shader opted in,
       * forced-after-PS when the shader has side effects, normal otherwise.
       */
      if (wm_prog_data->early_fragment_tests) {
         wm.EarlyDepthStencilControl = EDSC_PREPS;
      } else if (wm_prog_data->has_side_effects) {
         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
      } else {
         wm.EarlyDepthStencilControl = EDSC_NORMAL;
      }

#if GFX_VER >= 8
      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
       * doesn't take into account KillPixels when no depth or stencil
       * writes are enabled.  In order for occlusion queries to work
       * correctly with no attachments, we need to force-enable PS thread
       * dispatch.
       *
       * The BDW docs are pretty clear that that this bit isn't validated
       * and probably shouldn't be used in production:
       *
       *    "This must always be set to Normal. This field should not be
       *    tested for functional validation."
       *
       * Unfortunately, however, the other mechanism we have for doing this
       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
       * Given two bad options, we choose the one which works.
       */
      pipeline->force_fragment_thread_dispatch =
         wm_prog_data->has_side_effects ||
         wm_prog_data->uses_kill;

      if (pipeline->force_fragment_thread_dispatch ||
          !has_color_buffer_write_enabled(pipeline, blend)) {
         /* Only set this value in non dynamic mode. */
         wm.ForceThreadDispatchEnable =
            !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
      }
#endif

      wm.BarycentricInterpolationMode =
         wm_prog_data->barycentric_interp_modes;

#if GFX_VER < 8
      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;

      /* If the subpass has a depth or stencil self-dependency, then we
       * need to force the hardware to do the depth/stencil write *after*
       * fragment shader execution.  Otherwise, the writes may hit memory
       * before we get around to fetching from the input attachment and we
       * may get the depth or stencil value from the current draw rather
       * than the previous one.
       */
      wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
                                 wm_prog_data->uses_kill;

      pipeline->force_fragment_thread_dispatch =
         wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
         wm_prog_data->has_side_effects ||
         wm.PixelShaderKillsPixel;

      if (pipeline->force_fragment_thread_dispatch ||
          has_color_buffer_write_enabled(pipeline, blend)) {
         /* Only set this value in non dynamic mode. */
         wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
      }

      if (multisample && multisample->rasterizationSamples > 1) {
         if (wm_prog_data->persample_dispatch) {
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         } else {
            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
         }
      } else {
         /* Single-sampled: per-sample and per-pixel are equivalent. */
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }

      VkPolygonMode raster_mode =
         genX(raster_polygon_mode)(pipeline, ia->topology);

      /* Left at 0 when topology is dynamic; filled in at draw time. */
      wm.MultisampleRasterizationMode =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
         genX(ms_rasterization_mode)(pipeline, raster_mode);
#endif

      wm.LineStippleEnable = line && line->stippledLineEnable;
   }

   /* States that, when dynamic, require re-packing this packet at draw
    * time instead of emitting it into the pipeline batch now.
    */
   uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;

#if GFX_VER < 8
   dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
#endif

   if (dynamic_states & dynamic_wm_states) {
      const struct intel_device_info *devinfo = &pipeline->base.device->info;
      uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
      GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
   } else {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
         _wm = wm;
   }
}
/* Emit 3DSTATE_PS for the pipeline's fragment shader.
 *
 * With no fragment stage, a mostly-zeroed packet is emitted (gfx7 still
 * needs MaximumNumberofThreads programmed to avoid a hang).  Otherwise
 * this programs the SIMD8/16/32 kernel pointers and dispatch state from
 * brw_wm_prog_data.
 */
static void
emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample)
{
   UNUSED const struct intel_device_info *devinfo =
      &pipeline->base.device->info;
   const struct anv_shader_bin *fs_bin =
      pipeline->shaders[MESA_SHADER_FRAGMENT];

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
#if GFX_VER == 7
         /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
          * we don't at least set the maximum number of threads.
          */
         ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
#endif
      }
      return;
   }

   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

#if GFX_VER < 8
   /* The hardware wedges if you have this bit set but don't turn on any dual
    * source blend factors.
    */
   bool dual_src_blend = false;
   if (wm_prog_data->dual_src_blend && blend) {
      for (uint32_t i = 0; i < blend->attachmentCount; i++) {
         const VkPipelineColorBlendAttachmentState *bstate =
            &blend->pAttachments[i];

         if (bstate->blendEnable &&
             (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
              is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
              is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
              is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
            dual_src_blend = true;
            break;
         }
      }
   }
#endif

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
      ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;

      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
       *
       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
       *
       * Since 16x MSAA is first introduced on SKL, we don't need to apply
       * the workaround on any older hardware.
       */
      if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
          multisample && multisample->rasterizationSamples == 16) {
         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
         ps._32PixelDispatchEnable = false;
      }

      /* Kernel pointers are assigned per enabled dispatch width; the
       * brw_wm_prog_data_prog_offset macro reads the ps struct's enable
       * bits, so these must come after the dispatch enables are final.
       */
      ps.KernelStartPointer0 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
      ps.KernelStartPointer1 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
      ps.KernelStartPointer2 = fs_bin->kernel.offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);

      ps.SingleProgramFlow = false;
      ps.VectorMaskEnable = GFX_VER >= 8;
      /* Wa_1606682166 */
      ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
      ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
      ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
                              wm_prog_data->base.ubo_ranges[0].length;
      ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
                                  POSOFFSET_SAMPLE : POSOFFSET_NONE;
#if GFX_VER < 8
      ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
      ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      ps.DualSourceBlendEnable = dual_src_blend;
#endif

#if GFX_VERx10 == 75
      /* Haswell requires the sample mask to be set in this packet as well
       * as in 3DSTATE_SAMPLE_MASK; the values should match.
       */
      ps.SampleMask = 0xff;
#endif

#if GFX_VER >= 9
      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
#elif GFX_VER >= 8
      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
#endif

      ps.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
      ps.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
      ps.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

#if GFX_VERx10 >= 125
      ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);
#else
      ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
      ps.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
#endif
   }
}
0);2287ps.DispatchGRFStartRegisterForConstantSetupData1 =2288brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);2289ps.DispatchGRFStartRegisterForConstantSetupData2 =2290brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);22912292#if GFX_VERx10 >= 1252293ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);2294#else2295ps.PerThreadScratchSpace = get_scratch_space(fs_bin);2296ps.ScratchSpaceBasePointer =2297get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);2298#endif2299}2300}23012302#if GFX_VER >= 82303static void2304emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,2305struct anv_subpass *subpass,2306const VkPipelineRasterizationStateCreateInfo *rs_info)2307{2308const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);23092310if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2311anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);2312return;2313}23142315anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {2316ps.PixelShaderValid = true;2317ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;2318ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;2319ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;2320ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;2321ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;2322ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;23232324/* If the subpass has a depth or stencil self-dependency, then we need2325* to force the hardware to do the depth/stencil write *after* fragment2326* shader execution. 
Otherwise, the writes may hit memory before we get2327* around to fetching from the input attachment and we may get the depth2328* or stencil value from the current draw rather than the previous one.2329*/2330ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||2331wm_prog_data->uses_kill;23322333#if GFX_VER >= 92334ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;2335ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;23362337ps.InputCoverageMaskState = ICMS_NONE;2338assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */2339if (!wm_prog_data->uses_sample_mask)2340ps.InputCoverageMaskState = ICMS_NONE;2341else if (wm_prog_data->per_coarse_pixel_dispatch)2342ps.InputCoverageMaskState = ICMS_NORMAL;2343else if (wm_prog_data->post_depth_coverage)2344ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;2345else2346ps.InputCoverageMaskState = ICMS_NORMAL;2347#else2348ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;2349#endif23502351#if GFX_VER >= 112352ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =2353wm_prog_data->uses_depth_w_coefficients;2354ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;2355#endif2356}2357}23582359static void2360emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)2361{2362anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {2363vft.PrimitiveTopologyType = pipeline->topology;2364}2365}2366#endif23672368static void2369emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)2370{2371anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {2372vfs.StatisticsEnable = true;2373}2374}23752376static void2377compute_kill_pixel(struct anv_graphics_pipeline *pipeline,2378const VkPipelineMultisampleStateCreateInfo *ms_info,2379const struct anv_subpass *subpass)2380{2381if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {2382pipeline->kill_pixel = false;2383return;2384}23852386const struct brw_wm_prog_data 
#if GFX_VER == 12
/* Emit 3DSTATE_PRIMITIVE_REPLICATION for multiview via primitive
 * replication (gfx12 only).  When replication is not in use, a zeroed
 * packet disables it.  Each set bit in the subpass view mask becomes one
 * replica whose RTAIOffset selects the corresponding render-target array
 * layer.
 */
static void
emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
{
   if (!pipeline->use_primitive_replication) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
      return;
   }

   const uint32_t view_mask = pipeline->subpass->view_mask;
   const int num_views = util_bitcount(view_mask);
   assert(num_views > 1 && num_views <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
      pr.ReplicaMask = (1 << num_views) - 1;
      pr.ReplicationCount = num_views - 1;

      int slot = 0;
      u_foreach_bit(view_index, view_mask)
         pr.RTAIOffset[slot++] = view_index;
   }
}
#endif
/* Create a graphics pipeline: allocate and initialize the anv pipeline
 * object, then emit all static 3DSTATE_* packets into its batch.  Returns
 * the batch status so any batch-emission error surfaces to the caller.
 */
static VkResult
genX(graphics_pipeline_create)(
    VkDevice                                    _device,
    struct anv_pipeline_cache *                 cache,
    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipeline)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   struct anv_graphics_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);

   /* Use the default pipeline cache if none is specified */
   if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
      cache = &device->default_pipeline_cache;

   pipeline = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_graphics_pipeline_init(pipeline, device, cache,
                                       pCreateInfo, pAllocator);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, pAllocator, pipeline);
      /* VK_EXT_pipeline_creation_cache_control: a compile-required result
       * is not a hard failure, but the handle must be null.
       */
      if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
         *pPipeline = VK_NULL_HANDLE;
      return result;
   }

   /* Information on which states are considered dynamic. */
   const VkPipelineDynamicStateCreateInfo *dyn_info =
      pCreateInfo->pDynamicState;
   uint32_t dynamic_states = 0;
   if (dyn_info) {
      for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
         dynamic_states |=
            anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
   }

   /* If rasterization is not enabled, various CreateInfo structs must be
    * ignored.
    */
   const bool raster_enabled =
      !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
      (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);

   const VkPipelineViewportStateCreateInfo *vp_info =
      raster_enabled ? pCreateInfo->pViewportState : NULL;

   const VkPipelineMultisampleStateCreateInfo *ms_info =
      raster_enabled ? pCreateInfo->pMultisampleState : NULL;

   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      raster_enabled ? pCreateInfo->pDepthStencilState : NULL;

   const VkPipelineColorBlendStateCreateInfo *cb_info =
      raster_enabled ? pCreateInfo->pColorBlendState : NULL;

   const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
                           PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);

   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_setup(pipeline, &urb_deref_block_size);

   assert(pCreateInfo->pVertexInputState);
   emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
   assert(pCreateInfo->pRasterizationState);
   emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
                 pCreateInfo->pRasterizationState,
                 ms_info, line_info, dynamic_states, pass, subpass,
                 urb_deref_block_size);
   emit_ms_state(pipeline, ms_info, dynamic_states);
   emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
   emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
   compute_kill_pixel(pipeline, ms_info, subpass);

   emit_3dstate_clip(pipeline,
                     pCreateInfo->pInputAssemblyState,
                     vp_info,
                     pCreateInfo->pRasterizationState,
                     dynamic_states);
   emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
                          dynamic_states);

#if GFX_VER == 12
   emit_3dstate_primitive_replication(pipeline);
#endif

#if 0
   /* From gfx7_vs_state.c */

   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *    "Note: Because of corruption in IVB:GT2, software needs to flush the
    *    whole fixed function pipeline when the GS enable changes value in
    *    the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (!device->info.is_haswell && !device->info.is_baytrail)
      gfx7_emit_vs_workaround_flush(brw);
#endif

   /* Per-stage shader state, in pipeline order. */
   emit_3dstate_vs(pipeline);
   emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
   emit_3dstate_gs(pipeline);
   emit_3dstate_sbe(pipeline);
   emit_3dstate_wm(pipeline, subpass,
                   pCreateInfo->pInputAssemblyState,
                   pCreateInfo->pRasterizationState,
                   cb_info, ms_info, line_info, dynamic_states);
   emit_3dstate_ps(pipeline, cb_info, ms_info);
#if GFX_VER >= 8
   emit_3dstate_ps_extra(pipeline, subpass,
                         pCreateInfo->pRasterizationState);

   /* When topology is dynamic, 3DSTATE_VF_TOPOLOGY is emitted at draw
    * time instead.
    */
   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
      emit_3dstate_vf_topology(pipeline);
#endif
   emit_3dstate_vf_statistics(pipeline);

   *pPipeline = anv_pipeline_to_handle(&pipeline->base);

   return pipeline->base.batch.status;
}
#if GFX_VERx10 >= 125

/* Emit compute pipeline state for Xe-HP+: just CFE_STATE.  The per-kernel
 * dispatch parameters are programmed at dispatch time on these platforms.
 */
static void
emit_compute_state(struct anv_compute_pipeline *pipeline,
                   const struct anv_device *device)
{
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);

   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
   const struct intel_device_info *devinfo = &device->info;

   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
      /* 0-based max-thread index across all subslices. */
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * subslices - 1;
      cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
   }
}

#else /* #if GFX_VERx10 >= 125 */

/* Emit compute pipeline state for pre-Xe-HP: MEDIA_VFE_STATE into the
 * pipeline batch plus a packed INTERFACE_DESCRIPTOR_DATA stored on the
 * pipeline for later MEDIA_INTERFACE_DESCRIPTOR_LOAD.
 */
static void
emit_compute_state(struct anv_compute_pipeline *pipeline,
                   const struct anv_device *device)
{
   const struct intel_device_info *devinfo = &device->info;
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);

   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
   /* CURBE allocation covers per-thread plus cross-thread push constants,
    * aligned to an even number of registers.
    */
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
            cs_prog_data->push.cross_thread.regs, 2);

   const uint32_t subslices = MAX2(device->physical->subslice_total, 1);

   const struct anv_shader_bin *cs_bin = pipeline->cs;

   anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
#if GFX_VER > 7
      vfe.StackSize              = 0;
#else
      vfe.GPGPUMode              = true;
#endif
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * subslices - 1;
      vfe.NumberofURBEntries     = GFX_VER <= 7 ? 0 : 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer      = true;
#endif
#if GFX_VER <= 8
      vfe.BypassGatewayControl   = true;
#endif
      vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
      vfe.CURBEAllocationSize    = vfe_curbe_allocation;

      if (cs_bin->prog_data->total_scratch) {
         /* The per-thread scratch size encoding differs per generation. */
         if (GFX_VER >= 8) {
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 11;
         } else if (GFX_VERx10 == 75) {
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 12;
         } else {
            /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace =
               cs_bin->prog_data->total_scratch / 1024 - 1;
         }
         vfe.ScratchSpaceBasePointer =
            get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
      }
   }

   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer =
         cs_bin->kernel.offset +
         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),

      /* Wa_1606682166 */
      .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
      /* We add 1 because the CS indirect parameters buffer isn't accounted
       * for in bind_map.surface_count.
       */
      .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize =
         encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),

#if GFX_VERx10 != 75
      .ConstantURBEntryReadOffset = 0,
#endif
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
#if GFX_VERx10 >= 75
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
#if GFX_VER >= 12
      /* TODO: Check if we are missing workarounds and enable mid-thread
       * preemption.
       *
       * We still have issues with mid-thread preemption (it was already
       * disabled by the kernel on gfx11, due to missing workarounds). It's
       * possible that we are just missing some workarounds, and could enable
       * it later, but for now let's disable it to fix a GPU in compute in Car
       * Chase (and possibly more).
       */
      .ThreadPreemptionDisable = true,
#endif

      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
   };
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
                                        pipeline->interface_descriptor_data,
                                        &desc);
}

#endif /* #if GFX_VERx10 >= 125 */
0 : get_sampler_count(cs_bin),2663/* We add 1 because the CS indirect parameters buffer isn't accounted2664* for in bind_map.surface_count.2665*/2666.BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),2667.BarrierEnable = cs_prog_data->uses_barrier,2668.SharedLocalMemorySize =2669encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),26702671#if GFX_VERx10 != 752672.ConstantURBEntryReadOffset = 0,2673#endif2674.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,2675#if GFX_VERx10 >= 752676.CrossThreadConstantDataReadLength =2677cs_prog_data->push.cross_thread.regs,2678#endif2679#if GFX_VER >= 122680/* TODO: Check if we are missing workarounds and enable mid-thread2681* preemption.2682*2683* We still have issues with mid-thread preemption (it was already2684* disabled by the kernel on gfx11, due to missing workarounds). It's2685* possible that we are just missing some workarounds, and could enable2686* it later, but for now let's disable it to fix a GPU in compute in Car2687* Chase (and possibly more).2688*/2689.ThreadPreemptionDisable = true,2690#endif26912692.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,2693};2694GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,2695pipeline->interface_descriptor_data,2696&desc);2697}26982699#endif /* #if GFX_VERx10 >= 125 */27002701static VkResult2702compute_pipeline_create(2703VkDevice _device,2704struct anv_pipeline_cache * cache,2705const VkComputePipelineCreateInfo* pCreateInfo,2706const VkAllocationCallbacks* pAllocator,2707VkPipeline* pPipeline)2708{2709ANV_FROM_HANDLE(anv_device, device, _device);2710struct anv_compute_pipeline *pipeline;2711VkResult result;27122713assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);27142715/* Use the default pipeline cache if none is specified */2716if (cache == NULL && device->physical->instance->pipeline_cache_enabled)2717cache = &device->default_pipeline_cache;27182719pipeline = vk_alloc2(&device->vk.alloc, pAllocator, 
sizeof(*pipeline), 8,2720VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);2721if (pipeline == NULL)2722return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);27232724result = anv_pipeline_init(&pipeline->base, device,2725ANV_PIPELINE_COMPUTE, pCreateInfo->flags,2726pAllocator);2727if (result != VK_SUCCESS) {2728vk_free2(&device->vk.alloc, pAllocator, pipeline);2729return result;2730}27312732anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,2733pipeline->batch_data, sizeof(pipeline->batch_data));27342735pipeline->cs = NULL;27362737assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);2738VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);2739result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,2740pCreateInfo->stage.pName,2741pCreateInfo->stage.pSpecializationInfo);2742if (result != VK_SUCCESS) {2743anv_pipeline_finish(&pipeline->base, device, pAllocator);2744vk_free2(&device->vk.alloc, pAllocator, pipeline);2745if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)2746*pPipeline = VK_NULL_HANDLE;2747return result;2748}27492750emit_compute_state(pipeline, device);27512752*pPipeline = anv_pipeline_to_handle(&pipeline->base);27532754return pipeline->base.batch.status;2755}27562757VkResult genX(CreateGraphicsPipelines)(2758VkDevice _device,2759VkPipelineCache pipelineCache,2760uint32_t count,2761const VkGraphicsPipelineCreateInfo* pCreateInfos,2762const VkAllocationCallbacks* pAllocator,2763VkPipeline* pPipelines)2764{2765ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);27662767VkResult result = VK_SUCCESS;27682769unsigned i;2770for (i = 0; i < count; i++) {2771VkResult res = genX(graphics_pipeline_create)(_device,2772pipeline_cache,2773&pCreateInfos[i],2774pAllocator, &pPipelines[i]);27752776if (res == VK_SUCCESS)2777continue;27782779/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it2780* is not obvious what error should be report upon 2 different failures.2781* */2782result = res;2783if (res != 
VK_PIPELINE_COMPILE_REQUIRED_EXT)2784break;27852786if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)2787break;2788}27892790for (; i < count; i++)2791pPipelines[i] = VK_NULL_HANDLE;27922793return result;2794}27952796VkResult genX(CreateComputePipelines)(2797VkDevice _device,2798VkPipelineCache pipelineCache,2799uint32_t count,2800const VkComputePipelineCreateInfo* pCreateInfos,2801const VkAllocationCallbacks* pAllocator,2802VkPipeline* pPipelines)2803{2804ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);28052806VkResult result = VK_SUCCESS;28072808unsigned i;2809for (i = 0; i < count; i++) {2810VkResult res = compute_pipeline_create(_device, pipeline_cache,2811&pCreateInfos[i],2812pAllocator, &pPipelines[i]);28132814if (res == VK_SUCCESS)2815continue;28162817/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it2818* is not obvious what error should be report upon 2 different failures.2819* */2820result = res;2821if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)2822break;28232824if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)2825break;2826}28272828for (; i < count; i++)2829pPipelines[i] = VK_NULL_HANDLE;28302831return result;2832}28332834#if GFX_VERx10 >= 12528352836static void2837assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,2838uint32_t stage_idx,2839VkShaderStageFlags valid_stages)2840{2841if (stage_idx == VK_SHADER_UNUSED_KHR)2842return;28432844assert(stage_idx <= pCreateInfo->stageCount);2845assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);2846assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);2847}28482849static VkResult2850ray_tracing_pipeline_create(2851VkDevice _device,2852struct anv_pipeline_cache * cache,2853const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,2854const VkAllocationCallbacks* pAllocator,2855VkPipeline* pPipeline)2856{2857ANV_FROM_HANDLE(anv_device, device, _device);2858VkResult 
result;28592860assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);28612862/* Use the default pipeline cache if none is specified */2863if (cache == NULL && device->physical->instance->pipeline_cache_enabled)2864cache = &device->default_pipeline_cache;28652866VK_MULTIALLOC(ma);2867VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);2868VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);2869if (!vk_multialloc_alloc2(&ma, &device->vk.alloc, pAllocator,2870VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))2871return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);28722873result = anv_pipeline_init(&pipeline->base, device,2874ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,2875pAllocator);2876if (result != VK_SUCCESS) {2877vk_free2(&device->vk.alloc, pAllocator, pipeline);2878return result;2879}28802881pipeline->group_count = pCreateInfo->groupCount;2882pipeline->groups = groups;28832884ASSERTED const VkShaderStageFlags ray_tracing_stages =2885VK_SHADER_STAGE_RAYGEN_BIT_KHR |2886VK_SHADER_STAGE_ANY_HIT_BIT_KHR |2887VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |2888VK_SHADER_STAGE_MISS_BIT_KHR |2889VK_SHADER_STAGE_INTERSECTION_BIT_KHR |2890VK_SHADER_STAGE_CALLABLE_BIT_KHR;28912892for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)2893assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);28942895for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {2896const VkRayTracingShaderGroupCreateInfoKHR *ginfo =2897&pCreateInfo->pGroups[i];2898assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,2899VK_SHADER_STAGE_RAYGEN_BIT_KHR |2900VK_SHADER_STAGE_MISS_BIT_KHR |2901VK_SHADER_STAGE_CALLABLE_BIT_KHR);2902assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,2903VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);2904assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,2905VK_SHADER_STAGE_ANY_HIT_BIT_KHR);2906assert_rt_stage_index_valid(pCreateInfo, 
ginfo->intersectionShader,2907VK_SHADER_STAGE_INTERSECTION_BIT_KHR);2908switch (ginfo->type) {2909case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:2910assert(ginfo->generalShader < pCreateInfo->stageCount);2911assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);2912assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);2913assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);2914break;29152916case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:2917assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);2918assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);2919break;29202921case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:2922assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);2923break;29242925default:2926unreachable("Invalid ray-tracing shader group type");2927}2928}29292930result = anv_ray_tracing_pipeline_init(pipeline, device, cache,2931pCreateInfo, pAllocator);2932if (result != VK_SUCCESS) {2933anv_pipeline_finish(&pipeline->base, device, pAllocator);2934vk_free2(&device->vk.alloc, pAllocator, pipeline);2935return result;2936}29372938for (uint32_t i = 0; i < pipeline->group_count; i++) {2939struct anv_rt_shader_group *group = &pipeline->groups[i];29402941switch (group->type) {2942case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {2943struct GFX_RT_GENERAL_SBT_HANDLE sh = {};2944sh.General = anv_shader_bin_get_bsr(group->general, 32);2945GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);2946break;2947}29482949case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {2950struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};2951if (group->closest_hit)2952sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);2953if (group->any_hit)2954sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);2955GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);2956break;2957}29582959case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {2960struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};2961if 
(group->closest_hit)2962sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);2963sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);2964GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);2965break;2966}29672968default:2969unreachable("Invalid shader group type");2970}2971}29722973*pPipeline = anv_pipeline_to_handle(&pipeline->base);29742975return pipeline->base.batch.status;2976}29772978VkResult2979genX(CreateRayTracingPipelinesKHR)(2980VkDevice _device,2981VkDeferredOperationKHR deferredOperation,2982VkPipelineCache pipelineCache,2983uint32_t createInfoCount,2984const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,2985const VkAllocationCallbacks* pAllocator,2986VkPipeline* pPipelines)2987{2988ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);29892990VkResult result = VK_SUCCESS;29912992unsigned i;2993for (i = 0; i < createInfoCount; i++) {2994VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,2995&pCreateInfos[i],2996pAllocator, &pPipelines[i]);29972998if (res == VK_SUCCESS)2999continue;30003001/* Bail out on the first error as it is not obvious what error should be3002* report upon 2 different failures. */3003result = res;3004if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)3005break;30063007if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)3008break;3009}30103011for (; i < createInfoCount; i++)3012pPipelines[i] = VK_NULL_HANDLE;30133014return result;3015}3016#endif /* GFX_VERx10 >= 125 */301730183019