Path: blob/21.2-virgl/src/intel/vulkan/genX_cmd_buffer.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_format.h"
#include "vk_util.h"
#include "util/fast_idiv_by_const.h"

#include "common/intel_aux_map.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "nir/nir_xfb_info.h"

/* We reserve:
 *  - GPR 14 for secondary command buffer returns
 *  - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
                                        uint32_t pipeline);
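/* Translate the flush/invalidate bits of an already-packed PIPE_CONTROL back
 * into the corresponding ANV_PIPE_* bits.  Only used by the
 * anv_debug_dump_pc() macro below to report which pipe controls get emitted
 * and why.
 */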
static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
   enum anv_pipe_bits bits = 0;
   bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
   bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
#if GFX_VER >= 12
   bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
   bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
   bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
   bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
   bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
   bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
   bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
   return bits;
}

#define anv_debug_dump_pc(pc) \
   if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL)) { \
      fputs("pc: emit PC=( ", stderr); \
      anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
      fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
   }

void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   UNUSED const struct intel_device_info *devinfo = &device->info;
   uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);

   /* If we are emitting a new state base address we probably need to re-emit
    * binding tables.
    */
   cmd_buffer->state.descriptors_dirty |= ~0;

   /* Emit a render target cache flush.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  Without
    * this, we get GPU hangs when using multi-level command buffers which
    * clear depth, reset state base address, and then go render stuff.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VER >= 12
      pc.HDCPipelineFlushEnable = true;
#else
      pc.DCFlushEnable = true;
#endif
      pc.RenderTargetCacheFlushEnable = true;
      pc.CommandStreamerStallEnable = true;
#if GFX_VER == 12
      /* Wa_1606662791:
       *
       *    Software must program PIPE_CONTROL command with "HDC Pipeline
       *    Flush" prior to programming of the below two non-pipeline state:
       *       * STATE_BASE_ADDRESS
       *       * 3DSTATE_BINDING_TABLE_POOL_ALLOC
       */
      if (devinfo->revision == 0 /* A0 */)
         pc.HDCPipelineFlushEnable = true;
#endif
      anv_debug_dump_pc(pc);
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Workaround the non-pipelined state not applying in MEDIA/GPGPU pipeline
    * mode by putting the pipeline temporarily in 3D mode.
    */
   uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
   genX(flush_pipeline_select_3d)(cmd_buffer);
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         anv_cmd_buffer_surface_base_address(cmd_buffer);
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;

#  if (GFX_VER >= 8)
      /* Broadwell requires that we specify a buffer size for a bunch of
       * these fields.  However, since we will be growing the BOs live, we
       * just set them all to the maximum.
       */
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      if (anv_use_softpin(device->physical)) {
         /* With softpin, we use fixed addresses so we actually know how big
          * our base addresses are.
          */
         sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
         sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
      } else {
         sba.DynamicStateBufferSize = 0xfffff;
         sba.InstructionBufferSize = 0xfffff;
      }
      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#  else
      /* On gfx7, we have upper bounds instead.  According to the docs,
       * setting an upper bound of zero means that no bounds checking is
       * performed so, in theory, we should be able to leave them zero.
       * However, border color is broken and the GPU bounds-checks anyway.
       * To avoid this and other potential problems, we may as well set it
       * for everything.
       */
      sba.GeneralStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.DynamicStateAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
      sba.InstructionAccessUpperBound =
         (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
      sba.InstructionAccessUpperBoundModifyEnable = true;
#  endif
#  if (GFX_VER >= 9)
      if (anv_use_softpin(device->physical)) {
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .bo = device->surface_state_pool.block_pool.bo,
            .offset = 0,
         };
         sba.BindlessSurfaceStateSize = (1 << 20) - 1;
      } else {
         sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
         sba.BindlessSurfaceStateSize = 0;
      }
      sba.BindlessSurfaceStateMOCS = mocs;
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
#  endif
#  if (GFX_VER >= 10)
      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
#  endif
   }

#if GFX_VER == 12
   /* Wa_1607854226:
    *
    * Put the pipeline back into its current mode.
    */
   if (gfx12_wa_pipeline != UINT32_MAX)
      genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif

   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      anv_debug_dump_pc(pc);
   }
}
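/* Record a relocation for the address field of a surface state entry.  With
 * softpin we only need to track the target BO; otherwise the relocation list
 * patches the address at ss.addr_offset within the surface state.
 */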
static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_state state, struct anv_address addr)
{
   VkResult result;

   if (anv_use_softpin(cmd_buffer->device->physical)) {
      result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
                                     &cmd_buffer->pool->alloc,
                                     addr.bo);
   } else {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
                                  &cmd_buffer->pool->alloc,
                                  state.offset + isl_dev->ss.addr_offset,
                                  addr.bo, addr.offset, NULL);
   }

   if (unlikely(result != VK_SUCCESS))
      anv_batch_set_error(&cmd_buffer->batch, result);
}

static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
                         struct anv_surface_state state)
{
   const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;

   assert(!anv_address_is_null(state.address));
   add_surface_reloc(cmd_buffer, state.state, state.address);

   if (!anv_address_is_null(state.aux_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset + isl_dev->ss.aux_addr_offset,
                            state.aux_address.bo,
                            state.aux_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }

   if (!anv_address_is_null(state.clear_address)) {
      VkResult result =
         anv_reloc_list_add(&cmd_buffer->surface_relocs,
                            &cmd_buffer->pool->alloc,
                            state.state.offset +
                            isl_dev->ss.clear_color_state_offset,
                            state.clear_address.bo,
                            state.clear_address.offset,
                            NULL);
      if (result != VK_SUCCESS)
         anv_batch_set_error(&cmd_buffer->batch, result);
   }
}

static bool
isl_color_value_requires_conversion(union isl_color_value color,
                                    const struct isl_surf *surf,
                                    const struct isl_view *view)
{
   if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
      return false;

   uint32_t surf_pack[4] = { 0, 0, 0, 0 };
   isl_color_value_pack(&color, surf->format, surf_pack);

   uint32_t view_pack[4] = { 0, 0, 0, 0 };
   union isl_color_value swiz_color =
      isl_color_value_swizzle_inv(color, view->swizzle);
   isl_color_value_pack(&swiz_color, view->format, view_pack);

   return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
}
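/* Decide whether the given color attachment view can be fast-cleared with
 * the given clear color, layout, and render area.
 */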
static bool
anv_can_fast_clear_color_view(struct anv_device *device,
                              struct anv_image_view *iview,
                              VkImageLayout layout,
                              union isl_color_value clear_color,
                              uint32_t num_layers,
                              VkRect2D render_area)
{
   if (iview->planes[0].isl.base_array_layer >=
       anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
                            iview->planes[0].isl.base_level))
      return false;

   /* Start by getting the fast clear type.  We use the first subpass
    * layout here because we don't want to fast-clear if the first subpass
    * to use the attachment can't handle fast-clears.
    */
   enum anv_fast_clear_type fast_clear_type =
      anv_layout_to_fast_clear_type(&device->info, iview->image,
                                    VK_IMAGE_ASPECT_COLOR_BIT,
                                    layout);
   switch (fast_clear_type) {
   case ANV_FAST_CLEAR_NONE:
      return false;
   case ANV_FAST_CLEAR_DEFAULT_VALUE:
      if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
         return false;
      break;
   case ANV_FAST_CLEAR_ANY:
      break;
   }

   /* Potentially, we could do partial fast-clears but doing so has crazy
    * alignment restrictions.  It's easier to just restrict to full size
    * fast clears for now.
    */
   if (render_area.offset.x != 0 ||
       render_area.offset.y != 0 ||
       render_area.extent.width != iview->extent.width ||
       render_area.extent.height != iview->extent.height)
      return false;

   /* On Broadwell and earlier, we can only handle 0/1 clear colors */
   if (GFX_VER <= 8 &&
       !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
      return false;

   /* If the clear color is one that would require non-trivial format
    * conversion on resolve, we don't bother with the fast clear.  This
    * shouldn't be common as most clear colors are 0/1 and the most common
    * format re-interpretation is for sRGB.
    */
   if (isl_color_value_requires_conversion(clear_color,
                                           &iview->image->planes[0].primary_surface.isl,
                                           &iview->planes[0].isl)) {
      anv_perf_warn(device, &iview->base,
                    "Cannot fast-clear to colors which would require "
                    "format conversion on resolve");
      return false;
   }

   /* We only allow fast clears to the first slice of an image (level 0,
    * layer 0) and only for the entire slice.  This guarantees us that, at
    * any given time, there is only one clear color on any given image.  At
    * the time of our testing (Jan 17, 2018), there were no known
    * applications which would benefit from fast-clearing more than just
    * the first slice.
    */
   if (iview->planes[0].isl.base_level > 0 ||
       iview->planes[0].isl.base_array_layer > 0) {
      anv_perf_warn(device, &iview->image->base,
                    "Rendering with multi-lod or multi-layer framebuffer "
                    "with LOAD_OP_LOAD and baseMipLevel > 0 or "
                    "baseArrayLayer > 0.  Not fast clearing.");
      return false;
   }

   if (num_layers > 1) {
      anv_perf_warn(device, &iview->image->base,
                    "Rendering to a multi-layer framebuffer with "
                    "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
   }

   return true;
}
static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
                          struct anv_image_view *iview,
                          VkImageLayout layout,
                          VkImageAspectFlags clear_aspects,
                          float depth_clear_value,
                          VkRect2D render_area)
{
   /* We don't do any HiZ or depth fast-clears on gfx7 yet */
   if (GFX_VER == 7)
      return false;

   /* If we're just clearing stencil, we can always HiZ clear */
   if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return true;

   /* We must have depth in order to have HiZ */
   if (!(iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
      return false;

   const enum isl_aux_usage clear_aux_usage =
      anv_layout_to_aux_usage(&device->info, iview->image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
                              layout);
   if (!blorp_can_hiz_clear_depth(&device->info,
                                  &iview->image->planes[0].primary_surface.isl,
                                  clear_aux_usage,
                                  iview->planes[0].isl.base_level,
                                  iview->planes[0].isl.base_array_layer,
                                  render_area.offset.x,
                                  render_area.offset.y,
                                  render_area.offset.x +
                                  render_area.extent.width,
                                  render_area.offset.y +
                                  render_area.extent.height))
      return false;

   if (depth_clear_value != ANV_HZ_FC_VAL)
      return false;

   /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
    * portion of a HiZ buffer.  Testing has revealed that Gfx8 only supports
    * returning 0.0f.  Gens prior to gfx8 do not support this feature at all.
    */
   if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
      return false;

   /* If we got here, then we can fast clear */
   return true;
}
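/* Read the aux-map entry exactly once through a volatile pointer so the
 * compiler cannot cache or reorder the load.
 */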
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

#if GFX_VER == 12
static void
anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect,
                      uint32_t base_level, uint32_t level_count,
                      uint32_t base_layer, uint32_t layer_count)
{
   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

   const struct anv_surface *surface = &image->planes[plane].primary_surface;
   uint64_t base_address =
      anv_address_physical(anv_image_address(image, &surface->memory_range));

   const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
   uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);

   /* We're about to live-update the AUX-TT.  We really don't want anyone else
    * trying to read it while we're doing this.  We could probably get away
    * with not having this stall in some cases if we were really careful but
    * it's better to play it safe.  Full stall the GPU.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "before update AUX-TT");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   for (uint32_t a = 0; a < layer_count; a++) {
      const uint32_t layer = base_layer + a;

      uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;

         uint32_t logical_array_layer, logical_z_offset_px;
         if (image->type == VK_IMAGE_TYPE_3D) {
            logical_array_layer = 0;

            /* If the given miplevel does not have this layer, then any higher
             * miplevels won't either because miplevels only get smaller the
             * higher the LOD.
             */
            assert(layer < image->extent.depth);
            if (layer >= anv_minify(image->extent.depth, level))
               break;
            logical_z_offset_px = layer;
         } else {
            assert(layer < image->array_size);
            logical_array_layer = layer;
            logical_z_offset_px = 0;
         }

         uint32_t slice_start_offset_B, slice_end_offset_B;
         isl_surf_get_image_range_B_tile(isl_surf, level,
                                         logical_array_layer,
                                         logical_z_offset_px,
                                         &slice_start_offset_B,
                                         &slice_end_offset_B);

         start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
         end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
      }

      /* Aux operates 64K at a time */
      start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
      end_offset_B = align_u64(end_offset_B, 64 * 1024);

      for (uint64_t offset = start_offset_B;
           offset < end_offset_B; offset += 64 * 1024) {
         uint64_t address = base_address + offset;

         uint64_t aux_entry_addr64, *aux_entry_map;
         aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
                                                 address, &aux_entry_addr64);

         assert(anv_use_softpin(cmd_buffer->device->physical));
         struct anv_address aux_entry_address = {
            .bo = NULL,
            .offset = aux_entry_addr64,
         };

         const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
         uint64_t new_aux_entry =
            (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;

         if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
            new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;

         mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                             "after update AUX-TT");
}
#endif /* GFX_VER == 12 */
/* Transitions a HiZ-enabled depth buffer from one layout to another.  Unless
 * the initial layout is undefined, the HiZ buffer and depth buffer will
 * represent the same data at the end of this operation.
 */
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        bool will_full_fast_clear)
{
   uint32_t depth_plane =
      anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT);
   if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

#if GFX_VER == 12
   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                            0, 1, base_layer, layer_count);
   }
#endif

   /* If will_full_fast_clear is set, the caller promises to fast-clear the
    * largest portion of the specified range as it can.  For depth images,
    * that means the entire image because we don't support multi-LOD HiZ.
    */
   assert(image->planes[0].primary_surface.isl.levels == 1);
   if (will_full_fast_clear)
      return;

   const enum isl_aux_state initial_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              initial_layout);
   const enum isl_aux_state final_state =
      anv_layout_to_aux_state(&cmd_buffer->device->info, image,
                              VK_IMAGE_ASPECT_DEPTH_BIT,
                              final_layout);

   const bool initial_depth_valid =
      isl_aux_state_has_valid_primary(initial_state);
   const bool initial_hiz_valid =
      isl_aux_state_has_valid_aux(initial_state);
   const bool final_needs_depth =
      isl_aux_state_has_valid_primary(final_state);
   const bool final_needs_hiz =
      isl_aux_state_has_valid_aux(final_state);

   /* Getting into the pass-through state for Depth is tricky and involves
    * both a resolve and an ambiguate.  We don't handle that state right now
    * as anv_layout_to_aux_state never returns it.  Resolve/ambiguate will
    * trigger depth clears which require tile cache flushes.
    */
   assert(final_state != ISL_AUX_STATE_PASS_THROUGH);

   if (final_needs_depth && !initial_depth_valid) {
      assert(initial_hiz_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_TILE_CACHE_FLUSH_BIT,
                                "after depth resolve");
   } else if (final_needs_hiz && !initial_hiz_valid) {
      assert(initial_depth_valid);
      anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
                       0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_TILE_CACHE_FLUSH_BIT,
                                "after hiz resolve");
   }
}

static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
          layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
}
/* Transitions a stencil buffer from one layout to another.  On gfx7, this
 * may involve copying to or from the shadow surface used for texturing; on
 * gfx12, it may require initializing the AUX-TT entries and the stencil
 * compression state.
 */
static void
transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
                          const struct anv_image *image,
                          uint32_t base_level, uint32_t level_count,
                          uint32_t base_layer, uint32_t layer_count,
                          VkImageLayout initial_layout,
                          VkImageLayout final_layout,
                          bool will_full_fast_clear)
{
#if GFX_VER == 7
   uint32_t plane = anv_image_aspect_to_plane(image->aspects,
                                              VK_IMAGE_ASPECT_STENCIL_BIT);

   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points.  Stencil writes are only allowed in the
    * following layouts:
    *
    *  - VK_IMAGE_LAYOUT_GENERAL
    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
    *
    * For general, we have no nice opportunity to transition so we do the copy
    * to the shadow unconditionally at the end of the subpass.  For transfer
    * destinations, we can update it as part of the transfer op.  For the other
    * layouts, we delay the copy until a transition into some other layout.
    */
   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       vk_image_layout_stencil_write_optimal(initial_layout) &&
       !vk_image_layout_stencil_write_optimal(final_layout)) {
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_STENCIL_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }
#elif GFX_VER == 12
   uint32_t plane = anv_image_aspect_to_plane(image->aspects,
                                              VK_IMAGE_ASPECT_STENCIL_BIT);
   if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
      return;

   if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
        initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
       cmd_buffer->device->physical->has_implicit_ccs &&
       cmd_buffer->device->info.has_aux_map) {
      anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                            base_level, level_count, base_layer, layer_count);

      /* If will_full_fast_clear is set, the caller promises to fast-clear the
       * largest portion of the specified range as it can.
       */
      if (will_full_fast_clear)
         return;

      for (uint32_t l = 0; l < level_count; l++) {
         const uint32_t level = base_level + l;
         const VkRect2D clear_rect = {
            .offset.x = 0,
            .offset.y = 0,
            .extent.width = anv_minify(image->extent.width, level),
            .extent.height = anv_minify(image->extent.height, level),
         };

         uint32_t aux_layers =
            anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
         uint32_t level_layer_count =
            MIN2(layer_count, aux_layers - base_layer);

         /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
          * Enable:
          *
          *    "When enabled, Stencil Buffer needs to be initialized via
          *    stencil clear (HZ_OP) before any renderpass."
          */
         anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
                             level, base_layer, level_layer_count,
                             clear_rect, 0 /* Stencil clear value */);
      }
   }
#endif
}
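/* MMIO offsets of the command streamer's MI_PREDICATE source and result
 * registers.  The predicated-resolve helpers below load SRC0/SRC1 with MI
 * stores and then emit MI_PREDICATE to compare them.
 */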
#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

static void
set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
                         VkImageAspectFlagBits aspect,
                         uint32_t level,
                         uint32_t base_layer, uint32_t layer_count,
                         bool compressed)
{
   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

   /* We only have compression tracking for CCS_E */
   if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
      return;

   for (uint32_t a = 0; a < layer_count; a++) {
      uint32_t layer = base_layer + a;
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
                                                            image, aspect,
                                                            level, layer);
         sdi.ImmediateData = compressed ? UINT32_MAX : 0;
      }
   }
}

static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
                           const struct anv_image *image,
                           VkImageAspectFlagBits aspect,
                           enum anv_fast_clear_type fast_clear)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                       image, aspect);
      sdi.ImmediateData = fast_clear;
   }

   /* Whenever we have fast-clear, we consider that slice to be compressed.
    * This makes building predicates much easier.
    */
   if (fast_clear != ANV_FAST_CLEAR_NONE)
      set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
}

/* This is only really practical on haswell and above because it requires
 * MI math in order to get it correct.
 */
#if GFX_VERx10 >= 75
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                  const struct anv_image *image,
                                  VkImageAspectFlagBits aspect,
                                  uint32_t level, uint32_t array_layer,
                                  enum isl_aux_op resolve_op,
                                  enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   const struct mi_value fast_clear_type =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
      /* In this case, we're doing a full resolve which means we want the
       * resolve to happen if any compression (including fast-clears) is
       * present.
       *
       * In order to simplify the logic a bit, we make the assumption that,
       * if the first slice has been fast-cleared, it is also marked as
       * compressed.  See also set_image_fast_clear_state.
       */
      const struct mi_value compression_state =
         mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
                                                       image, aspect,
                                                       level, array_layer));
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
      mi_store(&b, compression_state, mi_imm(0));

      if (level == 0 && array_layer == 0) {
         /* If the predicate is true, we want to write 0 to the fast clear type
          * and, if it's false, leave it alone.  We can do this by writing
          *
          *    clear_type = clear_type & ~predicate;
          */
         struct mi_value new_fast_clear_type =
            mi_iand(&b, fast_clear_type,
                        mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
         mi_store(&b, fast_clear_type, new_fast_clear_type);
      }
   } else if (level == 0 && array_layer == 0) {
      /* In this case, we are doing a partial resolve to get rid of fast-clear
       * colors.  We don't care about the compression state but we do care
       * about how much fast clear is allowed by the final layout.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);

      /* We need to compute (fast_clear_supported < image->fast_clear) */
      struct mi_value pred =
         mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
      mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));

      /* If the predicate is true, we want to write 0 to the fast clear type
       * and, if it's false, leave it alone.  We can do this by writing
       *
       *    clear_type = clear_type & ~predicate;
       */
      struct mi_value new_fast_clear_type =
         mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
      mi_store(&b, fast_clear_type, new_fast_clear_type);
   } else {
      /* In this case, we're trying to do a partial resolve on a slice that
       * doesn't have clear color.  There's nothing to do.
       */
      assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
      return;
   }

   /* Set src1 to 0 and use a != condition */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VERx10 >= 75 */

#if GFX_VER <= 8
static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
                                 const struct anv_image *image,
                                 VkImageAspectFlagBits aspect,
                                 uint32_t level, uint32_t array_layer,
                                 enum isl_aux_op resolve_op,
                                 enum anv_fast_clear_type fast_clear_supported)
{
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value fast_clear_type_mem =
      mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
                                                  image, aspect));

   /* This only works for partial resolves and only when the clear color is
    * all or nothing.  On the upside, this emits less command streamer code
    * and works on Ivybridge and Bay Trail.
    */
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
   assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);

   /* We don't support fast clears on anything other than the first slice. */
   if (level > 0 || array_layer > 0)
      return;

   /* On gfx8, we don't have a concept of default clear colors because we
    * can't sample from CCS surfaces.  It's enough to just load the fast clear
    * state into the predicate register.
    */
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
   mi_store(&b, fast_clear_type_mem, mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
}
#endif /* GFX_VER <= 8 */

static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t level, uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   const uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

#if GFX_VER >= 9
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, level, array_layer,
                                     resolve_op, fast_clear_supported);
#else /* GFX_VER <= 8 */
   anv_cmd_simple_resolve_predicate(cmd_buffer, image,
                                    aspect, level, array_layer,
                                    resolve_op, fast_clear_supported);
#endif

   /* CCS_D only supports full resolves and BLORP will assert on us if we try
    * to do a partial resolve on a CCS_D surface.
    */
   if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
       image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
                    level, array_layer, 1, resolve_op, NULL, true);
}

static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
                               const struct anv_image *image,
                               enum isl_format format,
                               struct isl_swizzle swizzle,
                               VkImageAspectFlagBits aspect,
                               uint32_t array_layer,
                               enum isl_aux_op resolve_op,
                               enum anv_fast_clear_type fast_clear_supported)
{
   assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
   assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);

#if GFX_VERx10 >= 75
   anv_cmd_compute_resolve_predicate(cmd_buffer, image,
                                     aspect, 0, array_layer,
                                     resolve_op, fast_clear_supported);

   anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
                    array_layer, 1, resolve_op, NULL, true);
#else
   unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
#endif
}
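/* Record that the given range of the image has been written through the
 * given aux usage so that the per-slice compression tracking used by the
 * predicated resolves above stays accurate.
 */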
void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
                                    const struct anv_image *image,
                                    VkImageAspectFlagBits aspect,
                                    enum isl_aux_usage aux_usage,
                                    uint32_t level,
                                    uint32_t base_layer,
                                    uint32_t layer_count)
{
   /* The aspect must be exactly one of the image aspects. */
   assert(util_bitcount(aspect) == 1 && (aspect & image->aspects));

   /* The only compression types with more than just fast-clears are MCS,
    * CCS_E, and HiZ.  With HiZ we just trust the layout and don't actually
    * track the current fast-clear and compression state.  This leaves us
    * with just MCS and CCS_E.
    */
   if (aux_usage != ISL_AUX_USAGE_CCS_E &&
       aux_usage != ISL_AUX_USAGE_MCS)
      return;

   set_image_compressed_bit(cmd_buffer, image, aspect,
                            level, base_layer, layer_count, true);
}

static void
init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
                      const struct anv_image *image,
                      VkImageAspectFlagBits aspect)
{
   assert(cmd_buffer && image);
   assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   set_image_fast_clear_state(cmd_buffer, image, aspect,
                              ANV_FAST_CLEAR_NONE);

   /* Initialize the struct fields that are accessed for fast-clears so that
    * the HW restrictions on the field values are satisfied.
    */
   struct anv_address addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);

   if (GFX_VER >= 9) {
      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
      const unsigned num_dwords = GFX_VER >= 10 ?
                                  isl_dev->ss.clear_color_state_size / 4 :
                                  isl_dev->ss.clear_value_size / 4;
      for (unsigned i = 0; i < num_dwords; i++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = addr;
            sdi.Address.offset += i * 4;
            sdi.ImmediateData = 0;
         }
      }
   } else {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
         sdi.Address = addr;
         if (GFX_VERx10 >= 75) {
            /* Pre-SKL, the dword containing the clear values also contains
             * other fields, so we need to initialize those fields to match the
             * values that would be in a color attachment.
             */
            sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
                                ISL_CHANNEL_SELECT_GREEN << 22 |
                                ISL_CHANNEL_SELECT_BLUE  << 19 |
                                ISL_CHANNEL_SELECT_ALPHA << 16;
         } else if (GFX_VER == 7) {
            /* On IVB, the dword containing the clear values also contains
             * other fields that must be zero or can be zero.
             */
            sdi.ImmediateData = 0;
         }
      }
   }
}

/* Copy the fast-clear value dword(s) between a surface state object and an
 * image's fast clear state buffer.
 */
static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_state surface_state,
                             const struct anv_image *image,
                             VkImageAspectFlagBits aspect,
                             bool copy_from_surface_state)
{
   assert(cmd_buffer && image);
   assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);

   struct anv_address ss_clear_addr = {
      .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
      .offset = surface_state.offset +
                cmd_buffer->device->isl_dev.ss.clear_value_offset,
   };
   const struct anv_address entry_addr =
      anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
   unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;

#if GFX_VER == 7
   /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
    * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
    * in-flight when they are issued even if the memory touched is not
    * currently active for rendering.  The weird bit is that it is not the
    * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
    * rendering hangs such that the next stalling command after the
    * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
    *
    * It is unclear exactly why this hang occurs.  Both MI commands come with
    * warnings about the 3D pipeline but that doesn't seem to fully explain
    * it.  My (Jason's) best theory is that it has something to do with the
    * fact that we're using a GPU state register as our temporary and that
    * something with reading/writing it is causing problems.
    *
    * In order to work around this issue, we emit a PIPE_CONTROL with the
    * command streamer stall bit set.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "after copy_fast_clear_dwords. Avoid potential hang");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (copy_from_surface_state) {
      mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
   } else {
      mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);

      /* Updating a surface state object may require that the state cache be
       * invalidated.  From the SKL PRM, Shared Functions -> State -> State
       * Caching:
       *
       *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
       *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
       *    modified [...], the L1 state cache must be invalidated to ensure
       *    the new surface or sampler state is fetched from system memory.
       *
       * In testing, SKL doesn't actually seem to need this, but HSW does.
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
                                "after copy_fast_clear_dwords surface state update");
   }
}

/**
 * @brief Transitions a color buffer from one layout to another.
 *
 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
 * more information.
 *
 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
 *                    this represents the maximum layers to transition at each
 *                    specified miplevel.
 */
static void
transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
                        const struct anv_image *image,
                        VkImageAspectFlagBits aspect,
                        const uint32_t base_level, uint32_t level_count,
                        uint32_t base_layer, uint32_t layer_count,
                        VkImageLayout initial_layout,
                        VkImageLayout final_layout,
                        uint64_t src_queue_family,
                        uint64_t dst_queue_family,
                        bool will_full_fast_clear)
{
   struct anv_device *device = cmd_buffer->device;
   const struct intel_device_info *devinfo = &device->info;
   /* Validate the inputs. */
   assert(cmd_buffer);
   assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
   /* These values aren't supported for simplicity's sake. */
   assert(level_count != VK_REMAINING_MIP_LEVELS &&
          layer_count != VK_REMAINING_ARRAY_LAYERS);
   /* Ensure the subresource range is valid. */
   UNUSED uint64_t last_level_num = base_level + level_count;
   const uint32_t max_depth = anv_minify(image->extent.depth, base_level);
   UNUSED const uint32_t image_layers = MAX2(image->array_size, max_depth);
   assert((uint64_t)base_layer + layer_count <= image_layers);
   assert(last_level_num <= image->levels);
   /* The spec disallows these final layouts. */
   assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
          final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED);
   const struct isl_drm_modifier_info *isl_mod_info =
      image->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
      ? isl_drm_modifier_get_info(image->drm_format_mod)
      : NULL;

   const bool src_queue_external =
      src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   const bool dst_queue_external =
      dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
      dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;

   /* Simultaneous acquire and release on external queues is illegal. */
   assert(!src_queue_external || !dst_queue_external);

   /* Ownership transition on an external queue requires special action if the
    * image has a DRM format modifier because we store image data in
    * a driver-private bo which is inaccessible to the external queue.
    */
   const bool mod_acquire =
      src_queue_external &&
      image->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;

   const bool mod_release =
      dst_queue_external &&
      image->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;

   if (initial_layout == final_layout &&
       !mod_acquire && !mod_release) {
      /* No work is needed. */
      return;
   }

   uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);

   if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
       final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
      /* This surface is a linear compressed image with a tiled shadow surface
       * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
       * we need to ensure the shadow copy is up-to-date.
       */
      assert(image->tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
      assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
      assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
      assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
      assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
      assert(plane == 0);
      anv_image_copy_to_shadow(cmd_buffer, image,
                               VK_IMAGE_ASPECT_COLOR_BIT,
                               base_level, level_count,
                               base_layer, layer_count);
   }

   if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
      return;

   assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);

   /* The following layouts are equivalent for non-linear images. */
   const bool initial_layout_undefined =
      initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
      initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;

   bool must_init_fast_clear_state = false;
   bool must_init_aux_surface = false;

   if (initial_layout_undefined) {
      /* The subresource may have been aliased and populated with arbitrary
       * data.
       */
      must_init_fast_clear_state = true;
      must_init_aux_surface = true;
   } else if (mod_acquire) {
      /* The fast clear state lives in a driver-private bo, and therefore the
       * external/foreign queue is unaware of it.
       *
       * If this is the first time we are accessing the image, then the fast
       * clear state is uninitialized.
       *
       * If this is NOT the first time we are accessing the image, then the fast
       * clear state may still be valid and correct due to the resolve during
       * our most recent ownership release.  However, we do not track the aux
       * state with MI stores, and therefore must assume the worst-case: that
       * this is the first time we are accessing the image.
       */
      assert(image->planes[plane].fast_clear_memory_range.binding ==
             ANV_IMAGE_MEMORY_BINDING_PRIVATE);
      must_init_fast_clear_state = true;

      if (image->planes[plane].aux_surface.memory_range.binding ==
          ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
         assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);

         /* The aux surface, like the fast clear state, lives in
          * a driver-private bo.  We must initialize the aux surface for the
          * same reasons we must initialize the fast clear state.
          */
         must_init_aux_surface = true;
      } else {
         assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);

         /* The aux surface, unlike the fast clear state, lives in
          * application-visible VkDeviceMemory and is shared with the
          * external/foreign queue.  Therefore, when we acquire ownership of the
          * image with a defined VkImageLayout, the aux surface is valid and has
          * the aux state required by the modifier.
          */
         must_init_aux_surface = false;
      }
   }

#if GFX_VER == 12
   /* We do not yet support modifiers with aux on gen12. */
   assert(image->tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);

   if (initial_layout_undefined) {
      if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
         anv_image_init_aux_tt(cmd_buffer, image, aspect,
                               base_level, level_count,
                               base_layer, layer_count);
      }
   }
#else
   assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
#endif

   if (must_init_fast_clear_state) {
      if (base_level == 0 && base_layer == 0)
         init_fast_clear_color(cmd_buffer, image, aspect);
   }

   if (must_init_aux_surface) {
      assert(must_init_fast_clear_state);

      /* Initialize the aux buffers to enable correct rendering.  In order to
       * ensure that things such as storage images work correctly, aux buffers
       * need to be initialized to valid data.
       *
       * Having an aux buffer with invalid data is a problem for two reasons:
       *
       *  1) Having an invalid value in the buffer can confuse the hardware.
       *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
       *     invalid and leads to the hardware doing strange things.  It
       *     doesn't hang as far as we can tell but rendering corruption can
       *     occur.
       *
       *  2) If this transition is into the GENERAL layout and we then use the
       *     image as a storage image, then we must have the aux buffer in the
       *     pass-through state so that, if we then go to texture from the
       *     image, we get the results of our storage image writes and not the
       *     fast clear color or other random data.
       *
       * For CCS both of the problems above are real demonstrable issues.  In
       * that case, the only thing we can do is to perform an ambiguate to
       * transition the aux surface into the pass-through state.
       *
       * For MCS, (2) is never an issue because we don't support multisampled
       * storage images.  In theory, issue (1) is a problem with MCS but we've
       * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
       * theory, be interpreted as something but we don't know that all bit
       * patterns are actually valid.  For 2x and 8x, you could easily end up
       * with the MCS referring to an invalid plane because not all bits of
       * the MCS value are actually used.  Even though we've never seen issues
       * in the wild, it's best to play it safe and initialize the MCS.  We
       * can use a fast-clear for MCS because we only ever touch from render
       * and texture (no image load store).
       */
      if (image->samples == 1) {
         for (uint32_t l = 0; l < level_count; l++) {
            const uint32_t level = base_level + l;

            uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
            if (base_layer >= aux_layers)
               break; /* We will only get fewer layers as level increases */
            uint32_t level_layer_count =
               MIN2(layer_count, aux_layers - base_layer);

            /* If will_full_fast_clear is set, the caller promises to
             * fast-clear the largest portion of the specified range as it can.
             * For color images, that means only the first LOD and array slice.
             */
            if (level == 0 && base_layer == 0 && will_full_fast_clear) {
               base_layer++;
               level_layer_count--;
               if (level_layer_count == 0)
                  continue;
            }

            anv_image_ccs_op(cmd_buffer, image,
                             image->planes[plane].primary_surface.isl.format,
                             ISL_SWIZZLE_IDENTITY,
                             aspect, level, base_layer, level_layer_count,
                             ISL_AUX_OP_AMBIGUATE, NULL, false);

            if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
               set_image_compressed_bit(cmd_buffer, image, aspect,
                                        level, base_layer, level_layer_count,
                                        false);
            }
         }
      } else {
         if (image->samples == 4 || image->samples == 16) {
            anv_perf_warn(cmd_buffer->device, &image->base,
                          "Doing a potentially unnecessary fast-clear to "
                          "define an MCS buffer.");
         }

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can.
          */
         if (will_full_fast_clear)
            return;

         assert(base_level == 0 && level_count == 1);
         anv_image_mcs_op(cmd_buffer, image,
                          image->planes[plane].primary_surface.isl.format,
                          ISL_SWIZZLE_IDENTITY,
                          aspect, base_layer, layer_count,
                          ISL_AUX_OP_FAST_CLEAR, NULL, false);
      }
      return;
   }

   enum isl_aux_usage initial_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
   enum isl_aux_usage final_aux_usage =
      anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);

   /* We must override the anv_layout_to_* functions because they are unaware of
    * acquire/release direction.
    */
   if (mod_acquire) {
      initial_aux_usage = isl_mod_info->aux_usage;
   } else if (mod_release) {
      final_aux_usage = isl_mod_info->aux_usage;
   }

   /* The current code assumes that there is no mixing of CCS_E and CCS_D.
    * We can handle transitions between CCS_D/E to and from NONE.  What we
    * don't yet handle is switching between CCS_E and CCS_D within a given
    * image.  Doing so in a performant way requires more detailed aux state
    * tracking such as what is done in i965.  For now, just assume that we
    * only have one type of compression.
    */
   assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
          final_aux_usage == ISL_AUX_USAGE_NONE ||
          initial_aux_usage == final_aux_usage);

   /* If initial aux usage is NONE, there is nothing to resolve */
   if (initial_aux_usage == ISL_AUX_USAGE_NONE)
      return;

   enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;

   /* If the initial layout supports more fast clear than the final layout
    * then we need at least a partial resolve.
    */
   const enum anv_fast_clear_type initial_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
   const enum anv_fast_clear_type final_fast_clear =
      anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
   if (final_fast_clear < initial_fast_clear)
      resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;

   if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
       final_aux_usage != ISL_AUX_USAGE_CCS_E)
      resolve_op = ISL_AUX_OP_FULL_RESOLVE;

   if (resolve_op == ISL_AUX_OP_NONE)
      return;

   /* Perform a resolve to synchronize data between the main and aux buffer.
    * Before we begin, we must satisfy the cache flushing requirement specified
    * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
    *
    *    Any transition from any value in {Clear, Render, Resolve} to a
    *    different value in {Clear, Render, Resolve} requires end of pipe
    *    synchronization.
    *
    * We perform a flush of the write cache before and after the clear and
    * resolve operations to meet this requirement.
    *
    * Unlike other drawing, fast clear operations are not properly
    * synchronized.  The first PIPE_CONTROL here likely ensures that the
    * contents of the previous render or clear hit the render target before we
    * resolve and the second likely ensures that the resolve is complete before
    * we do any more rendering or clearing.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");

   for (uint32_t l = 0; l < level_count; l++) {
      uint32_t level = base_level + l;

      uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
      if (base_layer >= aux_layers)
         break; /* We will only get fewer layers as level increases */
      uint32_t level_layer_count =
         MIN2(layer_count, aux_layers - base_layer);

      for (uint32_t a = 0; a < level_layer_count; a++) {
         uint32_t array_layer = base_layer + a;

         /* If will_full_fast_clear is set, the caller promises to fast-clear
          * the largest portion of the specified range as it can.  For color
          * images, that means only the first LOD and array slice.
          */
         if (level == 0 && array_layer == 0 && will_full_fast_clear)
            continue;

         if (image->samples == 1) {
            anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, level, array_layer, resolve_op,
                                           final_fast_clear);
         } else {
            /* We only support fast-clear on the first layer so partial
             * resolves should not be used on other layers as they will use
             * the clear color stored in memory that is only valid for layer0.
             */
            if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
                array_layer != 0)
               continue;

            anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
                                           image->planes[plane].primary_surface.isl.format,
                                           ISL_SWIZZLE_IDENTITY,
                                           aspect, array_layer, resolve_op,
                                           final_fast_clear);
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after transition RT");
}
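/* Resolve the image view for each render pass attachment and record the
 * pending clear/load aspects, clear values, and whether a fast (CCS/HiZ)
 * clear can be used, into anv_cmd_state::attachments.
 */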
VK_IMAGE_ASPECT_DEPTH_BIT;1590}1591}1592if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {1593if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {1594clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;1595} else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {1596load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;1597}1598}1599}16001601att_state->current_layout = pass_att->initial_layout;1602att_state->current_stencil_layout = pass_att->stencil_initial_layout;1603att_state->pending_clear_aspects = clear_aspects;1604att_state->pending_load_aspects = load_aspects;1605if (clear_aspects)1606att_state->clear_value = begin->pClearValues[i];16071608struct anv_image_view *iview = state->attachments[i].image_view;1609anv_assert(iview->vk_format == pass_att->format);16101611const uint32_t num_layers = iview->planes[0].isl.array_len;1612att_state->pending_clear_views = (1 << num_layers) - 1;16131614/* This will be initialized after the first subpass transition. */1615att_state->aux_usage = ISL_AUX_USAGE_NONE;16161617att_state->fast_clear = false;1618if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {1619assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);1620att_state->fast_clear =1621anv_can_fast_clear_color_view(cmd_buffer->device, iview,1622pass_att->first_subpass_layout,1623vk_to_isl_color(att_state->clear_value.color),1624framebuffer->layers,1625begin->renderArea);1626} else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |1627VK_IMAGE_ASPECT_STENCIL_BIT)) {1628att_state->fast_clear =1629anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,1630pass_att->first_subpass_layout,1631clear_aspects,1632att_state->clear_value.depthStencil.depth,1633begin->renderArea);1634}1635}1636}16371638return VK_SUCCESS;1639}16401641/**1642* Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.1643*/1644static VkResult1645genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,1646const struct anv_render_pass *pass,1647const struct anv_subpass *subpass)1648{1649const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;1650struct anv_cmd_state *state = &cmd_buffer->state;16511652/* Reserve one for the NULL state. 
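    * As a rough sketch of the layout this function builds (illustrative
    * only, using the ss_stride computed below):
    *
    *    attachment_states + 0 * ss_stride : null surface state
    *    attachment_states + 1 * ss_stride : first color/input attachment
    *    attachment_states + 2 * ss_stride : second color/input attachment
    *    ...
    *
    * where ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align).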
*/1653unsigned num_states = 1;1654for (uint32_t i = 0; i < subpass->attachment_count; i++) {1655uint32_t att = subpass->attachments[i].attachment;1656if (att == VK_ATTACHMENT_UNUSED)1657continue;16581659assert(att < pass->attachment_count);1660if (!vk_format_is_color(pass->attachments[att].format))1661continue;16621663const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;1664assert(util_bitcount(att_usage) == 1);16651666if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||1667att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)1668num_states++;1669}16701671const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);1672state->attachment_states =1673anv_state_stream_alloc(&cmd_buffer->surface_state_stream,1674num_states * ss_stride, isl_dev->ss.align);1675if (state->attachment_states.map == NULL) {1676return anv_batch_set_error(&cmd_buffer->batch,1677VK_ERROR_OUT_OF_DEVICE_MEMORY);1678}16791680struct anv_state next_state = state->attachment_states;1681next_state.alloc_size = isl_dev->ss.size;16821683state->null_surface_state = next_state;1684next_state.offset += ss_stride;1685next_state.map += ss_stride;16861687for (uint32_t i = 0; i < subpass->attachment_count; i++) {1688uint32_t att = subpass->attachments[i].attachment;1689if (att == VK_ATTACHMENT_UNUSED)1690continue;16911692assert(att < pass->attachment_count);1693if (!vk_format_is_color(pass->attachments[att].format))1694continue;16951696const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;1697assert(util_bitcount(att_usage) == 1);16981699if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)1700state->attachments[att].color.state = next_state;1701else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)1702state->attachments[att].input.state = next_state;1703else1704continue;17051706state->attachments[att].color.state = next_state;1707next_state.offset += ss_stride;1708next_state.map += ss_stride;1709}17101711assert(next_state.offset == state->attachment_states.offset +1712state->attachment_states.alloc_size);17131714return VK_SUCCESS;1715}17161717VkResult1718genX(BeginCommandBuffer)(1719VkCommandBuffer commandBuffer,1720const VkCommandBufferBeginInfo* pBeginInfo)1721{1722ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);17231724/* If this is the first vkBeginCommandBuffer, we must *initialize* the1725* command buffer's state. Otherwise, we must *reset* its state. In both1726* cases we reset it.1727*1728* From the Vulkan 1.0 spec:1729*1730* If a command buffer is in the executable state and the command buffer1731* was allocated from a command pool with the1732* VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then1733* vkBeginCommandBuffer implicitly resets the command buffer, behaving1734* as if vkResetCommandBuffer had been called with1735* VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts1736* the command buffer in the recording state.1737*/1738anv_cmd_buffer_reset(cmd_buffer);17391740cmd_buffer->usage_flags = pBeginInfo->flags;17411742/* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for1743* primary level command buffers.1744*1745* From the Vulkan 1.0 spec:1746*1747* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a1748* secondary command buffer is considered to be entirely inside a render1749* pass. 
If this is a primary command buffer, then this bit is ignored.
    */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;

   genX(cmd_buffer_emit_state_base_address)(cmd_buffer);

   /* We sometimes store vertex data in the dynamic state buffer for blorp
    * operations and our dynamic state stream may re-use data from previous
    * command buffers. In order to prevent stale cache data, we flush the VF
    * cache. We could do this on every blorp call but that's not really
    * needed as all of the data will get written by the CPU prior to the GPU
    * executing anything. The chances are fairly high that they will use
    * blorp at least once per primary command buffer so it shouldn't be
    * wasted.
    *
    * There is also a workaround on gfx8 which requires us to invalidate the
    * VF cache occasionally. It's easier if we can assume we start with a
    * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                             "new cmd buffer");

   /* Re-emit the aux table register in every command buffer. This way we're
    * ensured that we have the table even if this command buffer doesn't
    * initialize any images.
    */
   if (cmd_buffer->device->info.has_aux_map) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
                                "new cmd buffer with aux-tt");
   }

   /* We send an "Indirect State Pointers Disable" packet at
    * EndCommandBuffer, so all push constant packets are ignored during a
    * context restore. Documentation says after that command, we need to
    * emit push constants again before any rendering operation. So we
    * flag them dirty here to make sure they get emitted.
    */
   cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;

   VkResult result = VK_SUCCESS;
   if (cmd_buffer->usage_flags &
       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
      assert(pBeginInfo->pInheritanceInfo);
      ANV_FROM_HANDLE(anv_render_pass, pass,
                      pBeginInfo->pInheritanceInfo->renderPass);
      struct anv_subpass *subpass =
         &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
                      pBeginInfo->pInheritanceInfo->framebuffer);

      cmd_buffer->state.pass = pass;
      cmd_buffer->state.subpass = subpass;

      /* This is optional in the inheritance info. */
      cmd_buffer->state.framebuffer = framebuffer;

      result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                                  framebuffer, NULL);
      if (result != VK_SUCCESS)
         return result;

      result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
                                                      subpass);
      if (result != VK_SUCCESS)
         return result;

      /* Record that HiZ is enabled if we can.
*/1819if (cmd_buffer->state.framebuffer) {1820const struct anv_image_view * const iview =1821anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);18221823if (iview) {1824VkImageLayout layout =1825cmd_buffer->state.subpass->depth_stencil_attachment->layout;18261827enum isl_aux_usage aux_usage =1828anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,1829VK_IMAGE_ASPECT_DEPTH_BIT,1830VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,1831layout);18321833cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);1834}1835}18361837cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;1838}18391840#if GFX_VERx10 >= 751841if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {1842const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =1843vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);18441845/* If secondary buffer supports conditional rendering1846* we should emit commands as if conditional rendering is enabled.1847*/1848cmd_buffer->state.conditional_render_enabled =1849conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;1850}1851#endif18521853return result;1854}18551856/* From the PRM, Volume 2a:1857*1858* "Indirect State Pointers Disable1859*1860* At the completion of the post-sync operation associated with this pipe1861* control packet, the indirect state pointers in the hardware are1862* considered invalid; the indirect pointers are not saved in the context.1863* If any new indirect state commands are executed in the command stream1864* while the pipe control is pending, the new indirect state commands are1865* preserved.1866*1867* [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context1868* restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant1869* commands are only considered as Indirect State Pointers. Once ISP is1870* issued in a context, SW must initialize by programming push constant1871* commands for all the shaders (at least to zero length) before attempting1872* any rendering operation for the same context."1873*1874* 3DSTATE_CONSTANT_* packets are restored during a context restore,1875* even though they point to a BO that has been already unreferenced at1876* the end of the previous batch buffer. This has been fine so far since1877* we are protected by these scratch page (every address not covered by1878* a BO should be pointing to the scratch page). But on CNL, it is1879* causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*1880* instruction.1881*1882* The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the1883* hardware to ignore previous 3DSTATE_CONSTANT_* packets during a1884* context restore, so the mentioned hang doesn't happen. However,1885* software must program push constant commands for all stages prior to1886* rendering anything. 
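 * In anv, re-initializing is cheap; marking the stages dirty is enough to get
 * 3DSTATE_CONSTANT_* re-emitted on the next draw, e.g. (the line used in
 * BeginCommandBuffer):
 *
 *    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
 *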
So we flag them dirty in BeginCommandBuffer.1887*1888* Finally, we also make sure to stall at pixel scoreboard to make sure the1889* constants have been loaded into the EUs prior to disable the push constants1890* so that it doesn't hang a previous 3DPRIMITIVE.1891*/1892static void1893emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)1894{1895anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1896pc.StallAtPixelScoreboard = true;1897pc.CommandStreamerStallEnable = true;1898anv_debug_dump_pc(pc);1899}1900anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {1901pc.IndirectStatePointersDisable = true;1902pc.CommandStreamerStallEnable = true;1903anv_debug_dump_pc(pc);1904}1905}19061907VkResult1908genX(EndCommandBuffer)(1909VkCommandBuffer commandBuffer)1910{1911ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);19121913if (anv_batch_has_error(&cmd_buffer->batch))1914return cmd_buffer->batch.status;19151916anv_measure_endcommandbuffer(cmd_buffer);19171918/* We want every command buffer to start with the PMA fix in a known state,1919* so we disable it at the end of the command buffer.1920*/1921genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);19221923genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);19241925emit_isp_disable(cmd_buffer);19261927anv_cmd_buffer_end_batch_buffer(cmd_buffer);19281929return VK_SUCCESS;1930}19311932void1933genX(CmdExecuteCommands)(1934VkCommandBuffer commandBuffer,1935uint32_t commandBufferCount,1936const VkCommandBuffer* pCmdBuffers)1937{1938ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);19391940assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);19411942if (anv_batch_has_error(&primary->batch))1943return;19441945/* The secondary command buffers will assume that the PMA fix is disabled1946* when they begin executing. Make sure this is true.1947*/1948genX(cmd_buffer_enable_pma_fix)(primary, false);19491950/* The secondary command buffer doesn't know which textures etc. have been1951* flushed prior to their execution. 
Apply those flushes now.1952*/1953genX(cmd_buffer_apply_pipe_flushes)(primary);19541955for (uint32_t i = 0; i < commandBufferCount; i++) {1956ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);19571958assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);1959assert(!anv_batch_has_error(&secondary->batch));19601961#if GFX_VERx10 >= 751962if (secondary->state.conditional_render_enabled) {1963if (!primary->state.conditional_render_enabled) {1964/* Secondary buffer is constructed as if it will be executed1965* with conditional rendering, we should satisfy this dependency1966* regardless of conditional rendering being enabled in primary.1967*/1968struct mi_builder b;1969mi_builder_init(&b, &primary->device->info, &primary->batch);1970mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),1971mi_imm(UINT64_MAX));1972}1973}1974#endif19751976if (secondary->usage_flags &1977VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {1978/* If we're continuing a render pass from the primary, we need to1979* copy the surface states for the current subpass into the storage1980* we allocated for them in BeginCommandBuffer.1981*/1982struct anv_bo *ss_bo =1983primary->device->surface_state_pool.block_pool.bo;1984struct anv_state src_state = primary->state.attachment_states;1985struct anv_state dst_state = secondary->state.attachment_states;1986assert(src_state.alloc_size == dst_state.alloc_size);19871988genX(cmd_buffer_so_memcpy)(primary,1989(struct anv_address) {1990.bo = ss_bo,1991.offset = dst_state.offset,1992},1993(struct anv_address) {1994.bo = ss_bo,1995.offset = src_state.offset,1996},1997src_state.alloc_size);1998}19992000anv_cmd_buffer_add_secondary(primary, secondary);20012002assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||2003secondary->perf_query_pool == primary->perf_query_pool);2004if (secondary->perf_query_pool)2005primary->perf_query_pool = secondary->perf_query_pool;2006}20072008/* The secondary isn't counted in our VF cache tracking so we need to2009* invalidate the whole thing.2010*/2011if (GFX_VER >= 8 && GFX_VER <= 9) {2012anv_add_pending_pipe_bits(primary,2013ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,2014"Secondary cmd buffer not tracked in VF cache");2015}20162017/* The secondary may have selected a different pipeline (3D or compute) and2018* may have changed the current L3$ configuration. Reset our tracking2019* variables to invalid values to ensure that we re-emit these in the case2020* where we do any draws or compute dispatches from the primary after the2021* secondary has returned.2022*/2023primary->state.current_pipeline = UINT32_MAX;2024primary->state.current_l3_config = NULL;2025primary->state.current_hash_scale = 0;20262027/* Each of the secondary command buffers will use its own state base2028* address. 
We need to re-emit state base address for the primary after
    * all of the secondaries are done.
    *
    * TODO: Maybe we want to make this a dirty bit to avoid extra state base
    * address calls?
    */
   genX(cmd_buffer_emit_state_base_address)(primary);
}

/**
 * Program the hardware to use the specified L3 configuration.
 */
void
genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
                           const struct intel_l3_config *cfg)
{
   assert(cfg || GFX_VER >= 12);
   if (cfg == cmd_buffer->state.current_l3_config)
      return;

#if GFX_VER >= 11
   /* On Gfx11+ we use only one config, so verify it remains the same and skip
    * the stalling programming entirely.
    */
   assert(cfg == cmd_buffer->device->l3_config);
#else
   if (INTEL_DEBUG & DEBUG_L3) {
      mesa_logd("L3 config transition: ");
      intel_dump_l3_config(cfg, stderr);
   }

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DCFlushEnable = true;
      pc.PostSyncOperation = NoWrite;
      pc.CommandStreamerStallEnable = true;
      anv_debug_dump_pc(pc);
   }

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches. Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes.
This intentionally2078* doesn't implement the SKL+ hardware workaround suggesting to enable CS2079* stall on PIPE_CONTROLs with the texture cache invalidation bit set for2080* GPGPU workloads because the previous and subsequent PIPE_CONTROLs2081* already guarantee that there is no concurrent GPGPU kernel execution2082* (see SKL HSD 2132585).2083*/2084anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {2085pc.TextureCacheInvalidationEnable = true;2086pc.ConstantCacheInvalidationEnable = true;2087pc.InstructionCacheInvalidateEnable = true;2088pc.StateCacheInvalidationEnable = true;2089pc.PostSyncOperation = NoWrite;2090anv_debug_dump_pc(pc);2091}20922093/* Now send a third stalling flush to make sure that invalidation is2094* complete when the L3 configuration registers are modified.2095*/2096anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {2097pc.DCFlushEnable = true;2098pc.PostSyncOperation = NoWrite;2099pc.CommandStreamerStallEnable = true;2100anv_debug_dump_pc(pc);2101}21022103genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);2104#endif /* GFX_VER >= 11 */2105cmd_buffer->state.current_l3_config = cfg;2106}21072108void2109genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)2110{2111UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;2112enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;21132114if (unlikely(cmd_buffer->device->physical->always_flush_cache))2115bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;2116else if (bits == 0)2117return;21182119/*2120* From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":2121*2122* Write synchronization is a special case of end-of-pipe2123* synchronization that requires that the render cache and/or depth2124* related caches are flushed to memory, where the data will become2125* globally visible. This type of synchronization is required prior to2126* SW (CPU) actually reading the result data from memory, or initiating2127* an operation that will use as a read surface (such as a texture2128* surface) a previous render target and/or depth/stencil buffer2129*2130*2131* From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":2132*2133* Exercising the write cache flush bits (Render Target Cache Flush2134* Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only2135* ensures the write caches are flushed and doesn't guarantee the data2136* is globally visible.2137*2138* SW can track the completion of the end-of-pipe-synchronization by2139* using "Notify Enable" and "PostSync Operation - Write Immediate2140* Data" in the PIPE_CONTROL command.2141*2142* In other words, flushes are pipelined while invalidations are handled2143* immediately. 
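    * As a concrete illustration (a sketch of what the code below ends up
    * emitting), a render-target flush combined with a texture-cache
    * invalidate becomes two packets:
    *
    *    PIPE_CONTROL (RT cache flush, CS stall, post-sync write)  <- end of pipe
    *    PIPE_CONTROL (texture cache invalidate)
    *
    * rather than one PIPE_CONTROL carrying both bits.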
Therefore, if we're flushing anything then we need to2144* schedule an end-of-pipe sync before any invalidations can happen.2145*/2146if (bits & ANV_PIPE_FLUSH_BITS)2147bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;214821492150/* HSD 1209978178: docs say that before programming the aux table:2151*2152* "Driver must ensure that the engine is IDLE but ensure it doesn't2153* add extra flushes in the case it knows that the engine is already2154* IDLE."2155*/2156if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))2157bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;21582159/* If we're going to do an invalidate and we have a pending end-of-pipe2160* sync that has yet to be resolved, we do the end-of-pipe sync now.2161*/2162if ((bits & ANV_PIPE_INVALIDATE_BITS) &&2163(bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {2164bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;2165bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;2166}21672168/* Wa_1409226450, Wait for EU to be idle before pipe control which2169* invalidates the instruction cache2170*/2171if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))2172bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;21732174if ((GFX_VER >= 8 && GFX_VER <= 9) &&2175(bits & ANV_PIPE_CS_STALL_BIT) &&2176(bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {2177/* If we are doing a VF cache invalidate AND a CS stall (it must be2178* both) then we can reset our vertex cache tracking.2179*/2180memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,2181sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));2182memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,2183sizeof(cmd_buffer->state.gfx.ib_dirty_range));2184}21852186/* Project: SKL / Argument: LRI Post Sync Operation [23]2187*2188* "PIPECONTROL command with “Command Streamer Stall Enable” must be2189* programmed prior to programming a PIPECONTROL command with "LRI2190* Post Sync Operation" in GPGPU mode of operation (i.e when2191* PIPELINE_SELECT command is set to GPGPU mode of operation)."2192*2193* The same text exists a few rows below for Post Sync Op.2194*2195* On Gfx12 this is Wa_1607156449.2196*/2197if (bits & ANV_PIPE_POST_SYNC_BIT) {2198if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) &&2199cmd_buffer->state.current_pipeline == GPGPU)2200bits |= ANV_PIPE_CS_STALL_BIT;2201bits &= ~ANV_PIPE_POST_SYNC_BIT;2202}22032204if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |2205ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {2206anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {2207#if GFX_VER >= 122208pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;2209pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;2210#else2211/* Flushing HDC pipeline requires DC Flush on earlier HW. 
*/2212pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;2213#endif2214pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;2215pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;2216pipe.RenderTargetCacheFlushEnable =2217bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;22182219/* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must2220* be set with any PIPE_CONTROL with Depth Flush Enable bit set.2221*/2222#if GFX_VER >= 122223pipe.DepthStallEnable =2224pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);2225#else2226pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;2227#endif22282229pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;2230pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;22312232/* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":2233*2234* "The most common action to perform upon reaching a2235* synchronization point is to write a value out to memory. An2236* immediate value (included with the synchronization command) may2237* be written."2238*2239*2240* From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":2241*2242* "In case the data flushed out by the render engine is to be2243* read back in to the render engine in coherent manner, then the2244* render engine has to wait for the fence completion before2245* accessing the flushed data. This can be achieved by following2246* means on various products: PIPE_CONTROL command with CS Stall2247* and the required write caches flushed with Post-Sync-Operation2248* as Write Immediate Data.2249*2250* Example:2251* - Workload-1 (3D/GPGPU/MEDIA)2252* - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write2253* Immediate Data, Required Write Cache Flush bits set)2254* - Workload-2 (Can use the data produce or output by2255* Workload-1)2256*/2257if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {2258pipe.CommandStreamerStallEnable = true;2259pipe.PostSyncOperation = WriteImmediateData;2260pipe.Address = cmd_buffer->device->workaround_address;2261}22622263/*2264* According to the Broadwell documentation, any PIPE_CONTROL with the2265* "Command Streamer Stall" bit set must also have another bit set,2266* with five different options:2267*2268* - Render Target Cache Flush2269* - Depth Cache Flush2270* - Stall at Pixel Scoreboard2271* - Post-Sync Operation2272* - Depth Stall2273* - DC Flush Enable2274*2275* I chose "Stall at Pixel Scoreboard" since that's what we use in2276* mesa and it seems to work fine. 
The choice is fairly arbitrary.
          */
         if (pipe.CommandStreamerStallEnable &&
             !pipe.RenderTargetCacheFlushEnable &&
             !pipe.DepthCacheFlushEnable &&
             !pipe.StallAtPixelScoreboard &&
             !pipe.PostSyncOperation &&
             !pipe.DepthStallEnable &&
             !pipe.DCFlushEnable)
            pipe.StallAtPixelScoreboard = true;
         anv_debug_dump_pc(pipe);
      }

      /* If a render target flush was emitted, then we can toggle off the bit
       * saying that render target writes are ongoing.
       */
      if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
         bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);

      if (GFX_VERx10 == 75) {
         /* Haswell needs additional workarounds:
          *
          * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
          *
          *    Option 1:
          *    PIPE_CONTROL command with the CS Stall and the required write
          *    caches flushed with Post-SyncOperation as Write Immediate Data
          *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
          *    space) commands.
          *
          *    Example:
          *       - Workload-1
          *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
          *         Immediate Data, Required Write Cache Flush bits set)
          *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
          *       - Workload-2 (Can use the data produce or output by
          *         Workload-1)
          *
          * Unfortunately, both the PRMs and the internal docs are a bit
          * out-of-date in this regard. What the windows driver does (and
          * this appears to actually work) is to emit a register read from the
          * memory address written by the pipe control above.
          *
          * What register we load into doesn't matter. We choose an indirect
          * rendering register because we know it always exists and it's one
          * of the first registers the command parser allows us to write. If
          * you don't have command parser support in your kernel (pre-4.2),
          * this will get turned into MI_NOOP and you won't get the
          * workaround. Unfortunately, there's just not much we can do in
          * that case. This register is perfectly safe to write since we
          * always re-load all of the indirect draw registers right before
          * 3DPRIMITIVE when needed anyway.
          */
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
            lrm.MemoryAddress = cmd_buffer->device->workaround_address;
         }
      }

      bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
                ANV_PIPE_END_OF_PIPE_SYNC_BIT);
   }

   if (bits & ANV_PIPE_INVALIDATE_BITS) {
      /* From the SKL PRM, Vol.
2a, "PIPE_CONTROL",2341*2342* "If the VF Cache Invalidation Enable is set to a 1 in a2343* PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to2344* 0, with the VF Cache Invalidation Enable set to 0 needs to be sent2345* prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to2346* a 1."2347*2348* This appears to hang Broadwell, so we restrict it to just gfx9.2349*/2350if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))2351anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);23522353anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {2354pipe.StateCacheInvalidationEnable =2355bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;2356pipe.ConstantCacheInvalidationEnable =2357bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;2358pipe.VFCacheInvalidationEnable =2359bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;2360pipe.TextureCacheInvalidationEnable =2361bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;2362pipe.InstructionCacheInvalidateEnable =2363bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;23642365/* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",2366*2367* "When VF Cache Invalidate is set “Post Sync Operation” must be2368* enabled to “Write Immediate Data” or “Write PS Depth Count” or2369* “Write Timestamp”.2370*/2371if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {2372pipe.PostSyncOperation = WriteImmediateData;2373pipe.Address = cmd_buffer->device->workaround_address;2374}2375anv_debug_dump_pc(pipe);2376}23772378#if GFX_VER == 122379if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&2380cmd_buffer->device->info.has_aux_map) {2381anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {2382lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);2383lri.DataDWord = 1;2384}2385}2386#endif23872388bits &= ~ANV_PIPE_INVALIDATE_BITS;2389}23902391cmd_buffer->state.pending_pipe_bits = bits;2392}23932394void genX(CmdPipelineBarrier)(2395VkCommandBuffer commandBuffer,2396VkPipelineStageFlags srcStageMask,2397VkPipelineStageFlags destStageMask,2398VkBool32 byRegion,2399uint32_t memoryBarrierCount,2400const VkMemoryBarrier* pMemoryBarriers,2401uint32_t bufferMemoryBarrierCount,2402const VkBufferMemoryBarrier* pBufferMemoryBarriers,2403uint32_t imageMemoryBarrierCount,2404const VkImageMemoryBarrier* pImageMemoryBarriers)2405{2406ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);24072408/* XXX: Right now, we're really dumb and just flush whatever categories2409* the app asks for. 
One of these days we may make this a bit better2410* but right now that's all the hardware allows for in most areas.2411*/2412VkAccessFlags src_flags = 0;2413VkAccessFlags dst_flags = 0;24142415for (uint32_t i = 0; i < memoryBarrierCount; i++) {2416src_flags |= pMemoryBarriers[i].srcAccessMask;2417dst_flags |= pMemoryBarriers[i].dstAccessMask;2418}24192420for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {2421src_flags |= pBufferMemoryBarriers[i].srcAccessMask;2422dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;2423}24242425for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {2426src_flags |= pImageMemoryBarriers[i].srcAccessMask;2427dst_flags |= pImageMemoryBarriers[i].dstAccessMask;2428ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);2429const VkImageSubresourceRange *range =2430&pImageMemoryBarriers[i].subresourceRange;24312432uint32_t base_layer, layer_count;2433if (image->type == VK_IMAGE_TYPE_3D) {2434base_layer = 0;2435layer_count = anv_minify(image->extent.depth, range->baseMipLevel);2436} else {2437base_layer = range->baseArrayLayer;2438layer_count = anv_get_layerCount(image, range);2439}24402441if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {2442transition_depth_buffer(cmd_buffer, image,2443base_layer, layer_count,2444pImageMemoryBarriers[i].oldLayout,2445pImageMemoryBarriers[i].newLayout,2446false /* will_full_fast_clear */);2447}24482449if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {2450transition_stencil_buffer(cmd_buffer, image,2451range->baseMipLevel,2452anv_get_levelCount(image, range),2453base_layer, layer_count,2454pImageMemoryBarriers[i].oldLayout,2455pImageMemoryBarriers[i].newLayout,2456false /* will_full_fast_clear */);2457}24582459if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {2460VkImageAspectFlags color_aspects =2461anv_image_expand_aspects(image, range->aspectMask);2462anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {2463transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,2464range->baseMipLevel,2465anv_get_levelCount(image, range),2466base_layer, layer_count,2467pImageMemoryBarriers[i].oldLayout,2468pImageMemoryBarriers[i].newLayout,2469pImageMemoryBarriers[i].srcQueueFamilyIndex,2470pImageMemoryBarriers[i].dstQueueFamilyIndex,2471false /* will_full_fast_clear */);2472}2473}2474}24752476anv_add_pending_pipe_bits(cmd_buffer,2477anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |2478anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags),2479"pipe barrier");2480}24812482static void2483cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)2484{2485VkShaderStageFlags stages =2486cmd_buffer->state.gfx.pipeline->active_stages;24872488/* In order to avoid thrash, we assume that vertex and fragment stages2489* always exist. In the rare case where one is missing *and* the other2490* uses push concstants, this may be suboptimal. However, avoiding stalls2491* seems more important.2492*/2493stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;24942495if (stages == cmd_buffer->state.gfx.push_constant_stages)2496return;24972498#if GFX_VER >= 82499const unsigned push_constant_kb = 32;2500#elif GFX_VERx10 == 752501const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 
32 : 16;2502#else2503const unsigned push_constant_kb = 16;2504#endif25052506const unsigned num_stages =2507util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);2508unsigned size_per_stage = push_constant_kb / num_stages;25092510/* Broadwell+ and Haswell gt3 require that the push constant sizes be in2511* units of 2KB. Incidentally, these are the same platforms that have2512* 32KB worth of push constant space.2513*/2514if (push_constant_kb == 32)2515size_per_stage &= ~1u;25162517uint32_t kb_used = 0;2518for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {2519unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;2520anv_batch_emit(&cmd_buffer->batch,2521GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {2522alloc._3DCommandSubOpcode = 18 + i;2523alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;2524alloc.ConstantBufferSize = push_size;2525}2526kb_used += push_size;2527}25282529anv_batch_emit(&cmd_buffer->batch,2530GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {2531alloc.ConstantBufferOffset = kb_used;2532alloc.ConstantBufferSize = push_constant_kb - kb_used;2533}25342535cmd_buffer->state.gfx.push_constant_stages = stages;25362537/* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:2538*2539* "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to2540* the next 3DPRIMITIVE command after programming the2541* 3DSTATE_PUSH_CONSTANT_ALLOC_VS"2542*2543* Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of2544* pipeline setup, we need to dirty push constants.2545*/2546cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;2547}25482549static VkResult2550emit_binding_table(struct anv_cmd_buffer *cmd_buffer,2551struct anv_cmd_pipeline_state *pipe_state,2552struct anv_shader_bin *shader,2553struct anv_state *bt_state)2554{2555struct anv_subpass *subpass = cmd_buffer->state.subpass;2556uint32_t state_offset;25572558struct anv_pipeline_bind_map *map = &shader->bind_map;2559if (map->surface_count == 0) {2560*bt_state = (struct anv_state) { 0, };2561return VK_SUCCESS;2562}25632564*bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,2565map->surface_count,2566&state_offset);2567uint32_t *bt_map = bt_state->map;25682569if (bt_state->map == NULL)2570return VK_ERROR_OUT_OF_DEVICE_MEMORY;25712572/* We only need to emit relocs if we're not using softpin. 
If we are using2573* softpin then we always keep all user-allocated memory objects resident.2574*/2575const bool need_client_mem_relocs =2576!anv_use_softpin(cmd_buffer->device->physical);2577struct anv_push_constants *push = &pipe_state->push_constants;25782579for (uint32_t s = 0; s < map->surface_count; s++) {2580struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];25812582struct anv_state surface_state;25832584switch (binding->set) {2585case ANV_DESCRIPTOR_SET_NULL:2586bt_map[s] = 0;2587break;25882589case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:2590/* Color attachment binding */2591assert(shader->stage == MESA_SHADER_FRAGMENT);2592if (binding->index < subpass->color_count) {2593const unsigned att =2594subpass->color_attachments[binding->index].attachment;25952596/* From the Vulkan 1.0.46 spec:2597*2598* "If any color or depth/stencil attachments are2599* VK_ATTACHMENT_UNUSED, then no writes occur for those2600* attachments."2601*/2602if (att == VK_ATTACHMENT_UNUSED) {2603surface_state = cmd_buffer->state.null_surface_state;2604} else {2605surface_state = cmd_buffer->state.attachments[att].color.state;2606}2607} else {2608surface_state = cmd_buffer->state.null_surface_state;2609}26102611assert(surface_state.map);2612bt_map[s] = surface_state.offset + state_offset;2613break;26142615case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {2616struct anv_state surface_state =2617anv_cmd_buffer_alloc_surface_state(cmd_buffer);26182619struct anv_address constant_data = {2620.bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,2621.offset = shader->kernel.offset +2622shader->prog_data->const_data_offset,2623};2624unsigned constant_data_size = shader->prog_data->const_data_size;26252626const enum isl_format format =2627anv_isl_format_for_descriptor_type(cmd_buffer->device,2628VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);2629anv_fill_buffer_surface_state(cmd_buffer->device,2630surface_state, format,2631ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,2632constant_data, constant_data_size, 1);26332634assert(surface_state.map);2635bt_map[s] = surface_state.offset + state_offset;2636add_surface_reloc(cmd_buffer, surface_state, constant_data);2637break;2638}26392640case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {2641/* This is always the first binding for compute shaders */2642assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);26432644struct anv_state surface_state =2645anv_cmd_buffer_alloc_surface_state(cmd_buffer);26462647const enum isl_format format =2648anv_isl_format_for_descriptor_type(cmd_buffer->device,2649VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);2650anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,2651format,2652ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,2653cmd_buffer->state.compute.num_workgroups,265412, 1);26552656assert(surface_state.map);2657bt_map[s] = surface_state.offset + state_offset;2658if (need_client_mem_relocs) {2659add_surface_reloc(cmd_buffer, surface_state,2660cmd_buffer->state.compute.num_workgroups);2661}2662break;2663}26642665case ANV_DESCRIPTOR_SET_DESCRIPTORS: {2666/* This is a descriptor set buffer so the set index is actually2667* given by binding->binding. 
(Yes, that's confusing.)2668*/2669struct anv_descriptor_set *set =2670pipe_state->descriptors[binding->index];2671assert(set->desc_mem.alloc_size);2672assert(set->desc_surface_state.alloc_size);2673bt_map[s] = set->desc_surface_state.offset + state_offset;2674add_surface_reloc(cmd_buffer, set->desc_surface_state,2675anv_descriptor_set_address(set));2676break;2677}26782679default: {2680assert(binding->set < MAX_SETS);2681const struct anv_descriptor_set *set =2682pipe_state->descriptors[binding->set];2683if (binding->index >= set->descriptor_count) {2684/* From the Vulkan spec section entitled "DescriptorSet and2685* Binding Assignment":2686*2687* "If the array is runtime-sized, then array elements greater2688* than or equal to the size of that binding in the bound2689* descriptor set must not be used."2690*2691* Unfortunately, the compiler isn't smart enough to figure out2692* when a dynamic binding isn't used so it may grab the whole2693* array and stick it in the binding table. In this case, it's2694* safe to just skip those bindings that are OOB.2695*/2696assert(binding->index < set->layout->descriptor_count);2697continue;2698}2699const struct anv_descriptor *desc = &set->descriptors[binding->index];27002701switch (desc->type) {2702case VK_DESCRIPTOR_TYPE_SAMPLER:2703/* Nothing for us to do here */2704continue;27052706case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:2707case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {2708if (desc->image_view) {2709struct anv_surface_state sstate =2710(desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?2711desc->image_view->planes[binding->plane].general_sampler_surface_state :2712desc->image_view->planes[binding->plane].optimal_sampler_surface_state;2713surface_state = sstate.state;2714assert(surface_state.alloc_size);2715if (need_client_mem_relocs)2716add_surface_state_relocs(cmd_buffer, sstate);2717} else {2718surface_state = cmd_buffer->device->null_surface_state;2719}2720break;2721}2722case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:2723assert(shader->stage == MESA_SHADER_FRAGMENT);2724assert(desc->image_view != NULL);2725if ((desc->image_view->aspect_mask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) {2726/* For depth and stencil input attachments, we treat it like any2727* old texture that a user may have bound.2728*/2729assert(desc->image_view->n_planes == 1);2730struct anv_surface_state sstate =2731(desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?2732desc->image_view->planes[0].general_sampler_surface_state :2733desc->image_view->planes[0].optimal_sampler_surface_state;2734surface_state = sstate.state;2735assert(surface_state.alloc_size);2736if (need_client_mem_relocs)2737add_surface_state_relocs(cmd_buffer, sstate);2738} else {2739/* For color input attachments, we create the surface state at2740* vkBeginRenderPass time so that we can include aux and clear2741* color information.2742*/2743assert(binding->input_attachment_index < subpass->input_count);2744const unsigned subpass_att = binding->input_attachment_index;2745const unsigned att = subpass->input_attachments[subpass_att].attachment;2746surface_state = cmd_buffer->state.attachments[att].input.state;2747}2748break;27492750case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {2751if (desc->image_view) {2752struct anv_surface_state sstate = (binding->write_only)2753? 
desc->image_view->planes[binding->plane].writeonly_storage_surface_state2754: desc->image_view->planes[binding->plane].storage_surface_state;2755surface_state = sstate.state;2756assert(surface_state.alloc_size);2757if (surface_state.offset == 0) {2758mesa_loge("Bound a image to a descriptor where the "2759"descriptor does not have NonReadable "2760"set and the image does not have a "2761"corresponding SPIR-V format enum.");2762vk_debug_report(&cmd_buffer->device->physical->instance->vk,2763VK_DEBUG_REPORT_ERROR_BIT_EXT,2764&desc->image_view->base,2765__LINE__, 0, "anv",2766"Bound a image to a descriptor where the "2767"descriptor does not have NonReadable "2768"set and the image does not have a "2769"corresponding SPIR-V format enum.");2770}2771if (surface_state.offset && need_client_mem_relocs)2772add_surface_state_relocs(cmd_buffer, sstate);2773} else {2774surface_state = cmd_buffer->device->null_surface_state;2775}2776break;2777}27782779case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:2780case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:2781case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:2782if (desc->buffer_view) {2783surface_state = desc->buffer_view->surface_state;2784assert(surface_state.alloc_size);2785if (need_client_mem_relocs) {2786add_surface_reloc(cmd_buffer, surface_state,2787desc->buffer_view->address);2788}2789} else {2790surface_state = cmd_buffer->device->null_surface_state;2791}2792break;27932794case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:2795case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {2796if (desc->buffer) {2797/* Compute the offset within the buffer */2798uint32_t dynamic_offset =2799push->dynamic_offsets[binding->dynamic_offset_index];2800uint64_t offset = desc->offset + dynamic_offset;2801/* Clamp to the buffer size */2802offset = MIN2(offset, desc->buffer->size);2803/* Clamp the range to the buffer size */2804uint32_t range = MIN2(desc->range, desc->buffer->size - offset);28052806/* Align the range for consistency */2807if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)2808range = align_u32(range, ANV_UBO_ALIGNMENT);28092810struct anv_address address =2811anv_address_add(desc->buffer->address, offset);28122813surface_state =2814anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);2815enum isl_format format =2816anv_isl_format_for_descriptor_type(cmd_buffer->device,2817desc->type);28182819isl_surf_usage_flags_t usage =2820desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?2821ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :2822ISL_SURF_USAGE_STORAGE_BIT;28232824anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,2825format, usage, address, range, 1);2826if (need_client_mem_relocs)2827add_surface_reloc(cmd_buffer, surface_state, address);2828} else {2829surface_state = cmd_buffer->device->null_surface_state;2830}2831break;2832}28332834case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:2835if (desc->buffer_view) {2836surface_state = (binding->write_only)2837? 
desc->buffer_view->writeonly_storage_surface_state2838: desc->buffer_view->storage_surface_state;2839assert(surface_state.alloc_size);2840if (need_client_mem_relocs) {2841add_surface_reloc(cmd_buffer, surface_state,2842desc->buffer_view->address);2843}2844} else {2845surface_state = cmd_buffer->device->null_surface_state;2846}2847break;28482849default:2850assert(!"Invalid descriptor type");2851continue;2852}2853assert(surface_state.map);2854bt_map[s] = surface_state.offset + state_offset;2855break;2856}2857}2858}28592860return VK_SUCCESS;2861}28622863static VkResult2864emit_samplers(struct anv_cmd_buffer *cmd_buffer,2865struct anv_cmd_pipeline_state *pipe_state,2866struct anv_shader_bin *shader,2867struct anv_state *state)2868{2869struct anv_pipeline_bind_map *map = &shader->bind_map;2870if (map->sampler_count == 0) {2871*state = (struct anv_state) { 0, };2872return VK_SUCCESS;2873}28742875uint32_t size = map->sampler_count * 16;2876*state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);28772878if (state->map == NULL)2879return VK_ERROR_OUT_OF_DEVICE_MEMORY;28802881for (uint32_t s = 0; s < map->sampler_count; s++) {2882struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];2883const struct anv_descriptor *desc =2884&pipe_state->descriptors[binding->set]->descriptors[binding->index];28852886if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&2887desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)2888continue;28892890struct anv_sampler *sampler = desc->sampler;28912892/* This can happen if we have an unfilled slot since TYPE_SAMPLER2893* happens to be zero.2894*/2895if (sampler == NULL)2896continue;28972898memcpy(state->map + (s * 16),2899sampler->state[binding->plane], sizeof(sampler->state[0]));2900}29012902return VK_SUCCESS;2903}29042905static uint32_t2906flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,2907struct anv_cmd_pipeline_state *pipe_state,2908const VkShaderStageFlags dirty,2909struct anv_shader_bin **shaders,2910uint32_t num_shaders)2911{2912VkShaderStageFlags flushed = 0;29132914VkResult result = VK_SUCCESS;2915for (uint32_t i = 0; i < num_shaders; i++) {2916if (!shaders[i])2917continue;29182919gl_shader_stage stage = shaders[i]->stage;2920VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);2921if ((vk_stage & dirty) == 0)2922continue;29232924assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));2925result = emit_samplers(cmd_buffer, pipe_state, shaders[i],2926&cmd_buffer->state.samplers[stage]);2927if (result != VK_SUCCESS)2928break;29292930assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));2931result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],2932&cmd_buffer->state.binding_tables[stage]);2933if (result != VK_SUCCESS)2934break;29352936flushed |= vk_stage;2937}29382939if (result != VK_SUCCESS) {2940assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);29412942result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);2943if (result != VK_SUCCESS)2944return 0;29452946/* Re-emit state base addresses so we get the new surface state base2947* address before we start emitting binding tables etc.2948*/2949genX(cmd_buffer_emit_state_base_address)(cmd_buffer);29502951/* Re-emit all active binding tables */2952flushed = 0;29532954for (uint32_t i = 0; i < num_shaders; i++) {2955if (!shaders[i])2956continue;29572958gl_shader_stage stage = shaders[i]->stage;29592960result = emit_samplers(cmd_buffer, pipe_state, shaders[i],2961&cmd_buffer->state.samplers[stage]);2962if (result != VK_SUCCESS) 
{2963anv_batch_set_error(&cmd_buffer->batch, result);2964return 0;2965}2966result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],2967&cmd_buffer->state.binding_tables[stage]);2968if (result != VK_SUCCESS) {2969anv_batch_set_error(&cmd_buffer->batch, result);2970return 0;2971}29722973flushed |= mesa_to_vk_shader_stage(stage);2974}2975}29762977return flushed;2978}29792980static void2981cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,2982uint32_t stages)2983{2984static const uint32_t sampler_state_opcodes[] = {2985[MESA_SHADER_VERTEX] = 43,2986[MESA_SHADER_TESS_CTRL] = 44, /* HS */2987[MESA_SHADER_TESS_EVAL] = 45, /* DS */2988[MESA_SHADER_GEOMETRY] = 46,2989[MESA_SHADER_FRAGMENT] = 47,2990[MESA_SHADER_COMPUTE] = 0,2991};29922993static const uint32_t binding_table_opcodes[] = {2994[MESA_SHADER_VERTEX] = 38,2995[MESA_SHADER_TESS_CTRL] = 39,2996[MESA_SHADER_TESS_EVAL] = 40,2997[MESA_SHADER_GEOMETRY] = 41,2998[MESA_SHADER_FRAGMENT] = 42,2999[MESA_SHADER_COMPUTE] = 0,3000};30013002anv_foreach_stage(s, stages) {3003assert(s < ARRAY_SIZE(binding_table_opcodes));3004assert(binding_table_opcodes[s] > 0);30053006if (cmd_buffer->state.samplers[s].alloc_size > 0) {3007anv_batch_emit(&cmd_buffer->batch,3008GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {3009ssp._3DCommandSubOpcode = sampler_state_opcodes[s];3010ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;3011}3012}30133014/* Always emit binding table pointers if we're asked to, since on SKL3015* this is what flushes push constants. */3016anv_batch_emit(&cmd_buffer->batch,3017GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {3018btp._3DCommandSubOpcode = binding_table_opcodes[s];3019btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;3020}3021}3022}30233024static struct anv_address3025get_push_range_address(struct anv_cmd_buffer *cmd_buffer,3026const struct anv_shader_bin *shader,3027const struct anv_push_range *range)3028{3029struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;3030switch (range->set) {3031case ANV_DESCRIPTOR_SET_DESCRIPTORS: {3032/* This is a descriptor set buffer so the set index is3033* actually given by binding->binding. 
(Yes, that's3034* confusing.)3035*/3036struct anv_descriptor_set *set =3037gfx_state->base.descriptors[range->index];3038return anv_descriptor_set_address(set);3039}30403041case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {3042if (gfx_state->base.push_constants_state.alloc_size == 0) {3043gfx_state->base.push_constants_state =3044anv_cmd_buffer_gfx_push_constants(cmd_buffer);3045}3046return (struct anv_address) {3047.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,3048.offset = gfx_state->base.push_constants_state.offset,3049};3050}30513052case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:3053return (struct anv_address) {3054.bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,3055.offset = shader->kernel.offset +3056shader->prog_data->const_data_offset,3057};30583059default: {3060assert(range->set < MAX_SETS);3061struct anv_descriptor_set *set =3062gfx_state->base.descriptors[range->set];3063const struct anv_descriptor *desc =3064&set->descriptors[range->index];30653066if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {3067if (desc->buffer_view)3068return desc->buffer_view->address;3069} else {3070assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);3071if (desc->buffer) {3072const struct anv_push_constants *push =3073&gfx_state->base.push_constants;3074uint32_t dynamic_offset =3075push->dynamic_offsets[range->dynamic_offset_index];3076return anv_address_add(desc->buffer->address,3077desc->offset + dynamic_offset);3078}3079}30803081/* For NULL UBOs, we just return an address in the workaround BO. We do3082* writes to it for workarounds but always at the bottom. The higher3083* bytes should be all zeros.3084*/3085assert(range->length * 32 <= 2048);3086return (struct anv_address) {3087.bo = cmd_buffer->device->workaround_bo,3088.offset = 1024,3089};3090}3091}3092}309330943095/** Returns the size in bytes of the bound buffer3096*3097* The range is relative to the start of the buffer, not the start of the3098* range. 
The returned range may be smaller than3099*3100* (range->start + range->length) * 32;3101*/3102static uint32_t3103get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,3104const struct anv_shader_bin *shader,3105const struct anv_push_range *range)3106{3107assert(shader->stage != MESA_SHADER_COMPUTE);3108const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;3109switch (range->set) {3110case ANV_DESCRIPTOR_SET_DESCRIPTORS: {3111struct anv_descriptor_set *set =3112gfx_state->base.descriptors[range->index];3113assert(range->start * 32 < set->desc_mem.alloc_size);3114assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);3115return set->desc_mem.alloc_size;3116}31173118case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:3119return (range->start + range->length) * 32;31203121case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:3122return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);31233124default: {3125assert(range->set < MAX_SETS);3126struct anv_descriptor_set *set =3127gfx_state->base.descriptors[range->set];3128const struct anv_descriptor *desc =3129&set->descriptors[range->index];31303131if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {3132if (!desc->buffer_view)3133return 0;31343135if (range->start * 32 > desc->buffer_view->range)3136return 0;31373138return desc->buffer_view->range;3139} else {3140if (!desc->buffer)3141return 0;31423143assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);3144/* Compute the offset within the buffer */3145const struct anv_push_constants *push =3146&gfx_state->base.push_constants;3147uint32_t dynamic_offset =3148push->dynamic_offsets[range->dynamic_offset_index];3149uint64_t offset = desc->offset + dynamic_offset;3150/* Clamp to the buffer size */3151offset = MIN2(offset, desc->buffer->size);3152/* Clamp the range to the buffer size */3153uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);31543155/* Align the range for consistency */3156bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);31573158return bound_range;3159}3160}3161}3162}31633164static void3165cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,3166gl_shader_stage stage,3167struct anv_address *buffers,3168unsigned buffer_count)3169{3170const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;3171const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;31723173static const uint32_t push_constant_opcodes[] = {3174[MESA_SHADER_VERTEX] = 21,3175[MESA_SHADER_TESS_CTRL] = 25, /* HS */3176[MESA_SHADER_TESS_EVAL] = 26, /* DS */3177[MESA_SHADER_GEOMETRY] = 22,3178[MESA_SHADER_FRAGMENT] = 23,3179[MESA_SHADER_COMPUTE] = 0,3180};31813182assert(stage < ARRAY_SIZE(push_constant_opcodes));3183assert(push_constant_opcodes[stage] > 0);31843185anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {3186c._3DCommandSubOpcode = push_constant_opcodes[stage];31873188if (anv_pipeline_has_stage(pipeline, stage)) {3189const struct anv_pipeline_bind_map *bind_map =3190&pipeline->shaders[stage]->bind_map;31913192#if GFX_VER >= 93193/* This field exists since Gfx8. However, the Broadwell PRM says:3194*3195* "Constant Buffer Object Control State must be always programmed3196* to zero."3197*3198* This restriction does not exist on any newer platforms.3199*3200* We only have one MOCS field for the whole packet, not one per3201* buffer. We could go out of our way here to walk over all of the3202* buffers and see if any of them are used externally and use the3203* external MOCS. 
However, the notion that someone would use the3204* same bit of memory for both scanout and a UBO is nuts. Let's not3205* bother and assume it's all internal.3206*/3207c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);3208#endif32093210#if GFX_VERx10 >= 753211/* The Skylake PRM contains the following restriction:3212*3213* "The driver must ensure The following case does not occur3214* without a flush to the 3D engine: 3DSTATE_CONSTANT_* with3215* buffer 3 read length equal to zero committed followed by a3216* 3DSTATE_CONSTANT_* with buffer 0 read length not equal to3217* zero committed."3218*3219* To avoid this, we program the buffers in the highest slots.3220* This way, slot 0 is only used if slot 3 is also used.3221*/3222assert(buffer_count <= 4);3223const unsigned shift = 4 - buffer_count;3224for (unsigned i = 0; i < buffer_count; i++) {3225const struct anv_push_range *range = &bind_map->push_ranges[i];32263227/* At this point we only have non-empty ranges */3228assert(range->length > 0);32293230/* For Ivy Bridge, make sure we only set the first range (actual3231* push constants)3232*/3233assert((GFX_VERx10 >= 75) || i == 0);32343235c.ConstantBody.ReadLength[i + shift] = range->length;3236c.ConstantBody.Buffer[i + shift] =3237anv_address_add(buffers[i], range->start * 32);3238}3239#else3240/* For Ivy Bridge, push constants are relative to dynamic state3241* base address and we only ever push actual push constants.3242*/3243if (bind_map->push_ranges[0].length > 0) {3244assert(buffer_count == 1);3245assert(bind_map->push_ranges[0].set ==3246ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);3247assert(buffers[0].bo ==3248cmd_buffer->device->dynamic_state_pool.block_pool.bo);3249c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;3250c.ConstantBody.Buffer[0].bo = NULL;3251c.ConstantBody.Buffer[0].offset = buffers[0].offset;3252}3253assert(bind_map->push_ranges[1].length == 0);3254assert(bind_map->push_ranges[2].length == 0);3255assert(bind_map->push_ranges[3].length == 0);3256#endif3257}3258}3259}32603261#if GFX_VER >= 123262static void3263cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,3264uint32_t shader_mask,3265struct anv_address *buffers,3266uint32_t buffer_count)3267{3268if (buffer_count == 0) {3269anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {3270c.ShaderUpdateEnable = shader_mask;3271c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);3272}3273return;3274}32753276const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;3277const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;32783279static const UNUSED uint32_t push_constant_opcodes[] = {3280[MESA_SHADER_VERTEX] = 21,3281[MESA_SHADER_TESS_CTRL] = 25, /* HS */3282[MESA_SHADER_TESS_EVAL] = 26, /* DS */3283[MESA_SHADER_GEOMETRY] = 22,3284[MESA_SHADER_FRAGMENT] = 23,3285[MESA_SHADER_COMPUTE] = 0,3286};32873288gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);3289assert(stage < ARRAY_SIZE(push_constant_opcodes));3290assert(push_constant_opcodes[stage] > 0);32913292const struct anv_pipeline_bind_map *bind_map =3293&pipeline->shaders[stage]->bind_map;32943295uint32_t *dw;3296const uint32_t buffer_mask = (1 << buffer_count) - 1;3297const uint32_t num_dwords = 2 + 2 * buffer_count;32983299dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,3300GENX(3DSTATE_CONSTANT_ALL),3301.ShaderUpdateEnable = shader_mask,3302.PointerBufferMask = buffer_mask,3303.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));33043305for (int i = 0; i < buffer_count; i++) 
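   /* Each 3DSTATE_CONSTANT_ALL_DATA entry takes two dwords (a pointer and a
    * read length), which is why num_dwords is 2 + 2 * buffer_count above.
    * For example, with buffer_count == 2 the packet is 6 dwords long and
    * PointerBufferMask is 0x3.
    */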
{3306const struct anv_push_range *range = &bind_map->push_ranges[i];3307GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(3308&cmd_buffer->batch, dw + 2 + i * 2,3309&(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {3310.PointerToConstantBuffer =3311anv_address_add(buffers[i], range->start * 32),3312.ConstantBufferReadLength = range->length,3313});3314}3315}3316#endif33173318static void3319cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,3320VkShaderStageFlags dirty_stages)3321{3322VkShaderStageFlags flushed = 0;3323struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;3324const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;33253326#if GFX_VER >= 123327uint32_t nobuffer_stages = 0;3328#endif33293330/* Compute robust pushed register access mask for each stage. */3331if (cmd_buffer->device->robust_buffer_access) {3332anv_foreach_stage(stage, dirty_stages) {3333if (!anv_pipeline_has_stage(pipeline, stage))3334continue;33353336const struct anv_shader_bin *shader = pipeline->shaders[stage];3337const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;3338struct anv_push_constants *push = &gfx_state->base.push_constants;33393340push->push_reg_mask[stage] = 0;3341/* Start of the current range in the shader, relative to the start of3342* push constants in the shader.3343*/3344unsigned range_start_reg = 0;3345for (unsigned i = 0; i < 4; i++) {3346const struct anv_push_range *range = &bind_map->push_ranges[i];3347if (range->length == 0)3348continue;33493350unsigned bound_size =3351get_push_range_bound_size(cmd_buffer, shader, range);3352if (bound_size >= range->start * 32) {3353unsigned bound_regs =3354MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,3355range->length);3356assert(range_start_reg + bound_regs <= 64);3357push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,3358bound_regs);3359}33603361cmd_buffer->state.push_constants_dirty |=3362mesa_to_vk_shader_stage(stage);33633364range_start_reg += range->length;3365}3366}3367}33683369/* Resets the push constant state so that we allocate a new one if3370* needed.3371*/3372gfx_state->base.push_constants_state = ANV_STATE_NULL;33733374anv_foreach_stage(stage, dirty_stages) {3375unsigned buffer_count = 0;3376flushed |= mesa_to_vk_shader_stage(stage);3377UNUSED uint32_t max_push_range = 0;33783379struct anv_address buffers[4] = {};3380if (anv_pipeline_has_stage(pipeline, stage)) {3381const struct anv_shader_bin *shader = pipeline->shaders[stage];3382const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;33833384/* We have to gather buffer addresses as a second step because the3385* loop above puts data into the push constant area and the call to3386* get_push_range_address is what locks our push constants and copies3387* them into the actual GPU buffer. 
If we did the two loops at the3388* same time, we'd risk only having some of the sizes in the push3389* constant buffer when we did the copy.3390*/3391for (unsigned i = 0; i < 4; i++) {3392const struct anv_push_range *range = &bind_map->push_ranges[i];3393if (range->length == 0)3394break;33953396buffers[i] = get_push_range_address(cmd_buffer, shader, range);3397max_push_range = MAX2(max_push_range, range->length);3398buffer_count++;3399}34003401/* We have at most 4 buffers but they should be tightly packed */3402for (unsigned i = buffer_count; i < 4; i++)3403assert(bind_map->push_ranges[i].length == 0);3404}34053406#if GFX_VER >= 123407/* If this stage doesn't have any push constants, emit it later in a3408* single CONSTANT_ALL packet.3409*/3410if (buffer_count == 0) {3411nobuffer_stages |= 1 << stage;3412continue;3413}34143415/* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL3416* contains only 5 bits, so we can only use it for buffers smaller than3417* 32.3418*/3419if (max_push_range < 32) {3420cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,3421buffers, buffer_count);3422continue;3423}3424#endif34253426cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);3427}34283429#if GFX_VER >= 123430if (nobuffer_stages)3431cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);3432#endif34333434cmd_buffer->state.push_constants_dirty &= ~flushed;3435}34363437static void3438cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)3439{3440const uint32_t clip_states =3441#if GFX_VER <= 73442ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |3443ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |3444#endif3445ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |3446ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |3447ANV_CMD_DIRTY_PIPELINE;34483449if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)3450return;34513452/* Take dynamic primitive topology in to account with3453* 3DSTATE_CLIP::ViewportXYClipTestEnable3454*/3455bool xy_clip_test_enable = 0;34563457if (cmd_buffer->state.gfx.pipeline->dynamic_states &3458ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {3459VkPrimitiveTopology primitive_topology =3460cmd_buffer->state.gfx.dynamic.primitive_topology;34613462VkPolygonMode dynamic_raster_mode =3463genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,3464primitive_topology);34653466xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);3467}34683469#if GFX_VER <= 73470const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;3471#endif3472struct GENX(3DSTATE_CLIP) clip = {3473GENX(3DSTATE_CLIP_header),3474#if GFX_VER <= 73475.FrontWinding = genX(vk_to_intel_front_face)[d->front_face],3476.CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],3477#endif3478.ViewportXYClipTestEnable = xy_clip_test_enable,3479};3480uint32_t dwords[GENX(3DSTATE_CLIP_length)];34813482struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;3483const struct brw_vue_prog_data *last =3484anv_pipeline_get_last_vue_prog_data(pipeline);3485if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {3486clip.MaximumVPIndex =3487cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?3488cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;3489}34903491GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);3492anv_batch_emit_merge(&cmd_buffer->batch, dwords,3493pipeline->gfx7.clip);3494}34953496static void3497cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)3498{3499const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;3500struct anv_graphics_pipeline *pipeline = 
cmd_buffer->state.gfx.pipeline;35013502#if GFX_VER == 73503# define streamout_state_dw pipeline->gfx7.streamout_state3504#else3505# define streamout_state_dw pipeline->gfx8.streamout_state3506#endif35073508uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];35093510struct GENX(3DSTATE_STREAMOUT) so = {3511GENX(3DSTATE_STREAMOUT_header),3512.RenderingDisable = d->raster_discard,3513};3514GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);3515anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);3516}35173518void3519genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)3520{3521struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;3522uint32_t *p;35233524assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);35253526genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);35273528genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);35293530genX(flush_pipeline_select_3d)(cmd_buffer);35313532/* Apply any pending pipeline flushes we may have. We want to apply them3533* now because, if any of those flushes are for things like push constants,3534* the GPU will read the state at weird times.3535*/3536genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);35373538uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;3539if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)3540vb_emit |= pipeline->vb_used;35413542if (vb_emit) {3543const uint32_t num_buffers = __builtin_popcount(vb_emit);3544const uint32_t num_dwords = 1 + num_buffers * 4;35453546p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,3547GENX(3DSTATE_VERTEX_BUFFERS));3548uint32_t i = 0;3549u_foreach_bit(vb, vb_emit) {3550struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;3551uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;35523553/* If dynamic, use stride/size from vertex binding, otherwise use3554* stride/size that was setup in the pipeline object.3555*/3556bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;3557bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;35583559struct GENX(VERTEX_BUFFER_STATE) state;3560if (buffer) {3561uint32_t stride = dynamic_stride ?3562cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;3563/* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):3564*3565* "If pname:pSizes is not NULL then pname:pSizes[i] specifies3566* the bound size of the vertex buffer starting from the corresponding3567* elements of pname:pBuffers[i] plus pname:pOffsets[i]."3568*/3569UNUSED uint32_t size = dynamic_size ?3570cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;35713572state = (struct GENX(VERTEX_BUFFER_STATE)) {3573.VertexBufferIndex = vb,35743575.MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,3576ISL_SURF_USAGE_VERTEX_BUFFER_BIT),3577#if GFX_VER <= 73578.BufferAccessType = pipeline->vb[vb].instanced ? 
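                  /* On Gfx7 the per-instance vs. per-vertex access type and
                   * the instance step rate are programmed directly in
                   * VERTEX_BUFFER_STATE; newer gens configure instancing
                   * through 3DSTATE_VF_INSTANCING, so these fields are
                   * compiled out there.
                   */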
INSTANCEDATA : VERTEXDATA,3579.InstanceDataStepRate = pipeline->vb[vb].instance_divisor,3580#endif3581.AddressModifyEnable = true,3582.BufferPitch = stride,3583.BufferStartingAddress = anv_address_add(buffer->address, offset),3584.NullVertexBuffer = offset >= buffer->size,3585#if GFX_VER >= 123586.L3BypassDisable = true,3587#endif35883589#if GFX_VER >= 83590.BufferSize = size,3591#else3592/* XXX: to handle dynamic offset for older gens we might want3593* to modify Endaddress, but there are issues when doing so:3594*3595* https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/74393596*/3597.EndAddress = anv_address_add(buffer->address, buffer->size - 1),3598#endif3599};3600} else {3601state = (struct GENX(VERTEX_BUFFER_STATE)) {3602.VertexBufferIndex = vb,3603.NullVertexBuffer = true,3604};3605}36063607#if GFX_VER >= 8 && GFX_VER <= 93608genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,3609state.BufferStartingAddress,3610state.BufferSize);3611#endif36123613GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);3614i++;3615}3616}36173618cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;36193620uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &3621pipeline->active_stages;3622if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&3623!cmd_buffer->state.push_constants_dirty)3624return;36253626if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||3627(GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &3628ANV_CMD_DIRTY_PIPELINE))) {3629/* We don't need any per-buffer dirty tracking because you're not3630* allowed to bind different XFB buffers while XFB is enabled.3631*/3632for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {3633struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];3634anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {3635#if GFX_VER < 123636sob.SOBufferIndex = idx;3637#else3638sob._3DCommandOpcode = 0;3639sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;3640#endif36413642if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {3643sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);3644sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,3645xfb->offset);3646#if GFX_VER >= 83647sob.SOBufferEnable = true;3648sob.StreamOffsetWriteEnable = false;3649/* Size is in DWords - 1 */3650sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;3651#else3652/* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so3653* we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the3654* default for an empty SO_BUFFER packet) to disable them.3655*/3656sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];3657sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,3658xfb->offset + xfb->size);3659#endif3660}3661}3662}36633664/* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */3665if (GFX_VER >= 10) {3666anv_add_pending_pipe_bits(cmd_buffer,3667ANV_PIPE_CS_STALL_BIT,3668"after 3DSTATE_SO_BUFFER call");3669}3670}36713672if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {3673anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);36743675/* Remove from dynamic state emission all of stuff that is baked into3676* the pipeline.3677*/3678cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask;36793680/* If the pipeline changed, we may need to re-allocate push constant3681* space in the URB.3682*/3683cmd_buffer_alloc_push_constants(cmd_buffer);3684}36853686if (cmd_buffer->state.gfx.dirty & 
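   /* Re-seed the primitive topology from the freshly bound pipeline; the
    * draw calls below emit it via 3DPRIMITIVE::PrimitiveTopologyType.
    */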
ANV_CMD_DIRTY_PIPELINE)3687cmd_buffer->state.gfx.primitive_topology = pipeline->topology;36883689#if GFX_VER <= 73690if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||3691cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {3692/* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:3693*3694* "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth3695* stall needs to be sent just prior to any 3DSTATE_VS,3696* 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,3697* 3DSTATE_BINDING_TABLE_POINTER_VS,3698* 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one3699* PIPE_CONTROL needs to be sent before any combination of VS3700* associated 3DSTATE."3701*/3702anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {3703pc.DepthStallEnable = true;3704pc.PostSyncOperation = WriteImmediateData;3705pc.Address = cmd_buffer->device->workaround_address;3706anv_debug_dump_pc(pc);3707}3708}3709#endif37103711/* Render targets live in the same binding table as fragment descriptors */3712if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)3713descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;37143715/* We emit the binding tables and sampler tables first, then emit push3716* constants and then finally emit binding table and sampler table3717* pointers. It has to happen in this order, since emitting the binding3718* tables may change the push constants (in case of storage images). After3719* emitting push constants, on SKL+ we have to emit the corresponding3720* 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.3721*/3722uint32_t dirty = 0;3723if (descriptors_dirty) {3724dirty = flush_descriptor_sets(cmd_buffer,3725&cmd_buffer->state.gfx.base,3726descriptors_dirty,3727pipeline->shaders,3728ARRAY_SIZE(pipeline->shaders));3729cmd_buffer->state.descriptors_dirty &= ~dirty;3730}37313732if (dirty || cmd_buffer->state.push_constants_dirty) {3733/* Because we're pushing UBOs, we have to push whenever either3734* descriptors or push constants is dirty.3735*/3736dirty |= cmd_buffer->state.push_constants_dirty;3737dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS;3738cmd_buffer_flush_push_constants(cmd_buffer, dirty);3739}37403741if (dirty)3742cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);37433744cmd_buffer_emit_clip(cmd_buffer);37453746if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)3747cmd_buffer_emit_streamout(cmd_buffer);37483749if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)3750gfx8_cmd_buffer_emit_viewport(cmd_buffer);37513752if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |3753ANV_CMD_DIRTY_PIPELINE)) {3754gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer,3755pipeline->depth_clamp_enable);3756}37573758if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR |3759ANV_CMD_DIRTY_RENDER_TARGETS))3760gfx7_cmd_buffer_emit_scissor(cmd_buffer);37613762genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);3763}37643765static void3766emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,3767struct anv_address addr,3768uint32_t size, uint32_t index)3769{3770uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,3771GENX(3DSTATE_VERTEX_BUFFERS));37723773GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,3774&(struct GENX(VERTEX_BUFFER_STATE)) {3775.VertexBufferIndex = index,3776.AddressModifyEnable = true,3777.BufferPitch = 0,3778.MOCS = addr.bo ? 
anv_mocs(cmd_buffer->device, addr.bo,3779ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0,3780.NullVertexBuffer = size == 0,3781#if GFX_VER >= 123782.L3BypassDisable = true,3783#endif3784#if (GFX_VER >= 8)3785.BufferStartingAddress = addr,3786.BufferSize = size3787#else3788.BufferStartingAddress = addr,3789.EndAddress = anv_address_add(addr, size),3790#endif3791});37923793genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,3794index, addr, size);3795}37963797static void3798emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,3799struct anv_address addr)3800{3801emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);3802}38033804static void3805emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,3806uint32_t base_vertex, uint32_t base_instance)3807{3808if (base_vertex == 0 && base_instance == 0) {3809emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);3810} else {3811struct anv_state id_state =3812anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);38133814((uint32_t *)id_state.map)[0] = base_vertex;3815((uint32_t *)id_state.map)[1] = base_instance;38163817struct anv_address addr = {3818.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,3819.offset = id_state.offset,3820};38213822emit_base_vertex_instance_bo(cmd_buffer, addr);3823}3824}38253826static void3827emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)3828{3829struct anv_state state =3830anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);38313832((uint32_t *)state.map)[0] = draw_index;38333834struct anv_address addr = {3835.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,3836.offset = state.offset,3837};38383839emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);3840}38413842static void3843update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,3844uint32_t access_type)3845{3846struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;3847const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);38483849uint64_t vb_used = pipeline->vb_used;3850if (vs_prog_data->uses_firstvertex ||3851vs_prog_data->uses_baseinstance)3852vb_used |= 1ull << ANV_SVGS_VB_INDEX;3853if (vs_prog_data->uses_drawid)3854vb_used |= 1ull << ANV_DRAWID_VB_INDEX;38553856genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,3857access_type == RANDOM,3858vb_used);3859}38603861ALWAYS_INLINE static void3862cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,3863const struct brw_vs_prog_data *vs_prog_data,3864uint32_t base_vertex,3865uint32_t base_instance,3866uint32_t draw_id,3867bool force_flush)3868{3869bool emitted = false;3870if (vs_prog_data->uses_firstvertex ||3871vs_prog_data->uses_baseinstance) {3872emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);3873emitted = true;3874}3875if (vs_prog_data->uses_drawid) {3876emit_draw_index(cmd_buffer, draw_id);3877emitted = true;3878}3879/* Emitting draw index or vertex index BOs may result in needing3880* additional VF cache flushes.3881*/3882if (emitted || force_flush)3883genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);3884}38853886void genX(CmdDraw)(3887VkCommandBuffer commandBuffer,3888uint32_t vertexCount,3889uint32_t instanceCount,3890uint32_t firstVertex,3891uint32_t firstInstance)3892{3893ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);3894struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;3895const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);38963897if 
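   /* emit_base_vertex_instance() and emit_draw_index() above back shaders
    * that read gl_BaseVertex/gl_BaseInstance/gl_DrawID: the former binds
    * { base vertex, base instance } as two dwords at ANV_SVGS_VB_INDEX and
    * the latter binds the draw index at ANV_DRAWID_VB_INDEX.
    */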
(anv_batch_has_error(&cmd_buffer->batch))3898return;38993900const uint32_t count = (vertexCount *3901instanceCount *3902(pipeline->use_primitive_replication ?39031 : anv_subpass_view_count(cmd_buffer->state.subpass)));3904anv_measure_snapshot(cmd_buffer,3905INTEL_SNAPSHOT_DRAW,3906"draw", count);39073908genX(cmd_buffer_flush_state)(cmd_buffer);39093910if (cmd_buffer->state.conditional_render_enabled)3911genX(cmd_emit_conditional_render_predicate)(cmd_buffer);39123913cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,3914firstVertex, firstInstance, 0,3915true);39163917/* Our implementation of VK_KHR_multiview uses instancing to draw the3918* different views. We need to multiply instanceCount by the view count.3919*/3920if (!pipeline->use_primitive_replication)3921instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);39223923anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {3924prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;3925prim.VertexAccessType = SEQUENTIAL;3926prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;3927prim.VertexCountPerInstance = vertexCount;3928prim.StartVertexLocation = firstVertex;3929prim.InstanceCount = instanceCount;3930prim.StartInstanceLocation = firstInstance;3931prim.BaseVertexLocation = 0;3932}39333934update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);3935}39363937void genX(CmdDrawMultiEXT)(3938VkCommandBuffer commandBuffer,3939uint32_t drawCount,3940const VkMultiDrawInfoEXT *pVertexInfo,3941uint32_t instanceCount,3942uint32_t firstInstance,3943uint32_t stride)3944{3945ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);3946struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;3947const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);39483949if (anv_batch_has_error(&cmd_buffer->batch))3950return;39513952const uint32_t count = (drawCount *3953instanceCount *3954(pipeline->use_primitive_replication ?39551 : anv_subpass_view_count(cmd_buffer->state.subpass)));3956anv_measure_snapshot(cmd_buffer,3957INTEL_SNAPSHOT_DRAW,3958"draw_multi", count);39593960genX(cmd_buffer_flush_state)(cmd_buffer);39613962if (cmd_buffer->state.conditional_render_enabled)3963genX(cmd_emit_conditional_render_predicate)(cmd_buffer);39643965/* Our implementation of VK_KHR_multiview uses instancing to draw the3966* different views. 
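* (anv_subpass_view_count() is effectively the number of views in the
* current subpass' view mask, or 1 when multiview is not in use.)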
We need to multiply instanceCount by the view count.3967*/3968if (!pipeline->use_primitive_replication)3969instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);39703971uint32_t i = 0;3972vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {3973cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,3974draw->firstVertex,3975firstInstance, i, !i);39763977anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {3978prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;3979prim.VertexAccessType = SEQUENTIAL;3980prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;3981prim.VertexCountPerInstance = draw->vertexCount;3982prim.StartVertexLocation = draw->firstVertex;3983prim.InstanceCount = instanceCount;3984prim.StartInstanceLocation = firstInstance;3985prim.BaseVertexLocation = 0;3986}3987}39883989update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);3990}39913992void genX(CmdDrawIndexed)(3993VkCommandBuffer commandBuffer,3994uint32_t indexCount,3995uint32_t instanceCount,3996uint32_t firstIndex,3997int32_t vertexOffset,3998uint32_t firstInstance)3999{4000ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4001struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;4002const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);40034004if (anv_batch_has_error(&cmd_buffer->batch))4005return;40064007const uint32_t count = (indexCount *4008instanceCount *4009(pipeline->use_primitive_replication ?40101 : anv_subpass_view_count(cmd_buffer->state.subpass)));4011anv_measure_snapshot(cmd_buffer,4012INTEL_SNAPSHOT_DRAW,4013"draw indexed",4014count);40154016genX(cmd_buffer_flush_state)(cmd_buffer);40174018if (cmd_buffer->state.conditional_render_enabled)4019genX(cmd_emit_conditional_render_predicate)(cmd_buffer);40204021cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);40224023/* Our implementation of VK_KHR_multiview uses instancing to draw the4024* different views. 
We need to multiply instanceCount by the view count.4025*/4026if (!pipeline->use_primitive_replication)4027instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);40284029anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4030prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4031prim.VertexAccessType = RANDOM;4032prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4033prim.VertexCountPerInstance = indexCount;4034prim.StartVertexLocation = firstIndex;4035prim.InstanceCount = instanceCount;4036prim.StartInstanceLocation = firstInstance;4037prim.BaseVertexLocation = vertexOffset;4038}40394040update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);4041}40424043void genX(CmdDrawMultiIndexedEXT)(4044VkCommandBuffer commandBuffer,4045uint32_t drawCount,4046const VkMultiDrawIndexedInfoEXT *pIndexInfo,4047uint32_t instanceCount,4048uint32_t firstInstance,4049uint32_t stride,4050const int32_t *pVertexOffset)4051{4052ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4053struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;4054const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);40554056if (anv_batch_has_error(&cmd_buffer->batch))4057return;40584059const uint32_t count = (drawCount *4060instanceCount *4061(pipeline->use_primitive_replication ?40621 : anv_subpass_view_count(cmd_buffer->state.subpass)));4063anv_measure_snapshot(cmd_buffer,4064INTEL_SNAPSHOT_DRAW,4065"draw indexed_multi",4066count);40674068genX(cmd_buffer_flush_state)(cmd_buffer);40694070if (cmd_buffer->state.conditional_render_enabled)4071genX(cmd_emit_conditional_render_predicate)(cmd_buffer);40724073/* Our implementation of VK_KHR_multiview uses instancing to draw the4074* different views. We need to multiply instanceCount by the view count.4075*/4076if (!pipeline->use_primitive_replication)4077instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);40784079uint32_t i = 0;4080if (pVertexOffset) {4081if (vs_prog_data->uses_drawid) {4082bool emitted = true;4083if (vs_prog_data->uses_firstvertex ||4084vs_prog_data->uses_baseinstance) {4085emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);4086emitted = true;4087}4088vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {4089if (vs_prog_data->uses_drawid) {4090emit_draw_index(cmd_buffer, i);4091emitted = true;4092}4093/* Emitting draw index or vertex index BOs may result in needing4094* additional VF cache flushes.4095*/4096if (emitted)4097genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);40984099anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4100prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4101prim.VertexAccessType = RANDOM;4102prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4103prim.VertexCountPerInstance = draw->indexCount;4104prim.StartVertexLocation = draw->firstIndex;4105prim.InstanceCount = instanceCount;4106prim.StartInstanceLocation = firstInstance;4107prim.BaseVertexLocation = *pVertexOffset;4108}4109emitted = false;4110}4111} else {4112if (vs_prog_data->uses_firstvertex ||4113vs_prog_data->uses_baseinstance) {4114emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);4115/* Emitting draw index or vertex index BOs may result in needing4116* additional VF cache flushes.4117*/4118genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);4119}4120vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {4121anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), 
prim) {4122prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4123prim.VertexAccessType = RANDOM;4124prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4125prim.VertexCountPerInstance = draw->indexCount;4126prim.StartVertexLocation = draw->firstIndex;4127prim.InstanceCount = instanceCount;4128prim.StartInstanceLocation = firstInstance;4129prim.BaseVertexLocation = *pVertexOffset;4130}4131}4132}4133} else {4134vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {4135cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,4136draw->vertexOffset,4137firstInstance, i, i != 0);41384139anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4140prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4141prim.VertexAccessType = RANDOM;4142prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4143prim.VertexCountPerInstance = draw->indexCount;4144prim.StartVertexLocation = draw->firstIndex;4145prim.InstanceCount = instanceCount;4146prim.StartInstanceLocation = firstInstance;4147prim.BaseVertexLocation = draw->vertexOffset;4148}4149}4150}41514152update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);4153}41544155/* Auto-Draw / Indirect Registers */4156#define GFX7_3DPRIM_END_OFFSET 0x24204157#define GFX7_3DPRIM_START_VERTEX 0x24304158#define GFX7_3DPRIM_VERTEX_COUNT 0x24344159#define GFX7_3DPRIM_INSTANCE_COUNT 0x24384160#define GFX7_3DPRIM_START_INSTANCE 0x243C4161#define GFX7_3DPRIM_BASE_VERTEX 0x244041624163void genX(CmdDrawIndirectByteCountEXT)(4164VkCommandBuffer commandBuffer,4165uint32_t instanceCount,4166uint32_t firstInstance,4167VkBuffer counterBuffer,4168VkDeviceSize counterBufferOffset,4169uint32_t counterOffset,4170uint32_t vertexStride)4171{4172#if GFX_VERx10 >= 754173ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4174ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);4175struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;4176const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);41774178/* firstVertex is always zero for this draw function */4179const uint32_t firstVertex = 0;41804181if (anv_batch_has_error(&cmd_buffer->batch))4182return;41834184anv_measure_snapshot(cmd_buffer,4185INTEL_SNAPSHOT_DRAW,4186"draw indirect byte count",4187instanceCount);41884189genX(cmd_buffer_flush_state)(cmd_buffer);41904191if (vs_prog_data->uses_firstvertex ||4192vs_prog_data->uses_baseinstance)4193emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);4194if (vs_prog_data->uses_drawid)4195emit_draw_index(cmd_buffer, 0);41964197/* Emitting draw index or vertex index BOs may result in needing4198* additional VF cache flushes.4199*/4200genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);42014202/* Our implementation of VK_KHR_multiview uses instancing to draw the4203* different views. 
We need to multiply instanceCount by the view count.4204*/4205if (!pipeline->use_primitive_replication)4206instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);42074208struct mi_builder b;4209mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);4210struct mi_value count =4211mi_mem32(anv_address_add(counter_buffer->address,4212counterBufferOffset));4213if (counterOffset)4214count = mi_isub(&b, count, mi_imm(counterOffset));4215count = mi_udiv32_imm(&b, count, vertexStride);4216mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);42174218mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));4219mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));4220mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));4221mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));42224223anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4224prim.IndirectParameterEnable = true;4225prim.VertexAccessType = SEQUENTIAL;4226prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4227}42284229update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);4230#endif /* GFX_VERx10 >= 75 */4231}42324233static void4234load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,4235struct anv_address addr,4236bool indexed)4237{4238struct mi_builder b;4239mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);42404241mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),4242mi_mem32(anv_address_add(addr, 0)));42434244struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));4245unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);4246if (view_count > 1) {4247#if GFX_VERx10 >= 754248instance_count = mi_imul_imm(&b, instance_count, view_count);4249#else4250anv_finishme("Multiview + indirect draw requires MI_MATH; "4251"MI_MATH is not supported on Ivy Bridge");4252#endif4253}4254mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);42554256mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),4257mi_mem32(anv_address_add(addr, 8)));42584259if (indexed) {4260mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),4261mi_mem32(anv_address_add(addr, 12)));4262mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),4263mi_mem32(anv_address_add(addr, 16)));4264} else {4265mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),4266mi_mem32(anv_address_add(addr, 12)));4267mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));4268}4269}42704271void genX(CmdDrawIndirect)(4272VkCommandBuffer commandBuffer,4273VkBuffer _buffer,4274VkDeviceSize offset,4275uint32_t drawCount,4276uint32_t stride)4277{4278ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4279ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);4280struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;4281const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);42824283if (anv_batch_has_error(&cmd_buffer->batch))4284return;42854286genX(cmd_buffer_flush_state)(cmd_buffer);42874288if (cmd_buffer->state.conditional_render_enabled)4289genX(cmd_emit_conditional_render_predicate)(cmd_buffer);42904291for (uint32_t i = 0; i < drawCount; i++) {4292struct anv_address draw = anv_address_add(buffer->address, offset);42934294if (vs_prog_data->uses_firstvertex ||4295vs_prog_data->uses_baseinstance)4296emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));4297if (vs_prog_data->uses_drawid)4298emit_draw_index(cmd_buffer, i);42994300/* Emitting draw index or vertex index BOs may result in needing4301* 
additional VF cache flushes.4302*/4303genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);43044305load_indirect_parameters(cmd_buffer, draw, false);43064307anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4308prim.IndirectParameterEnable = true;4309prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4310prim.VertexAccessType = SEQUENTIAL;4311prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4312}43134314update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);43154316offset += stride;4317}4318}43194320void genX(CmdDrawIndexedIndirect)(4321VkCommandBuffer commandBuffer,4322VkBuffer _buffer,4323VkDeviceSize offset,4324uint32_t drawCount,4325uint32_t stride)4326{4327ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4328ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);4329struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;4330const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);43314332if (anv_batch_has_error(&cmd_buffer->batch))4333return;43344335genX(cmd_buffer_flush_state)(cmd_buffer);43364337if (cmd_buffer->state.conditional_render_enabled)4338genX(cmd_emit_conditional_render_predicate)(cmd_buffer);43394340for (uint32_t i = 0; i < drawCount; i++) {4341struct anv_address draw = anv_address_add(buffer->address, offset);43424343/* TODO: We need to stomp base vertex to 0 somehow */4344if (vs_prog_data->uses_firstvertex ||4345vs_prog_data->uses_baseinstance)4346emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));4347if (vs_prog_data->uses_drawid)4348emit_draw_index(cmd_buffer, i);43494350/* Emitting draw index or vertex index BOs may result in needing4351* additional VF cache flushes.4352*/4353genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);43544355load_indirect_parameters(cmd_buffer, draw, true);43564357anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4358prim.IndirectParameterEnable = true;4359prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;4360prim.VertexAccessType = RANDOM;4361prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4362}43634364update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);43654366offset += stride;4367}4368}43694370static struct mi_value4371prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,4372struct mi_builder *b,4373struct anv_address count_address,4374const bool conditional_render_enabled)4375{4376struct mi_value ret = mi_imm(0);43774378if (conditional_render_enabled) {4379#if GFX_VERx10 >= 754380ret = mi_new_gpr(b);4381mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));4382#endif4383} else {4384/* Upload the current draw count from the draw parameters buffer to4385* MI_PREDICATE_SRC0.4386*/4387mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));4388mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));4389}43904391return ret;4392}43934394static void4395emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,4396struct mi_builder *b,4397uint32_t draw_index)4398{4399/* Upload the index of the current primitive to MI_PREDICATE_SRC1. 
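 * The MI_PREDICATE sequence below leaves the predicate TRUE only while
 * draw_index is below the draw count that prepare_for_draw_count_predicate()
 * loaded into MI_PREDICATE_SRC0.  For example, with a GPU-side count of 2
 * and maxDrawCount == 4, draws 0 and 1 render, the predicate flips to FALSE
 * at draw 2, and it stays FALSE for draw 3.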
*/4400mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));44014402if (draw_index == 0) {4403anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {4404mip.LoadOperation = LOAD_LOADINV;4405mip.CombineOperation = COMBINE_SET;4406mip.CompareOperation = COMPARE_SRCS_EQUAL;4407}4408} else {4409/* While draw_index < draw_count the predicate's result will be4410* (draw_index == draw_count) ^ TRUE = TRUE4411* When draw_index == draw_count the result is4412* (TRUE) ^ TRUE = FALSE4413* After this all results will be:4414* (FALSE) ^ FALSE = FALSE4415*/4416anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {4417mip.LoadOperation = LOAD_LOAD;4418mip.CombineOperation = COMBINE_XOR;4419mip.CompareOperation = COMPARE_SRCS_EQUAL;4420}4421}4422}44234424#if GFX_VERx10 >= 754425static void4426emit_draw_count_predicate_with_conditional_render(4427struct anv_cmd_buffer *cmd_buffer,4428struct mi_builder *b,4429uint32_t draw_index,4430struct mi_value max)4431{4432struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);4433pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));44344435#if GFX_VER >= 84436mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);4437#else4438/* MI_PREDICATE_RESULT is not whitelisted in i915 command parser4439* so we emit MI_PREDICATE to set it.4440*/44414442mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);4443mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));44444445anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {4446mip.LoadOperation = LOAD_LOADINV;4447mip.CombineOperation = COMBINE_SET;4448mip.CompareOperation = COMPARE_SRCS_EQUAL;4449}4450#endif4451}4452#endif44534454void genX(CmdDrawIndirectCount)(4455VkCommandBuffer commandBuffer,4456VkBuffer _buffer,4457VkDeviceSize offset,4458VkBuffer _countBuffer,4459VkDeviceSize countBufferOffset,4460uint32_t maxDrawCount,4461uint32_t stride)4462{4463ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4464ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);4465ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);4466struct anv_cmd_state *cmd_state = &cmd_buffer->state;4467struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;4468const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);44694470if (anv_batch_has_error(&cmd_buffer->batch))4471return;44724473genX(cmd_buffer_flush_state)(cmd_buffer);44744475struct mi_builder b;4476mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);4477struct anv_address count_address =4478anv_address_add(count_buffer->address, countBufferOffset);4479struct mi_value max =4480prepare_for_draw_count_predicate(cmd_buffer, &b, count_address,4481cmd_state->conditional_render_enabled);44824483for (uint32_t i = 0; i < maxDrawCount; i++) {4484struct anv_address draw = anv_address_add(buffer->address, offset);44854486#if GFX_VERx10 >= 754487if (cmd_state->conditional_render_enabled) {4488emit_draw_count_predicate_with_conditional_render(4489cmd_buffer, &b, i, mi_value_ref(&b, max));4490} else {4491emit_draw_count_predicate(cmd_buffer, &b, i);4492}4493#else4494emit_draw_count_predicate(cmd_buffer, &b, i);4495#endif44964497if (vs_prog_data->uses_firstvertex ||4498vs_prog_data->uses_baseinstance)4499emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));4500if (vs_prog_data->uses_drawid)4501emit_draw_index(cmd_buffer, i);45024503/* Emitting draw index or vertex index BOs may result in needing4504* additional VF cache flushes.4505*/4506genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);45074508load_indirect_parameters(cmd_buffer, draw, 
false);45094510anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4511prim.IndirectParameterEnable = true;4512prim.PredicateEnable = true;4513prim.VertexAccessType = SEQUENTIAL;4514prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4515}45164517update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);45184519offset += stride;4520}45214522mi_value_unref(&b, max);4523}45244525void genX(CmdDrawIndexedIndirectCount)(4526VkCommandBuffer commandBuffer,4527VkBuffer _buffer,4528VkDeviceSize offset,4529VkBuffer _countBuffer,4530VkDeviceSize countBufferOffset,4531uint32_t maxDrawCount,4532uint32_t stride)4533{4534ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4535ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);4536ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);4537struct anv_cmd_state *cmd_state = &cmd_buffer->state;4538struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;4539const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);45404541if (anv_batch_has_error(&cmd_buffer->batch))4542return;45434544genX(cmd_buffer_flush_state)(cmd_buffer);45454546struct mi_builder b;4547mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);4548struct anv_address count_address =4549anv_address_add(count_buffer->address, countBufferOffset);4550struct mi_value max =4551prepare_for_draw_count_predicate(cmd_buffer, &b, count_address,4552cmd_state->conditional_render_enabled);45534554for (uint32_t i = 0; i < maxDrawCount; i++) {4555struct anv_address draw = anv_address_add(buffer->address, offset);45564557#if GFX_VERx10 >= 754558if (cmd_state->conditional_render_enabled) {4559emit_draw_count_predicate_with_conditional_render(4560cmd_buffer, &b, i, mi_value_ref(&b, max));4561} else {4562emit_draw_count_predicate(cmd_buffer, &b, i);4563}4564#else4565emit_draw_count_predicate(cmd_buffer, &b, i);4566#endif45674568/* TODO: We need to stomp base vertex to 0 somehow */4569if (vs_prog_data->uses_firstvertex ||4570vs_prog_data->uses_baseinstance)4571emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));4572if (vs_prog_data->uses_drawid)4573emit_draw_index(cmd_buffer, i);45744575/* Emitting draw index or vertex index BOs may result in needing4576* additional VF cache flushes.4577*/4578genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);45794580load_indirect_parameters(cmd_buffer, draw, true);45814582anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {4583prim.IndirectParameterEnable = true;4584prim.PredicateEnable = true;4585prim.VertexAccessType = RANDOM;4586prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;4587}45884589update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);45904591offset += stride;4592}45934594mi_value_unref(&b, max);4595}45964597void genX(CmdBeginTransformFeedbackEXT)(4598VkCommandBuffer commandBuffer,4599uint32_t firstCounterBuffer,4600uint32_t counterBufferCount,4601const VkBuffer* pCounterBuffers,4602const VkDeviceSize* pCounterBufferOffsets)4603{4604ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);46054606assert(firstCounterBuffer < MAX_XFB_BUFFERS);4607assert(counterBufferCount <= MAX_XFB_BUFFERS);4608assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);46094610/* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:4611*4612* "Ssoftware must ensure that no HW stream output operations can be in4613* process or otherwise pending at the point that the MI_LOAD/STORE4614* commands are processed. 
This will likely require a pipeline flush."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "begin transform feedback");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
      /* If we have a counter buffer, this is a resume so we need to load the
       * value into the streamout offset register.  Otherwise, this is a begin
       * and we need to reset it to zero.
       */
      if (pCounterBuffers &&
          idx >= firstCounterBuffer &&
          idx - firstCounterBuffer < counterBufferCount &&
          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
         uint32_t cb_idx = idx - firstCounterBuffer;
         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
         uint64_t offset = pCounterBufferOffsets ?
                           pCounterBufferOffsets[cb_idx] : 0;

         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
            lrm.MemoryAddress = anv_address_add(counter_buffer->address,
                                                offset);
         }
      } else {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
            lri.DataDWord = 0;
         }
      }
   }

   cmd_buffer->state.xfb_enabled = true;
   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
}

void genX(CmdEndTransformFeedbackEXT)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    firstCounterBuffer,
    uint32_t                                    counterBufferCount,
    const VkBuffer*                             pCounterBuffers,
    const VkDeviceSize*                         pCounterBufferOffsets)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
   assert(counterBufferCount <= MAX_XFB_BUFFERS);
   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);

   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
    *
    *    "Software must ensure that no HW stream output operations can be in
    *    process or otherwise pending at the point that the MI_LOAD/STORE
    *    commands are processed. This will likely require a pipeline flush."
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_CS_STALL_BIT,
                             "end transform feedback");
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
      unsigned idx = firstCounterBuffer + cb_idx;

      /* If we have a counter buffer, this is a resume so we need to load the
       * value into the streamout offset register.
Otherwise, this is a begin4681* and we need to reset it to zero.4682*/4683if (pCounterBuffers &&4684cb_idx < counterBufferCount &&4685pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {4686ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);4687uint64_t offset = pCounterBufferOffsets ?4688pCounterBufferOffsets[cb_idx] : 0;46894690anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {4691srm.MemoryAddress = anv_address_add(counter_buffer->address,4692offset);4693srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;4694}4695}4696}46974698cmd_buffer->state.xfb_enabled = false;4699cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;4700}47014702void4703genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)4704{4705struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;4706struct anv_compute_pipeline *pipeline = comp_state->pipeline;47074708assert(pipeline->cs);47094710genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);47114712genX(flush_pipeline_select_gpgpu)(cmd_buffer);47134714/* Apply any pending pipeline flushes we may have. We want to apply them4715* now because, if any of those flushes are for things like push constants,4716* the GPU will read the state at weird times.4717*/4718genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);47194720if (cmd_buffer->state.compute.pipeline_dirty) {4721/* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:4722*4723* "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless4724* the only bits that are changed are scoreboard related: Scoreboard4725* Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For4726* these scoreboard related states, a MEDIA_STATE_FLUSH is4727* sufficient."4728*/4729anv_add_pending_pipe_bits(cmd_buffer,4730ANV_PIPE_CS_STALL_BIT,4731"flush compute state");4732genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);47334734anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);47354736/* The workgroup size of the pipeline affects our push constant layout4737* so flag push constants as dirty if we change the pipeline.4738*/4739cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;4740}47414742if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||4743cmd_buffer->state.compute.pipeline_dirty) {4744flush_descriptor_sets(cmd_buffer,4745&cmd_buffer->state.compute.base,4746VK_SHADER_STAGE_COMPUTE_BIT,4747&pipeline->cs, 1);4748cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;47494750#if GFX_VERx10 < 1254751uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];4752struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {4753.BindingTablePointer =4754cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,4755.SamplerStatePointer =4756cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,4757};4758GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);47594760struct anv_state state =4761anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,4762pipeline->interface_descriptor_data,4763GENX(INTERFACE_DESCRIPTOR_DATA_length),476464);47654766uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);4767anv_batch_emit(&cmd_buffer->batch,4768GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {4769mid.InterfaceDescriptorTotalLength = size;4770mid.InterfaceDescriptorDataStartAddress = state.offset;4771}4772#endif4773}47744775if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {4776comp_state->push_data 
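   /* Allocate and fill the compute push constant data.  On GFX_VERx10 >= 125
    * it is handed to the shader through COMPUTE_WALKER's indirect data; on
    * older gens it is loaded into CURBE via MEDIA_CURBE_LOAD below.
    */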
=4777anv_cmd_buffer_cs_push_constants(cmd_buffer);47784779#if GFX_VERx10 < 1254780if (comp_state->push_data.alloc_size) {4781anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {4782curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;4783curbe.CURBEDataStartAddress = comp_state->push_data.offset;4784}4785}4786#endif47874788cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;4789}47904791cmd_buffer->state.compute.pipeline_dirty = false;47924793genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);4794}47954796#if GFX_VER == 747974798static VkResult4799verify_cmd_parser(const struct anv_device *device,4800int required_version,4801const char *function)4802{4803if (device->physical->cmd_parser_version < required_version) {4804return vk_errorf(device, &device->physical->vk.base,4805VK_ERROR_FEATURE_NOT_PRESENT,4806"cmd parser version %d is required for %s",4807required_version, function);4808} else {4809return VK_SUCCESS;4810}4811}48124813#endif48144815static void4816anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,4817uint32_t baseGroupX,4818uint32_t baseGroupY,4819uint32_t baseGroupZ)4820{4821if (anv_batch_has_error(&cmd_buffer->batch))4822return;48234824struct anv_push_constants *push =4825&cmd_buffer->state.compute.base.push_constants;4826if (push->cs.base_work_group_id[0] != baseGroupX ||4827push->cs.base_work_group_id[1] != baseGroupY ||4828push->cs.base_work_group_id[2] != baseGroupZ) {4829push->cs.base_work_group_id[0] = baseGroupX;4830push->cs.base_work_group_id[1] = baseGroupY;4831push->cs.base_work_group_id[2] = baseGroupZ;48324833cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;4834}4835}48364837void genX(CmdDispatch)(4838VkCommandBuffer commandBuffer,4839uint32_t x,4840uint32_t y,4841uint32_t z)4842{4843genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);4844}48454846#if GFX_VERx10 >= 12548474848static inline void4849emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,4850const struct anv_compute_pipeline *pipeline, bool indirect,4851const struct brw_cs_prog_data *prog_data,4852uint32_t groupCountX, uint32_t groupCountY,4853uint32_t groupCountZ)4854{4855struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;4856const struct anv_shader_bin *cs_bin = pipeline->cs;4857bool predicate = cmd_buffer->state.conditional_render_enabled;48584859const struct intel_device_info *devinfo = &pipeline->base.device->info;4860const struct brw_cs_dispatch_info dispatch =4861brw_cs_get_dispatch_info(devinfo, prog_data, NULL);48624863anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {4864cw.IndirectParameterEnable = indirect;4865cw.PredicateEnable = predicate;4866cw.SIMDSize = dispatch.simd_size / 16;4867cw.IndirectDataStartAddress = comp_state->push_data.offset;4868cw.IndirectDataLength = comp_state->push_data.alloc_size;4869cw.LocalXMaximum = prog_data->local_size[0] - 1;4870cw.LocalYMaximum = prog_data->local_size[1] - 1;4871cw.LocalZMaximum = prog_data->local_size[2] - 1;4872cw.ThreadGroupIDXDimension = groupCountX;4873cw.ThreadGroupIDYDimension = groupCountY;4874cw.ThreadGroupIDZDimension = groupCountZ;4875cw.ExecutionMask = dispatch.right_mask;48764877cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {4878.KernelStartPointer = cs_bin->kernel.offset,4879.SamplerStatePointer =4880cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,4881.BindingTablePointer =4882cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,4883.BindingTableEntryCount =48841 + 
MIN2(pipeline->cs->bind_map.surface_count, 30),4885.NumberofThreadsinGPGPUThreadGroup = dispatch.threads,4886.SharedLocalMemorySize = encode_slm_size(GFX_VER,4887prog_data->base.total_shared),4888.BarrierEnable = prog_data->uses_barrier,4889};4890}4891}48924893#else /* #if GFX_VERx10 >= 125 */48944895static inline void4896emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,4897const struct anv_compute_pipeline *pipeline, bool indirect,4898const struct brw_cs_prog_data *prog_data,4899uint32_t groupCountX, uint32_t groupCountY,4900uint32_t groupCountZ)4901{4902bool predicate = (GFX_VER <= 7 && indirect) ||4903cmd_buffer->state.conditional_render_enabled;49044905const struct intel_device_info *devinfo = &pipeline->base.device->info;4906const struct brw_cs_dispatch_info dispatch =4907brw_cs_get_dispatch_info(devinfo, prog_data, NULL);49084909anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {4910ggw.IndirectParameterEnable = indirect;4911ggw.PredicateEnable = predicate;4912ggw.SIMDSize = dispatch.simd_size / 16;4913ggw.ThreadDepthCounterMaximum = 0;4914ggw.ThreadHeightCounterMaximum = 0;4915ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;4916ggw.ThreadGroupIDXDimension = groupCountX;4917ggw.ThreadGroupIDYDimension = groupCountY;4918ggw.ThreadGroupIDZDimension = groupCountZ;4919ggw.RightExecutionMask = dispatch.right_mask;4920ggw.BottomExecutionMask = 0xffffffff;4921}49224923anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);4924}49254926#endif /* #if GFX_VERx10 >= 125 */49274928static inline void4929emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,4930const struct anv_compute_pipeline *pipeline, bool indirect,4931const struct brw_cs_prog_data *prog_data,4932uint32_t groupCountX, uint32_t groupCountY,4933uint32_t groupCountZ)4934{4935#if GFX_VERx10 >= 1254936emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,4937groupCountY, groupCountZ);4938#else4939emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,4940groupCountY, groupCountZ);4941#endif4942}49434944void genX(CmdDispatchBase)(4945VkCommandBuffer commandBuffer,4946uint32_t baseGroupX,4947uint32_t baseGroupY,4948uint32_t baseGroupZ,4949uint32_t groupCountX,4950uint32_t groupCountY,4951uint32_t groupCountZ)4952{4953ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);4954struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;4955const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);49564957anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,4958baseGroupY, baseGroupZ);49594960if (anv_batch_has_error(&cmd_buffer->batch))4961return;49624963anv_measure_snapshot(cmd_buffer,4964INTEL_SNAPSHOT_COMPUTE,4965"compute",4966groupCountX * groupCountY * groupCountZ *4967prog_data->local_size[0] * prog_data->local_size[1] *4968prog_data->local_size[2]);49694970if (prog_data->uses_num_work_groups) {4971struct anv_state state =4972anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);4973uint32_t *sizes = state.map;4974sizes[0] = groupCountX;4975sizes[1] = groupCountY;4976sizes[2] = groupCountZ;4977cmd_buffer->state.compute.num_workgroups = (struct anv_address) {4978.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,4979.offset = state.offset,4980};49814982/* The num_workgroups buffer goes in the binding table */4983cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;4984}49854986genX(cmd_buffer_flush_compute_state)(cmd_buffer);49874988if 
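/* Sketch of what the num_workgroups allocation above provides, assuming the
 * compiler reads gl_NumWorkGroups through a surface that anv exposes via the
 * binding table (which is what the "goes in the binding table" comment
 * implies): for vkCmdDispatch(4, 3, 2) the 12-byte dynamic-state allocation
 * simply holds
 *
 *    uint32_t sizes[3] = { 4, 3, 2 };
 *
 * and cmd_buffer->state.compute.num_workgroups points at it.  The indirect
 * dispatch path below instead points num_workgroups directly at the
 * application's indirect buffer, so no copy is needed there.
 */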
(cmd_buffer->state.conditional_render_enabled)4989genX(cmd_emit_conditional_render_predicate)(cmd_buffer);49904991emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,4992groupCountY, groupCountZ);4993}49944995#define GPGPU_DISPATCHDIMX 0x25004996#define GPGPU_DISPATCHDIMY 0x25044997#define GPGPU_DISPATCHDIMZ 0x250849984999void genX(CmdDispatchIndirect)(5000VkCommandBuffer commandBuffer,5001VkBuffer _buffer,5002VkDeviceSize offset)5003{5004ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);5005ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);5006struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;5007const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);5008struct anv_address addr = anv_address_add(buffer->address, offset);5009UNUSED struct anv_batch *batch = &cmd_buffer->batch;50105011anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);50125013#if GFX_VER == 75014/* Linux 4.4 added command parser version 5 which allows the GPGPU5015* indirect dispatch registers to be written.5016*/5017if (verify_cmd_parser(cmd_buffer->device, 5,5018"vkCmdDispatchIndirect") != VK_SUCCESS)5019return;5020#endif50215022anv_measure_snapshot(cmd_buffer,5023INTEL_SNAPSHOT_COMPUTE,5024"compute indirect",50250);50265027if (prog_data->uses_num_work_groups) {5028cmd_buffer->state.compute.num_workgroups = addr;50295030/* The num_workgroups buffer goes in the binding table */5031cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;5032}50335034genX(cmd_buffer_flush_compute_state)(cmd_buffer);50355036struct mi_builder b;5037mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);50385039struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));5040struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));5041struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));50425043mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);5044mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);5045mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);50465047#if GFX_VER <= 75048/* predicate = (compute_dispatch_indirect_x_size == 0); */5049mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);5050mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));5051anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {5052mip.LoadOperation = LOAD_LOAD;5053mip.CombineOperation = COMBINE_SET;5054mip.CompareOperation = COMPARE_SRCS_EQUAL;5055}50565057/* predicate |= (compute_dispatch_indirect_y_size == 0); */5058mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);5059anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {5060mip.LoadOperation = LOAD_LOAD;5061mip.CombineOperation = COMBINE_OR;5062mip.CompareOperation = COMPARE_SRCS_EQUAL;5063}50645065/* predicate |= (compute_dispatch_indirect_z_size == 0); */5066mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);5067anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {5068mip.LoadOperation = LOAD_LOAD;5069mip.CombineOperation = COMBINE_OR;5070mip.CompareOperation = COMPARE_SRCS_EQUAL;5071}50725073/* predicate = !predicate; */5074anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {5075mip.LoadOperation = LOAD_LOADINV;5076mip.CombineOperation = COMBINE_OR;5077mip.CompareOperation = COMPARE_FALSE;5078}50795080#if GFX_VERx10 == 755081if (cmd_buffer->state.conditional_render_enabled) {5082/* predicate &= !(conditional_rendering_predicate == 0); */5083mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),5084mi_reg32(ANV_PREDICATE_RESULT_REG));5085anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {5086mip.LoadOperation = LOAD_LOADINV;5087mip.CombineOperation = 
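/* Summary of the predicate math built above (informal, restating the
 * in-line comments): each MI_PREDICATE accumulates into the internal
 * predicate state, so after the three SRCS_EQUAL loads and the final
 * LOADINV the hardware has effectively computed
 *
 *    predicate = !(group_count_x == 0 ||
 *                  group_count_y == 0 ||
 *                  group_count_z == 0);
 *
 * i.e. the GPGPU_WALKER only runs when every indirect dimension is
 * non-zero.  The Haswell-only MI_PREDICATE being emitted here then ANDs in
 * the conditional-rendering result so VK_EXT_conditional_rendering also
 * applies to indirect dispatches.
 */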
COMBINE_AND;5088mip.CompareOperation = COMPARE_SRCS_EQUAL;5089}5090}5091#endif50925093#else /* GFX_VER > 7 */5094if (cmd_buffer->state.conditional_render_enabled)5095genX(cmd_emit_conditional_render_predicate)(cmd_buffer);5096#endif50975098emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);5099}51005101#if GFX_VERx10 >= 1255102static void5103calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])5104{5105unsigned total_shift = 0;5106memset(local_shift, 0, 3);51075108bool progress;5109do {5110progress = false;5111for (unsigned i = 0; i < 3; i++) {5112assert(global[i] > 0);5113if ((1 << local_shift[i]) < global[i]) {5114progress = true;5115local_shift[i]++;5116total_shift++;5117}51185119if (total_shift == 3)5120return;5121}5122} while(progress);51235124/* Assign whatever's left to x */5125local_shift[0] += 3 - total_shift;5126}51275128static struct GFX_RT_SHADER_TABLE5129vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)5130{5131return (struct GFX_RT_SHADER_TABLE) {5132.BaseAddress = anv_address_from_u64(region->deviceAddress),5133.Stride = region->stride,5134};5135}51365137static void5138cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,5139const VkStridedDeviceAddressRegionKHR *raygen_sbt,5140const VkStridedDeviceAddressRegionKHR *miss_sbt,5141const VkStridedDeviceAddressRegionKHR *hit_sbt,5142const VkStridedDeviceAddressRegionKHR *callable_sbt,5143bool is_indirect,5144uint32_t launch_width,5145uint32_t launch_height,5146uint32_t launch_depth,5147uint64_t launch_size_addr)5148{5149struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;5150struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;51515152if (anv_batch_has_error(&cmd_buffer->batch))5153return;51545155/* If we have a known degenerate launch size, just bail */5156if (!is_indirect &&5157(launch_width == 0 || launch_height == 0 || launch_depth == 0))5158return;51595160genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);5161genX(flush_pipeline_select_gpgpu)(cmd_buffer);51625163cmd_buffer->state.rt.pipeline_dirty = false;51645165genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);51665167/* Add these to the reloc list as they're internal buffers that don't5168* actually have relocs to pick them up manually.5169*5170* TODO(RT): This is a bit of a hack5171*/5172anv_reloc_list_add_bo(cmd_buffer->batch.relocs,5173cmd_buffer->batch.alloc,5174rt->scratch.bo);51755176/* Allocate and set up our RT_DISPATCH_GLOBALS */5177struct anv_state rtdg_state =5178anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,5179BRW_RT_PUSH_CONST_OFFSET +5180sizeof(struct anv_push_constants),518164);51825183struct GFX_RT_DISPATCH_GLOBALS rtdg = {5184.MemBaseAddress = (struct anv_address) {5185.bo = rt->scratch.bo,5186.offset = rt->scratch.layout.ray_stack_start,5187},5188.CallStackHandler =5189anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),5190.AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,5191.NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,5192.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,5193.Flags = RT_DEPTH_TEST_LESS_EQUAL,5194.HitGroupTable = vk_sdar_to_shader_table(hit_sbt),5195.MissGroupTable = vk_sdar_to_shader_table(miss_sbt),5196.SWStackSize = rt->scratch.layout.sw_stack_size / 64,5197.LaunchWidth = launch_width,5198.LaunchHeight = launch_height,5199.LaunchDepth = launch_depth,5200.CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),5201};5202GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);52035204/* Push constants go after the 
RT_DISPATCH_GLOBALS */
   assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr = {
      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
      .offset = rtdg_state.offset,
   };

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (is_indirect) {
      /* Pick a local size that's probably ok.  We assume most TraceRays
       * calls will use a two-dimensional dispatch size.  Worst case, our
       * initial dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS
       *
       * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
       * moved into a genX version.
       */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking
          * the bottom 32, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                  mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                        local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
   } else {
      uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
      calc_local_trace_size(local_size_log2, launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because the addition
          * DIV_ROUND_UP performs on the numerator may overflow.
Cast to uint64_t to avoid this.5275*/5276uint32_t local_size = 1 << local_size_log2[i];5277global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);5278}5279}52805281anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {5282cw.IndirectParameterEnable = is_indirect;5283cw.PredicateEnable = false;5284cw.SIMDSize = SIMD8;5285cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;5286cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;5287cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;5288cw.ThreadGroupIDXDimension = global_size[0];5289cw.ThreadGroupIDYDimension = global_size[1];5290cw.ThreadGroupIDZDimension = global_size[2];5291cw.ExecutionMask = 0xff;5292cw.EmitInlineParameter = true;52935294const gl_shader_stage s = MESA_SHADER_RAYGEN;5295struct anv_device *device = cmd_buffer->device;5296struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];5297struct anv_state *samplers = &cmd_buffer->state.samplers[s];5298cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {5299.KernelStartPointer = device->rt_trampoline->kernel.offset,5300.SamplerStatePointer = samplers->offset,5301/* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */5302.SamplerCount = 0,5303.BindingTablePointer = surfaces->offset,5304.NumberofThreadsinGPGPUThreadGroup = 1,5305.BTDMode = true,5306};53075308struct brw_rt_raygen_trampoline_params trampoline_params = {5309.rt_disp_globals_addr = anv_address_physical(rtdg_addr),5310.raygen_bsr_addr = raygen_sbt->deviceAddress,5311.is_indirect = is_indirect,5312.local_group_size_log2 = {5313local_size_log2[0],5314local_size_log2[1],5315local_size_log2[2],5316},5317};5318STATIC_ASSERT(sizeof(trampoline_params) == 32);5319memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));5320}5321}53225323void5324genX(CmdTraceRaysKHR)(5325VkCommandBuffer commandBuffer,5326const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,5327const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,5328const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,5329const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,5330uint32_t width,5331uint32_t height,5332uint32_t depth)5333{5334ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);53355336cmd_buffer_trace_rays(cmd_buffer,5337pRaygenShaderBindingTable,5338pMissShaderBindingTable,5339pHitShaderBindingTable,5340pCallableShaderBindingTable,5341false /* is_indirect */,5342width, height, depth,53430 /* launch_size_addr */);5344}53455346void5347genX(CmdTraceRaysIndirectKHR)(5348VkCommandBuffer commandBuffer,5349const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,5350const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,5351const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,5352const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,5353VkDeviceAddress indirectDeviceAddress)5354{5355ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);53565357cmd_buffer_trace_rays(cmd_buffer,5358pRaygenShaderBindingTable,5359pMissShaderBindingTable,5360pHitShaderBindingTable,5361pCallableShaderBindingTable,5362true /* is_indirect */,53630, 0, 0, /* width, height, depth, */5364indirectDeviceAddress);5365}5366#endif /* GFX_VERx10 >= 125 */53675368static void5369genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,5370uint32_t pipeline)5371{5372UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;53735374if (cmd_buffer->state.current_pipeline == pipeline)5375return;53765377#if GFX_VER >= 8 && GFX_VER < 
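/* Worked example (illustrative only) for the launch-size handling in
 * cmd_buffer_trace_rays() above: a direct vkCmdTraceRaysKHR() launch of
 * 1920x1080x1 run through calc_local_trace_size() spreads the three bits
 * of the 8-invocation local group round-robin over the non-trivial
 * dimensions:
 *
 *    local_size_log2[] = { 2, 1, 0 }            -> 4x2x1 local group
 *    global_size[]     = { DIV_ROUND_UP(1920, 4),
 *                          DIV_ROUND_UP(1080, 2),
 *                          1 }                  -> 480x540x1 groups
 *
 * The indirect path performs the same rounding on the command streamer
 * with mi_iadd() + mi_ushr32_imm(), e.g. (1920 + 3) >> 2 = 480.
 */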
105378/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:5379*5380* Software must clear the COLOR_CALC_STATE Valid field in5381* 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT5382* with Pipeline Select set to GPGPU.5383*5384* The internal hardware docs recommend the same workaround for Gfx95385* hardware too.5386*/5387if (pipeline == GPGPU)5388anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);5389#endif53905391#if GFX_VER == 95392if (pipeline == _3D) {5393/* There is a mid-object preemption workaround which requires you to5394* re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,5395* even without preemption, we have issues with geometry flickering when5396* GPGPU and 3D are back-to-back and this seems to fix it. We don't5397* really know why.5398*/5399const uint32_t subslices =5400MAX2(cmd_buffer->device->physical->subslice_total, 1);5401anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {5402vfe.MaximumNumberofThreads =5403devinfo->max_cs_threads * subslices - 1;5404vfe.NumberofURBEntries = 2;5405vfe.URBEntryAllocationSize = 2;5406}54075408/* We just emitted a dummy MEDIA_VFE_STATE so now that packet is5409* invalid. Set the compute pipeline to dirty to force a re-emit of the5410* pipeline in case we get back-to-back dispatch calls with the same5411* pipeline and a PIPELINE_SELECT in between.5412*/5413cmd_buffer->state.compute.pipeline_dirty = true;5414}5415#endif54165417/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]5418* PIPELINE_SELECT [DevBWR+]":5419*5420* Project: DEVSNB+5421*5422* Software must ensure all the write caches are flushed through a5423* stalling PIPE_CONTROL command followed by another PIPE_CONTROL5424* command to invalidate read only caches prior to programming5425* MI_PIPELINE_SELECT command to change the Pipeline Select Mode.5426*/5427anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {5428pc.RenderTargetCacheFlushEnable = true;5429pc.DepthCacheFlushEnable = true;5430#if GFX_VER >= 125431pc.HDCPipelineFlushEnable = true;5432#else5433pc.DCFlushEnable = true;5434#endif5435pc.PostSyncOperation = NoWrite;5436pc.CommandStreamerStallEnable = true;5437#if GFX_VER >= 125438/* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be5439* set with any PIPE_CONTROL with Depth Flush Enable bit set.5440*/5441pc.DepthStallEnable = true;5442#endif5443anv_debug_dump_pc(pc);5444}54455446anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {5447pc.TextureCacheInvalidationEnable = true;5448pc.ConstantCacheInvalidationEnable = true;5449pc.StateCacheInvalidationEnable = true;5450pc.InstructionCacheInvalidateEnable = true;5451pc.PostSyncOperation = NoWrite;5452anv_debug_dump_pc(pc);5453}54545455anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {5456#if GFX_VER >= 95457ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;5458ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;5459#endif5460ps.PipelineSelection = pipeline;5461}54625463#if GFX_VER == 95464if (devinfo->is_geminilake) {5465/* Project: DevGLK5466*5467* "This chicken bit works around a hardware issue with barrier logic5468* encountered when switching between GPGPU and 3D pipelines. To5469* workaround the issue, this mode bit should be set after a pipeline5470* is selected."5471*/5472anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {5473scec1.GLKBarrierMode = pipeline == GPGPU ? 
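/* Informal recap of the PIPELINE_SELECT sequence above: per the quoted
 * bspec text, the switch is bracketed by two PIPE_CONTROLs -- a stalling
 * flush of the write caches (render target, depth, data/HDC) followed by a
 * separate invalidation of the read-only caches (texture, constant, state,
 * instruction).  The MaskBits field is assumed here to act as a
 * write-enable mask for the low bits of the packet, i.e.
 *
 *    ps.MaskBits = 0x3;    // let the 2-bit PipelineSelection field stick
 *    ps.MaskBits = 0x13;   // gfx12: additionally write the media sampler
 *                          // DOP clock-gate bit
 *
 * so fields whose mask bit is clear keep their previous value.
 */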
GLK_BARRIER_MODE_GPGPU5474: GLK_BARRIER_MODE_3D_HULL;5475scec1.GLKBarrierModeMask = 1;5476}5477}5478#endif54795480cmd_buffer->state.current_pipeline = pipeline;5481}54825483void5484genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)5485{5486genX(flush_pipeline_select)(cmd_buffer, _3D);5487}54885489void5490genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)5491{5492genX(flush_pipeline_select)(cmd_buffer, GPGPU);5493}54945495void5496genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)5497{5498if (GFX_VER >= 8)5499return;55005501/* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:5502*5503* "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any5504* combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,5505* 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first5506* issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit5507* set), followed by a pipelined depth cache flush (PIPE_CONTROL with5508* Depth Flush Bit set, followed by another pipelined depth stall5509* (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise5510* guarantee that the pipeline from WM onwards is already flushed (e.g.,5511* via a preceding MI_FLUSH)."5512*/5513anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {5514pipe.DepthStallEnable = true;5515anv_debug_dump_pc(pipe);5516}5517anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {5518pipe.DepthCacheFlushEnable = true;5519#if GFX_VER >= 125520pipe.TileCacheFlushEnable = true;5521#endif5522anv_debug_dump_pc(pipe);5523}5524anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {5525pipe.DepthStallEnable = true;5526anv_debug_dump_pc(pipe);5527}5528}55295530/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:5531*5532* "The VF cache needs to be invalidated before binding and then using5533* Vertex Buffers that overlap with any previously bound Vertex Buffer5534* (at a 64B granularity) since the last invalidation. A VF cache5535* invalidate is performed by setting the "VF Cache Invalidation Enable"5536* bit in PIPE_CONTROL."5537*5538* This is implemented by carefully tracking all vertex and index buffer5539* bindings and flushing if the cache ever ends up with a range in the cache5540* that would exceed 4 GiB. This is implemented in three parts:5541*5542* 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called5543* every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the5544* tracking code of the new binding. If this new binding would cause5545* the cache to have a too-large range on the next draw call, a pipeline5546* stall and VF cache invalidate are added to pending_pipeline_bits.5547*5548* 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to5549* empty whenever we emit a VF invalidate.5550*5551* 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called5552* after every 3DPRIMITIVE and copies the bound range into the dirty5553* range for each used buffer. This has to be a separate step because5554* we don't always re-bind all buffers and so 1. 
can't know which5555* buffers are actually bound.5556*/5557void5558genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,5559int vb_index,5560struct anv_address vb_address,5561uint32_t vb_size)5562{5563if (GFX_VER < 8 || GFX_VER > 9 ||5564!anv_use_softpin(cmd_buffer->device->physical))5565return;55665567struct anv_vb_cache_range *bound, *dirty;5568if (vb_index == -1) {5569bound = &cmd_buffer->state.gfx.ib_bound_range;5570dirty = &cmd_buffer->state.gfx.ib_dirty_range;5571} else {5572assert(vb_index >= 0);5573assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));5574assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));5575bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];5576dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];5577}55785579if (vb_size == 0) {5580bound->start = 0;5581bound->end = 0;5582return;5583}55845585assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));5586bound->start = intel_48b_address(anv_address_physical(vb_address));5587bound->end = bound->start + vb_size;5588assert(bound->end > bound->start); /* No overflow */55895590/* Align everything to a cache line */5591bound->start &= ~(64ull - 1ull);5592bound->end = align_u64(bound->end, 64);55935594/* Compute the dirty range */5595dirty->start = MIN2(dirty->start, bound->start);5596dirty->end = MAX2(dirty->end, bound->end);55975598/* If our range is larger than 32 bits, we have to flush */5599assert(bound->end - bound->start <= (1ull << 32));5600if (dirty->end - dirty->start > (1ull << 32)) {5601anv_add_pending_pipe_bits(cmd_buffer,5602ANV_PIPE_CS_STALL_BIT |5603ANV_PIPE_VF_CACHE_INVALIDATE_BIT,5604"vb > 32b range");5605}5606}56075608void5609genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,5610uint32_t access_type,5611uint64_t vb_used)5612{5613if (GFX_VER < 8 || GFX_VER > 9 ||5614!anv_use_softpin(cmd_buffer->device->physical))5615return;56165617if (access_type == RANDOM) {5618/* We have an index buffer */5619struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;5620struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;56215622if (bound->end > bound->start) {5623dirty->start = MIN2(dirty->start, bound->start);5624dirty->end = MAX2(dirty->end, bound->end);5625}5626}56275628uint64_t mask = vb_used;5629while (mask) {5630int i = u_bit_scan64(&mask);5631assert(i >= 0);5632assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));5633assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));56345635struct anv_vb_cache_range *bound, *dirty;5636bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];5637dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];56385639if (bound->end > bound->start) {5640dirty->start = MIN2(dirty->start, bound->start);5641dirty->end = MAX2(dirty->end, bound->end);5642}5643}5644}56455646/**5647* Update the pixel hashing modes that determine the balancing of PS threads5648* across subslices and slices.5649*5650* \param width Width bound of the rendering area (already scaled down if \p5651* scale is greater than 1).5652* \param height Height bound of the rendering area (already scaled down if \p5653* scale is greater than 1).5654* \param scale The number of framebuffer samples that could potentially be5655* affected by an individual channel of the PS thread. 
This is5656* typically one for single-sampled rendering, but for operations5657* like CCS resolves and fast clears a single PS invocation may5658* update a huge number of pixels, in which case a finer5659* balancing is desirable in order to maximally utilize the5660* bandwidth available. UINT_MAX can be used as shorthand for5661* "finest hashing mode available".5662*/5663void5664genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,5665unsigned width, unsigned height,5666unsigned scale)5667{5668#if GFX_VER == 95669const struct intel_device_info *devinfo = &cmd_buffer->device->info;5670const unsigned slice_hashing[] = {5671/* Because all Gfx9 platforms with more than one slice require5672* three-way subslice hashing, a single "normal" 16x16 slice hashing5673* block is guaranteed to suffer from substantial imbalance, with one5674* subslice receiving twice as much work as the other two in the5675* slice.5676*5677* The performance impact of that would be particularly severe when5678* three-way hashing is also in use for slice balancing (which is the5679* case for all Gfx9 GT4 platforms), because one of the slices5680* receives one every three 16x16 blocks in either direction, which5681* is roughly the periodicity of the underlying subslice imbalance5682* pattern ("roughly" because in reality the hardware's5683* implementation of three-way hashing doesn't do exact modulo 35684* arithmetic, which somewhat decreases the magnitude of this effect5685* in practice). This leads to a systematic subslice imbalance5686* within that slice regardless of the size of the primitive. The5687* 32x32 hashing mode guarantees that the subslice imbalance within a5688* single slice hashing block is minimal, largely eliminating this5689* effect.5690*/5691_32x32,5692/* Finest slice hashing mode available. */5693NORMAL5694};5695const unsigned subslice_hashing[] = {5696/* 16x16 would provide a slight cache locality benefit especially5697* visible in the sampler L1 cache efficiency of low-bandwidth5698* non-LLC platforms, but it comes at the cost of greater subslice5699* imbalance for primitives of dimensions approximately intermediate5700* between 16x4 and 16x16.5701*/5702_16x4,5703/* Finest subslice hashing mode available. */5704_8x45705};5706/* Dimensions of the smallest hashing block of a given hashing mode. If5707* the rendering area is smaller than this there can't possibly be any5708* benefit from switching to this mode, so we optimize out the5709* transition.5710*/5711const unsigned min_size[][2] = {5712{ 16, 4 },5713{ 8, 4 }5714};5715const unsigned idx = scale > 1;57165717if (cmd_buffer->state.current_hash_scale != scale &&5718(width > min_size[idx][0] || height > min_size[idx][1])) {5719anv_add_pending_pipe_bits(cmd_buffer,5720ANV_PIPE_CS_STALL_BIT |5721ANV_PIPE_STALL_AT_SCOREBOARD_BIT,5722"change pixel hash mode");5723genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);57245725anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {5726gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);5727gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);5728gt.SubsliceHashing = subslice_hashing[idx];5729gt.SubsliceHashingMask = -1;5730}57315732cmd_buffer->state.current_hash_scale = scale;5733}5734#endif5735}57365737static void5738cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)5739{5740struct anv_device *device = cmd_buffer->device;5741const struct anv_image_view *iview =5742anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);5743const struct anv_image *image = iview ? 
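/* Example of how the tables above get used (illustrative): a CCS resolve
 * or fast clear passes scale == UINT_MAX, so idx == 1 and, on a
 * multi-slice Gfx9 part, the GT_MODE write selects the finest modes:
 *
 *    gt.SliceHashing    = NORMAL;   // slice_hashing[1]
 *    gt.SubsliceHashing = _8x4;     // subslice_hashing[1]
 *
 * while ordinary single-sampled rendering (scale == 1) selects the coarser
 * 32x32 / 16x4 pair.  The reprogramming is skipped when the render area is
 * no larger than min_size[idx], since rebalancing cannot help there, and
 * it costs a CS stall plus a pixel-scoreboard stall every time it happens.
 */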
iview->image : NULL;57445745/* FIXME: Width and Height are wrong */57465747genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);57485749uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,5750device->isl_dev.ds.size / 4);5751if (dw == NULL)5752return;57535754struct isl_depth_stencil_hiz_emit_info info = { };57555756if (iview)5757info.view = &iview->planes[0].isl;57585759if (image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {5760uint32_t depth_plane =5761anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_DEPTH_BIT);5762const struct anv_surface *depth_surface =5763&image->planes[depth_plane].primary_surface;5764const struct anv_address depth_address =5765anv_image_address(image, &depth_surface->memory_range);57665767info.depth_surf = &depth_surface->isl;57685769info.depth_address =5770anv_batch_emit_reloc(&cmd_buffer->batch,5771dw + device->isl_dev.ds.depth_offset / 4,5772depth_address.bo, depth_address.offset);5773info.mocs =5774anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);57755776const uint32_t ds =5777cmd_buffer->state.subpass->depth_stencil_attachment->attachment;5778info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;5779if (info.hiz_usage != ISL_AUX_USAGE_NONE) {5780assert(isl_aux_usage_has_hiz(info.hiz_usage));57815782const struct anv_surface *hiz_surface =5783&image->planes[depth_plane].aux_surface;5784const struct anv_address hiz_address =5785anv_image_address(image, &hiz_surface->memory_range);57865787info.hiz_surf = &hiz_surface->isl;57885789info.hiz_address =5790anv_batch_emit_reloc(&cmd_buffer->batch,5791dw + device->isl_dev.ds.hiz_offset / 4,5792hiz_address.bo, hiz_address.offset);57935794info.depth_clear_value = ANV_HZ_FC_VAL;5795}5796}57975798if (image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {5799uint32_t stencil_plane =5800anv_image_aspect_to_plane(image->aspects, VK_IMAGE_ASPECT_STENCIL_BIT);5801const struct anv_surface *stencil_surface =5802&image->planes[stencil_plane].primary_surface;5803const struct anv_address stencil_address =5804anv_image_address(image, &stencil_surface->memory_range);58055806info.stencil_surf = &stencil_surface->isl;58075808info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;5809info.stencil_address =5810anv_batch_emit_reloc(&cmd_buffer->batch,5811dw + device->isl_dev.ds.stencil_offset / 4,5812stencil_address.bo, stencil_address.offset);5813info.mocs =5814anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);5815}58165817isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);58185819if (GFX_VER >= 12) {5820cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;5821genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);58225823/* Wa_14082245815824*5825* Workaround: Gfx12LP Astep only An additional pipe control with5826* post-sync = store dword operation would be required.( w/a is to5827* have an additional pipe control after the stencil state whenever5828* the surface state bits of this state is changing).5829*/5830anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {5831pc.PostSyncOperation = WriteImmediateData;5832pc.Address = cmd_buffer->device->workaround_address;5833}5834}5835cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);5836}58375838/**5839* This ANDs the view mask of the current subpass with the pending clear5840* views in the attachment to get the mask of views active in the subpass5841* that still need to be cleared.5842*/5843static inline uint32_t5844get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,5845const 
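/* Note on the Wa_1408224581 PIPE_CONTROL above (a sketch of intent, not
 * spec wording): the post-sync write-immediate needs a destination but the
 * value is never read back, so the driver points it at a small scratch BO
 * kept around for exactly this kind of dummy write:
 *
 *    pc.PostSyncOperation = WriteImmediateData;
 *    pc.Address           = cmd_buffer->device->workaround_address;
 *
 * Any other workaround that needs a throw-away post-sync operation is
 * assumed to reuse the same address.
 */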
struct anv_attachment_state *att_state)
{
   return cmd_state->subpass->view_mask & att_state->pending_clear_views;
}

static inline bool
do_first_layer_clear(const struct anv_cmd_state *cmd_state,
                     const struct anv_attachment_state *att_state)
{
   if (!cmd_state->subpass->view_mask)
      return true;

   uint32_t pending_clear_mask =
      get_multiview_subpass_clear_mask(cmd_state, att_state);

   return pending_clear_mask & 1;
}

static inline bool
current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
                                       uint32_t att_idx)
{
   const uint32_t last_subpass_idx =
      cmd_state->pass->attachments[att_idx].last_subpass_idx;
   const struct anv_subpass *last_subpass =
      &cmd_state->pass->subpasses[last_subpass_idx];
   return last_subpass == cmd_state->subpass;
}

static void
cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
                         uint32_t subpass_id)
{
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_render_pass *pass = cmd_state->pass;
   struct anv_subpass *subpass = &pass->subpasses[subpass_id];
   cmd_state->subpass = subpass;

   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;

   /* Our implementation of VK_KHR_multiview uses instancing to draw the
    * different views.  If the client asks for instancing, we need to use the
    * Instance Data Step Rate to ensure that we repeat the client's
    * per-instance data once for each view.  Since this bit is in
    * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
    * of each subpass.
    */
   if (GFX_VER == 7)
      cmd_buffer->state.gfx.vb_dirty |= ~0;

   /* It is possible to start a render pass with an old pipeline.  Because the
    * render pass and subpass index are both baked into the pipeline, this is
    * highly unlikely.  In order to do so, it requires that you have a render
    * pass with a single subpass and that you use that render pass twice
    * back-to-back and use the same pipeline at the start of the second render
    * pass as at the end of the first.
In order to avoid unpredictable issues5901* with this edge case, we just dirty the pipeline at the start of every5902* subpass.5903*/5904cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;59055906/* Accumulate any subpass flushes that need to happen before the subpass */5907anv_add_pending_pipe_bits(cmd_buffer,5908cmd_buffer->state.pass->subpass_flushes[subpass_id],5909"begin subpass deps/attachments");59105911VkRect2D render_area = cmd_buffer->state.render_area;5912struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;59135914bool is_multiview = subpass->view_mask != 0;59155916for (uint32_t i = 0; i < subpass->attachment_count; ++i) {5917const uint32_t a = subpass->attachments[i].attachment;5918if (a == VK_ATTACHMENT_UNUSED)5919continue;59205921assert(a < cmd_state->pass->attachment_count);5922struct anv_attachment_state *att_state = &cmd_state->attachments[a];59235924struct anv_image_view *iview = cmd_state->attachments[a].image_view;5925const struct anv_image *image = iview->image;59265927VkImageLayout target_layout = subpass->attachments[i].layout;5928VkImageLayout target_stencil_layout =5929subpass->attachments[i].stencil_layout;59305931uint32_t level = iview->planes[0].isl.base_level;5932uint32_t width = anv_minify(iview->image->extent.width, level);5933uint32_t height = anv_minify(iview->image->extent.height, level);5934bool full_surface_draw =5935render_area.offset.x == 0 && render_area.offset.y == 0 &&5936render_area.extent.width == width &&5937render_area.extent.height == height;59385939uint32_t base_layer, layer_count;5940if (image->type == VK_IMAGE_TYPE_3D) {5941base_layer = 0;5942layer_count = anv_minify(iview->image->extent.depth, level);5943} else {5944base_layer = iview->planes[0].isl.base_array_layer;5945layer_count = fb->layers;5946}59475948if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {5949bool will_full_fast_clear =5950(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) &&5951att_state->fast_clear && full_surface_draw;59525953assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);5954transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,5955level, 1, base_layer, layer_count,5956att_state->current_layout, target_layout,5957VK_QUEUE_FAMILY_IGNORED,5958VK_QUEUE_FAMILY_IGNORED,5959will_full_fast_clear);5960att_state->aux_usage =5961anv_layout_to_aux_usage(&cmd_buffer->device->info, image,5962VK_IMAGE_ASPECT_COLOR_BIT,5963VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,5964target_layout);5965}59665967if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {5968bool will_full_fast_clear =5969(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&5970att_state->fast_clear && full_surface_draw;59715972transition_depth_buffer(cmd_buffer, image,5973base_layer, layer_count,5974att_state->current_layout, target_layout,5975will_full_fast_clear);5976att_state->aux_usage =5977anv_layout_to_aux_usage(&cmd_buffer->device->info, image,5978VK_IMAGE_ASPECT_DEPTH_BIT,5979VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,5980target_layout);5981}59825983if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {5984bool will_full_fast_clear =5985(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&5986att_state->fast_clear && full_surface_draw;59875988transition_stencil_buffer(cmd_buffer, image,5989level, 1, base_layer, layer_count,5990att_state->current_stencil_layout,5991target_stencil_layout,5992will_full_fast_clear);5993}5994att_state->current_layout = target_layout;5995att_state->current_stencil_layout = target_stencil_layout;59965997if 
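/* Quick illustration of the layer bookkeeping above (restating the code,
 * plus one assumption about intent): for a 3D attachment the "layers" are
 * really depth slices, so at mip level `level` the transition covers
 *
 *    base_layer  = 0;
 *    layer_count = anv_minify(image->extent.depth, level);
 *
 * e.g. a 64-deep 3D image viewed at level 2 gives layer_count == 16, while
 * a 2D array attachment uses the view's base layer plus fb->layers.
 * will_full_fast_clear is only true when the render area covers the whole
 * miplevel; presumably this lets the layout transition skip resolves whose
 * results the upcoming fast clear would overwrite anyway.
 */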
(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {5998assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);59996000/* Multi-planar images are not supported as attachments */6001assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);6002assert(image->n_planes == 1);60036004uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer;6005uint32_t clear_layer_count = fb->layers;60066007if (att_state->fast_clear &&6008do_first_layer_clear(cmd_state, att_state)) {6009/* We only support fast-clears on the first layer */6010assert(level == 0 && base_layer == 0);60116012union isl_color_value clear_color = {};6013anv_clear_color_from_att_state(&clear_color, att_state, iview);6014if (iview->image->samples == 1) {6015anv_image_ccs_op(cmd_buffer, image,6016iview->planes[0].isl.format,6017iview->planes[0].isl.swizzle,6018VK_IMAGE_ASPECT_COLOR_BIT,60190, 0, 1, ISL_AUX_OP_FAST_CLEAR,6020&clear_color,6021false);6022} else {6023anv_image_mcs_op(cmd_buffer, image,6024iview->planes[0].isl.format,6025iview->planes[0].isl.swizzle,6026VK_IMAGE_ASPECT_COLOR_BIT,60270, 1, ISL_AUX_OP_FAST_CLEAR,6028&clear_color,6029false);6030}6031base_clear_layer++;6032clear_layer_count--;6033if (is_multiview)6034att_state->pending_clear_views &= ~1;60356036if (isl_color_value_is_zero(clear_color,6037iview->planes[0].isl.format)) {6038/* This image has the auxiliary buffer enabled. We can mark the6039* subresource as not needing a resolve because the clear color6040* will match what's in every RENDER_SURFACE_STATE object when6041* it's being used for sampling.6042*/6043set_image_fast_clear_state(cmd_buffer, iview->image,6044VK_IMAGE_ASPECT_COLOR_BIT,6045ANV_FAST_CLEAR_DEFAULT_VALUE);6046} else {6047set_image_fast_clear_state(cmd_buffer, iview->image,6048VK_IMAGE_ASPECT_COLOR_BIT,6049ANV_FAST_CLEAR_ANY);6050}6051}60526053/* From the VkFramebufferCreateInfo spec:6054*6055* "If the render pass uses multiview, then layers must be one and each6056* attachment requires a number of layers that is greater than the6057* maximum bit index set in the view mask in the subpasses in which it6058* is used."6059*6060* So if multiview is active we ignore the number of layers in the6061* framebuffer and instead we honor the view mask from the subpass.6062*/6063if (is_multiview) {6064assert(image->n_planes == 1);6065uint32_t pending_clear_mask =6066get_multiview_subpass_clear_mask(cmd_state, att_state);60676068u_foreach_bit(layer_idx, pending_clear_mask) {6069uint32_t layer =6070iview->planes[0].isl.base_array_layer + layer_idx;60716072anv_image_clear_color(cmd_buffer, image,6073VK_IMAGE_ASPECT_COLOR_BIT,6074att_state->aux_usage,6075iview->planes[0].isl.format,6076iview->planes[0].isl.swizzle,6077level, layer, 1,6078render_area,6079vk_to_isl_color(att_state->clear_value.color));6080}60816082att_state->pending_clear_views &= ~pending_clear_mask;6083} else if (clear_layer_count > 0) {6084assert(image->n_planes == 1);6085anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,6086att_state->aux_usage,6087iview->planes[0].isl.format,6088iview->planes[0].isl.swizzle,6089level, base_clear_layer, clear_layer_count,6090render_area,6091vk_to_isl_color(att_state->clear_value.color));6092}6093} else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |6094VK_IMAGE_ASPECT_STENCIL_BIT)) {6095if (att_state->fast_clear &&6096(att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {6097/* We currently only support HiZ for single-LOD images 
*/6098assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage));6099assert(iview->planes[0].isl.base_level == 0);6100assert(iview->planes[0].isl.levels == 1);6101}61026103if (is_multiview) {6104uint32_t pending_clear_mask =6105get_multiview_subpass_clear_mask(cmd_state, att_state);61066107u_foreach_bit(layer_idx, pending_clear_mask) {6108uint32_t layer =6109iview->planes[0].isl.base_array_layer + layer_idx;61106111if (att_state->fast_clear) {6112anv_image_hiz_clear(cmd_buffer, image,6113att_state->pending_clear_aspects,6114level, layer, 1, render_area,6115att_state->clear_value.depthStencil.stencil);6116} else {6117anv_image_clear_depth_stencil(cmd_buffer, image,6118att_state->pending_clear_aspects,6119att_state->aux_usage,6120level, layer, 1, render_area,6121att_state->clear_value.depthStencil.depth,6122att_state->clear_value.depthStencil.stencil);6123}6124}61256126att_state->pending_clear_views &= ~pending_clear_mask;6127} else {6128if (att_state->fast_clear) {6129anv_image_hiz_clear(cmd_buffer, image,6130att_state->pending_clear_aspects,6131level, base_layer, layer_count,6132render_area,6133att_state->clear_value.depthStencil.stencil);6134} else {6135anv_image_clear_depth_stencil(cmd_buffer, image,6136att_state->pending_clear_aspects,6137att_state->aux_usage,6138level, base_layer, layer_count,6139render_area,6140att_state->clear_value.depthStencil.depth,6141att_state->clear_value.depthStencil.stencil);6142}6143}6144} else {6145assert(att_state->pending_clear_aspects == 0);6146}61476148/* If multiview is enabled, then we are only done clearing when we no6149* longer have pending layers to clear, or when we have processed the6150* last subpass that uses this attachment.6151*/6152if (!is_multiview ||6153att_state->pending_clear_views == 0 ||6154current_subpass_is_last_for_attachment(cmd_state, a)) {6155att_state->pending_clear_aspects = 0;6156}61576158att_state->pending_load_aspects = 0;6159}61606161/* We've transitioned all our images possibly fast clearing them. 
Now we6162* can fill out the surface states that we will use as render targets6163* during actual subpass rendering.6164*/6165VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,6166pass, subpass);6167if (result != VK_SUCCESS)6168return;61696170isl_null_fill_state(&cmd_buffer->device->isl_dev,6171cmd_state->null_surface_state.map,6172.size = isl_extent3d(fb->width, fb->height, fb->layers));61736174for (uint32_t i = 0; i < subpass->attachment_count; ++i) {6175const uint32_t att = subpass->attachments[i].attachment;6176if (att == VK_ATTACHMENT_UNUSED)6177continue;61786179assert(att < cmd_state->pass->attachment_count);6180struct anv_render_pass_attachment *pass_att = &pass->attachments[att];6181struct anv_attachment_state *att_state = &cmd_state->attachments[att];6182struct anv_image_view *iview = att_state->image_view;61836184if (!vk_format_is_color(pass_att->format))6185continue;61866187const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;6188assert(util_bitcount(att_usage) == 1);61896190struct anv_surface_state *surface_state;6191isl_surf_usage_flags_t isl_surf_usage;6192enum isl_aux_usage isl_aux_usage;6193if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {6194surface_state = &att_state->color;6195isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;6196isl_aux_usage = att_state->aux_usage;6197} else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {6198surface_state = &att_state->input;6199isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;6200isl_aux_usage =6201anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,6202VK_IMAGE_ASPECT_COLOR_BIT,6203VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,6204att_state->current_layout);6205} else {6206continue;6207}62086209/* We had better have a surface state when we get here */6210assert(surface_state->state.map);62116212union isl_color_value clear_color = { .u32 = { 0, } };6213if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&6214att_state->fast_clear)6215anv_clear_color_from_att_state(&clear_color, att_state, iview);62166217anv_image_fill_surface_state(cmd_buffer->device,6218iview->image,6219VK_IMAGE_ASPECT_COLOR_BIT,6220&iview->planes[0].isl,6221isl_surf_usage,6222isl_aux_usage,6223&clear_color,62240,6225surface_state,6226NULL);62276228add_surface_state_relocs(cmd_buffer, *surface_state);62296230if (GFX_VER < 10 &&6231pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&6232iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&6233iview->planes[0].isl.base_level == 0 &&6234iview->planes[0].isl.base_array_layer == 0) {6235genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,6236iview->image,6237VK_IMAGE_ASPECT_COLOR_BIT,6238false /* copy to ss */);6239}6240}62416242#if GFX_VER >= 116243/* The PIPE_CONTROL command description says:6244*6245* "Whenever a Binding Table Index (BTI) used by a Render Taget Message6246* points to a different RENDER_SURFACE_STATE, SW must issue a Render6247* Target Cache Flush by enabling this bit. When render target flush6248* is set due to new association of BTI, PS Scoreboard Stall bit must6249* be set in this packet."6250*/6251anv_add_pending_pipe_bits(cmd_buffer,6252ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |6253ANV_PIPE_STALL_AT_SCOREBOARD_BIT,6254"change RT");6255#endif62566257#if GFX_VERx10 == 1206258/* Wa_140104557006259*6260* ISL will change some CHICKEN registers depending on the depth surface6261* format, along with emitting the depth and stencil packets. 
In that case,6262* we want to do a depth flush and stall, so the pipeline is not using these6263* settings while we change the registers.6264*/6265anv_add_pending_pipe_bits(cmd_buffer,6266ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |6267ANV_PIPE_DEPTH_STALL_BIT |6268ANV_PIPE_END_OF_PIPE_SYNC_BIT,6269"change DS");6270genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);6271#endif62726273cmd_buffer_emit_depth_stencil(cmd_buffer);6274}62756276static enum blorp_filter6277vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)6278{6279switch (vk_mode) {6280case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:6281return BLORP_FILTER_SAMPLE_0;6282case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:6283return BLORP_FILTER_AVERAGE;6284case VK_RESOLVE_MODE_MIN_BIT_KHR:6285return BLORP_FILTER_MIN_SAMPLE;6286case VK_RESOLVE_MODE_MAX_BIT_KHR:6287return BLORP_FILTER_MAX_SAMPLE;6288default:6289return BLORP_FILTER_NONE;6290}6291}62926293static void6294cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)6295{6296struct anv_cmd_state *cmd_state = &cmd_buffer->state;6297struct anv_subpass *subpass = cmd_state->subpass;6298uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);6299struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;63006301/* We are done with the previous subpass and all rendering directly to that6302* subpass is now complete. Zero out all the surface states so we don't6303* accidentally use them between now and the next subpass.6304*/6305for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) {6306memset(&cmd_state->attachments[i].color, 0,6307sizeof(cmd_state->attachments[i].color));6308memset(&cmd_state->attachments[i].input, 0,6309sizeof(cmd_state->attachments[i].input));6310}6311cmd_state->null_surface_state = ANV_STATE_NULL;6312cmd_state->attachment_states = ANV_STATE_NULL;63136314for (uint32_t i = 0; i < subpass->attachment_count; ++i) {6315const uint32_t a = subpass->attachments[i].attachment;6316if (a == VK_ATTACHMENT_UNUSED)6317continue;63186319assert(a < cmd_state->pass->attachment_count);6320struct anv_attachment_state *att_state = &cmd_state->attachments[a];6321struct anv_image_view *iview = att_state->image_view;63226323assert(util_bitcount(subpass->attachments[i].usage) == 1);6324if (subpass->attachments[i].usage ==6325VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {6326/* We assume that if we're ending a subpass, we did do some rendering6327* so we may end up with compressed data.6328*/6329genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,6330VK_IMAGE_ASPECT_COLOR_BIT,6331att_state->aux_usage,6332iview->planes[0].isl.base_level,6333iview->planes[0].isl.base_array_layer,6334fb->layers);6335} else if (subpass->attachments[i].usage ==6336VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {6337/* We may be writing depth or stencil so we need to mark the surface.6338* Unfortunately, there's no way to know at this point whether the6339* depth or stencil tests used will actually write to the surface.6340*6341* Even though stencil may be plane 1, it always shares a base_level6342* with depth.6343*/6344const struct isl_view *ds_view = &iview->planes[0].isl;6345if (iview->aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {6346genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,6347VK_IMAGE_ASPECT_DEPTH_BIT,6348att_state->aux_usage,6349ds_view->base_level,6350ds_view->base_array_layer,6351fb->layers);6352}6353if (iview->aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {6354/* Even though stencil may be plane 1, it always shares a6355* base_level with depth.6356*/6357genX(cmd_buffer_mark_image_written)(cmd_buffer, 
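/* Context for the bookkeeping around this call (informal, inferred from
 * how the helper is used here): mark_image_written records, per aspect,
 * level and layer range, that the attachment may now hold compressed data
 * for the given aux usage, conceptually
 *
 *    genX(cmd_buffer_mark_image_written)(cmd_buffer, image,
 *                                        VK_IMAGE_ASPECT_COLOR_BIT,
 *                                        aux_usage, level,
 *                                        base_layer, layer_count);
 *
 * so that a later layout transition knows whether a resolve is required.
 * It is done unconditionally at the end of the subpass because, as the
 * comment above says, we assume some rendering happened and cannot cheaply
 * prove otherwise.
 */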
iview->image,6358VK_IMAGE_ASPECT_STENCIL_BIT,6359ISL_AUX_USAGE_NONE,6360ds_view->base_level,6361ds_view->base_array_layer,6362fb->layers);6363}6364}6365}63666367if (subpass->has_color_resolve) {6368/* We are about to do some MSAA resolves. We need to flush so that the6369* result of writes to the MSAA color attachments show up in the sampler6370* when we blit to the single-sampled resolve target.6371*/6372anv_add_pending_pipe_bits(cmd_buffer,6373ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |6374ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,6375"MSAA resolve");63766377for (uint32_t i = 0; i < subpass->color_count; ++i) {6378uint32_t src_att = subpass->color_attachments[i].attachment;6379uint32_t dst_att = subpass->resolve_attachments[i].attachment;63806381if (dst_att == VK_ATTACHMENT_UNUSED)6382continue;63836384assert(src_att < cmd_buffer->state.pass->attachment_count);6385assert(dst_att < cmd_buffer->state.pass->attachment_count);63866387if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {6388/* From the Vulkan 1.0 spec:6389*6390* If the first use of an attachment in a render pass is as a6391* resolve attachment, then the loadOp is effectively ignored6392* as the resolve is guaranteed to overwrite all pixels in the6393* render area.6394*/6395cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;6396}63976398struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;6399struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;64006401const VkRect2D render_area = cmd_buffer->state.render_area;64026403enum isl_aux_usage src_aux_usage =6404cmd_buffer->state.attachments[src_att].aux_usage;6405enum isl_aux_usage dst_aux_usage =6406cmd_buffer->state.attachments[dst_att].aux_usage;64076408assert(src_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT &&6409dst_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT);64106411anv_image_msaa_resolve(cmd_buffer,6412src_iview->image, src_aux_usage,6413src_iview->planes[0].isl.base_level,6414src_iview->planes[0].isl.base_array_layer,6415dst_iview->image, dst_aux_usage,6416dst_iview->planes[0].isl.base_level,6417dst_iview->planes[0].isl.base_array_layer,6418VK_IMAGE_ASPECT_COLOR_BIT,6419render_area.offset.x, render_area.offset.y,6420render_area.offset.x, render_area.offset.y,6421render_area.extent.width,6422render_area.extent.height,6423fb->layers, BLORP_FILTER_NONE);6424}6425}64266427if (subpass->ds_resolve_attachment) {6428/* We are about to do some MSAA resolves. 
We need to flush so that the6429* result of writes to the MSAA depth attachments show up in the sampler6430* when we blit to the single-sampled resolve target.6431*/6432anv_add_pending_pipe_bits(cmd_buffer,6433ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |6434ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,6435"MSAA resolve");64366437uint32_t src_att = subpass->depth_stencil_attachment->attachment;6438uint32_t dst_att = subpass->ds_resolve_attachment->attachment;64396440assert(src_att < cmd_buffer->state.pass->attachment_count);6441assert(dst_att < cmd_buffer->state.pass->attachment_count);64426443if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {6444/* From the Vulkan 1.0 spec:6445*6446* If the first use of an attachment in a render pass is as a6447* resolve attachment, then the loadOp is effectively ignored6448* as the resolve is guaranteed to overwrite all pixels in the6449* render area.6450*/6451cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;6452}64536454struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;6455struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;64566457const VkRect2D render_area = cmd_buffer->state.render_area;64586459struct anv_attachment_state *src_state =6460&cmd_state->attachments[src_att];6461struct anv_attachment_state *dst_state =6462&cmd_state->attachments[dst_att];64636464if ((src_iview->image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&6465subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {64666467/* MSAA resolves sample from the source attachment. Transition the6468* depth attachment first to get rid of any HiZ that we may not be6469* able to handle.6470*/6471transition_depth_buffer(cmd_buffer, src_iview->image,6472src_iview->planes[0].isl.base_array_layer,6473fb->layers,6474src_state->current_layout,6475VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,6476false /* will_full_fast_clear */);6477src_state->aux_usage =6478anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,6479VK_IMAGE_ASPECT_DEPTH_BIT,6480VK_IMAGE_USAGE_TRANSFER_SRC_BIT,6481VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);6482src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;64836484/* MSAA resolves write to the resolve attachment as if it were any6485* other transfer op. 
Transition the resolve attachment accordingly.6486*/6487VkImageLayout dst_initial_layout = dst_state->current_layout;64886489/* If our render area is the entire size of the image, we're going to6490* blow it all away so we can claim the initial layout is UNDEFINED6491* and we'll get a HiZ ambiguate instead of a resolve.6492*/6493if (dst_iview->image->type != VK_IMAGE_TYPE_3D &&6494render_area.offset.x == 0 && render_area.offset.y == 0 &&6495render_area.extent.width == dst_iview->extent.width &&6496render_area.extent.height == dst_iview->extent.height)6497dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;64986499transition_depth_buffer(cmd_buffer, dst_iview->image,6500dst_iview->planes[0].isl.base_array_layer,6501fb->layers,6502dst_initial_layout,6503VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,6504false /* will_full_fast_clear */);6505dst_state->aux_usage =6506anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,6507VK_IMAGE_ASPECT_DEPTH_BIT,6508VK_IMAGE_USAGE_TRANSFER_DST_BIT,6509VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);6510dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;65116512enum blorp_filter filter =6513vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);65146515anv_image_msaa_resolve(cmd_buffer,6516src_iview->image, src_state->aux_usage,6517src_iview->planes[0].isl.base_level,6518src_iview->planes[0].isl.base_array_layer,6519dst_iview->image, dst_state->aux_usage,6520dst_iview->planes[0].isl.base_level,6521dst_iview->planes[0].isl.base_array_layer,6522VK_IMAGE_ASPECT_DEPTH_BIT,6523render_area.offset.x, render_area.offset.y,6524render_area.offset.x, render_area.offset.y,6525render_area.extent.width,6526render_area.extent.height,6527fb->layers, filter);6528}65296530if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&6531subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {65326533src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;6534dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;65356536enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;6537uint32_t plane = anv_image_aspect_to_plane(dst_iview->image->aspects,6538VK_IMAGE_ASPECT_STENCIL_BIT);6539enum isl_aux_usage dst_aux_usage =6540dst_iview->image->planes[plane].aux_usage;65416542enum blorp_filter filter =6543vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);65446545anv_image_msaa_resolve(cmd_buffer,6546src_iview->image, src_aux_usage,6547src_iview->planes[0].isl.base_level,6548src_iview->planes[0].isl.base_array_layer,6549dst_iview->image, dst_aux_usage,6550dst_iview->planes[0].isl.base_level,6551dst_iview->planes[0].isl.base_array_layer,6552VK_IMAGE_ASPECT_STENCIL_BIT,6553render_area.offset.x, render_area.offset.y,6554render_area.offset.x, render_area.offset.y,6555render_area.extent.width,6556render_area.extent.height,6557fb->layers, filter);6558}6559}65606561#if GFX_VER == 76562/* On gfx7, we have to store a texturable version of the stencil buffer in6563* a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and6564* forth at strategic points. Stencil writes are only allowed in following6565* layouts:6566*6567* - VK_IMAGE_LAYOUT_GENERAL6568* - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL6569* - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL6570* - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL6571* - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR6572*6573* For general, we have no nice opportunity to transition so we do the copy6574* to the shadow unconditionally at the end of the subpass. 
      if ((src_iview->image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
          subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {

         src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
         dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;

         enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
         uint32_t plane = anv_image_aspect_to_plane(dst_iview->image->aspects,
                                                    VK_IMAGE_ASPECT_STENCIL_BIT);
         enum isl_aux_usage dst_aux_usage =
            dst_iview->image->planes[plane].aux_usage;

         enum blorp_filter filter =
            vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);

         anv_image_msaa_resolve(cmd_buffer,
                                src_iview->image, src_aux_usage,
                                src_iview->planes[0].isl.base_level,
                                src_iview->planes[0].isl.base_array_layer,
                                dst_iview->image, dst_aux_usage,
                                dst_iview->planes[0].isl.base_level,
                                dst_iview->planes[0].isl.base_array_layer,
                                VK_IMAGE_ASPECT_STENCIL_BIT,
                                render_area.offset.x, render_area.offset.y,
                                render_area.offset.x, render_area.offset.y,
                                render_area.extent.width,
                                render_area.extent.height,
                                fb->layers, filter);
      }
   }

#if GFX_VER == 7
   /* On gfx7, we have to store a texturable version of the stencil buffer in
    * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
    * forth at strategic points.  Stencil writes are only allowed in the
    * following layouts:
    *
    *  - VK_IMAGE_LAYOUT_GENERAL
    *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
    *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
    *
    * For general, we have no nice opportunity to transition so we do the copy
    * to the shadow unconditionally at the end of the subpass.  For transfer
    * destinations, we can update it as part of the transfer op.  For the other
    * layouts, we delay the copy until a transition into some other layout.
    */
   if (subpass->depth_stencil_attachment) {
      uint32_t a = subpass->depth_stencil_attachment->attachment;
      assert(a != VK_ATTACHMENT_UNUSED);

      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;

      if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
         uint32_t plane = anv_image_aspect_to_plane(image->aspects,
                                                    VK_IMAGE_ASPECT_STENCIL_BIT);

         if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
             att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) {
            assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
            anv_image_copy_to_shadow(cmd_buffer, image,
                                     VK_IMAGE_ASPECT_STENCIL_BIT,
                                     iview->planes[plane].isl.base_level, 1,
                                     iview->planes[plane].isl.base_array_layer,
                                     fb->layers);
         }
      }
   }
#endif /* GFX_VER == 7 */

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id)
         continue;

      assert(a < cmd_state->pass->attachment_count);
      struct anv_attachment_state *att_state = &cmd_state->attachments[a];
      struct anv_image_view *iview = cmd_state->attachments[a].image_view;
      const struct anv_image *image = iview->image;

      /* Transition the image into the final layout for this render pass */
      VkImageLayout target_layout =
         cmd_state->pass->attachments[a].final_layout;
      VkImageLayout target_stencil_layout =
         cmd_state->pass->attachments[a].stencil_final_layout;

      uint32_t base_layer, layer_count;
      if (image->type == VK_IMAGE_TYPE_3D) {
         base_layer = 0;
         layer_count = anv_minify(iview->image->extent.depth,
                                  iview->planes[0].isl.base_level);
      } else {
         base_layer = iview->planes[0].isl.base_array_layer;
         layer_count = fb->layers;
      }

      if (image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
         assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
         transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
                                 iview->planes[0].isl.base_level, 1,
                                 base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 VK_QUEUE_FAMILY_IGNORED,
                                 false /* will_full_fast_clear */);
      }

      if (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
         transition_depth_buffer(cmd_buffer, image,
                                 base_layer, layer_count,
                                 att_state->current_layout, target_layout,
                                 false /* will_full_fast_clear */);
      }

      if (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
         transition_stencil_buffer(cmd_buffer, image,
                                   iview->planes[0].isl.base_level, 1,
                                   base_layer, layer_count,
                                   att_state->current_stencil_layout,
                                   target_stencil_layout,
                                   false /* will_full_fast_clear */);
      }
   }

   /* Accumulate any subpass flushes that need to happen after the subpass.
    * Yes, they do get accumulated twice in the NextSubpass case but since
    * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up
    * ORing the bits in twice so it's harmless.
    */
   anv_add_pending_pipe_bits(cmd_buffer,
                             cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
                             "end subpass deps/attachments");
}

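/* vkCmdBeginRenderPass2: latch the render pass, framebuffer, and render
 * area into the command buffer state, set up the per-attachment state
 * (clears and layouts), make sure we are on the 3D pipeline, and begin
 * subpass 0.
 */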
void genX(CmdBeginRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
   VkResult result;

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;

   anv_measure_beginrenderpass(cmd_buffer);

   result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
                                               framebuffer,
                                               pRenderPassBeginInfo);
   if (result != VK_SUCCESS) {
      assert(anv_batch_has_error(&cmd_buffer->batch));
      return;
   }

   genX(flush_pipeline_select_3d)(cmd_buffer);

   cmd_buffer_begin_subpass(cmd_buffer, 0);
}

/* vkCmdNextSubpass2: end the current subpass and begin the next one.
 * Subpass boundaries may only be recorded in primary command buffers.
 */
void genX(CmdNextSubpass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state);
   cmd_buffer_end_subpass(cmd_buffer);
   cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

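/* vkCmdEndRenderPass2: finish the final subpass and leave render pass
 * scope.
 */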
void genX(CmdEndRenderPass2)(
    VkCommandBuffer                             commandBuffer,
    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   cmd_buffer_end_subpass(cmd_buffer);

   cmd_buffer->state.hiz_enabled = false;

   /* Remove references to render pass specific state.  This enables us to
    * detect whether or not we're in a renderpass.
    */
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
}

void
genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 75
   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
                mi_reg32(ANV_PREDICATE_RESULT_REG));
   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }
#endif
}

#if GFX_VERx10 >= 75
void genX(CmdBeginConditionalRenderingEXT)(
    VkCommandBuffer                             commandBuffer,
    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
   struct anv_address value_address =
      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);

   const bool isInverted = pConditionalRenderingBegin->flags &
                           VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   cmd_state->conditional_render_enabled = true;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   /* Section 19.4 of the Vulkan 1.1.85 spec says:
    *
    *    If the value of the predicate in buffer memory changes
    *    while conditional rendering is active, the rendering commands
    *    may be discarded in an implementation-dependent way.
    *    Some implementations may latch the value of the predicate
    *    upon beginning conditional rendering while others
    *    may read it before every rendering command.
    *
    * So it's perfectly fine to read a value from the buffer once.
    */
   struct mi_value value = mi_mem32(value_address);

   /* Precompute the predicate result.  This is necessary to support
    * secondary command buffers, since it is unknown whether conditional
    * rendering is inverted at the time they are recorded.
    */
   mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
                isInverted ? mi_uge(&b, mi_imm(0), value) :
                             mi_ult(&b, mi_imm(0), value));
}

void genX(CmdEndConditionalRenderingEXT)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_cmd_state *cmd_state = &cmd_buffer->state;

   cmd_state->conditional_render_enabled = false;
}
#endif

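/* VkEvent is backed by a small allocation in the dynamic state pool that
 * holds either VK_EVENT_SET or VK_EVENT_RESET.  vkCmdSetEvent and
 * vkCmdResetEvent update that value from the GPU with a post-sync
 * PIPE_CONTROL immediate write, stalling first when the stage mask
 * includes pipelined work.
 */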
/* Set of stage bits which are pipelined, i.e. they get queued by the
 * command streamer for later execution.
 */
#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
   (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \
    VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \
    VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \
    VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \
    VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \
    VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \
    VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \
    VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \
    VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \
    VK_PIPELINE_STAGE_TRANSFER_BIT | \
    VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \
    VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \
    VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)

void genX(CmdSetEvent)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags                        stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_SET;
      anv_debug_dump_pc(pc);
   }
}

void genX(CmdResetEvent)(
    VkCommandBuffer                             commandBuffer,
    VkEvent                                     _event,
    VkPipelineStageFlags                        stageMask)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_event, event, _event);

   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
         pc.StallAtPixelScoreboard = true;
         pc.CommandStreamerStallEnable = true;
      }

      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) {
         cmd_buffer->device->dynamic_state_pool.block_pool.bo,
         event->state.offset
      };
      pc.ImmediateData = VK_EVENT_RESET;
      anv_debug_dump_pc(pc);
   }
}

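/* vkCmdWaitEvents: on gfx8+ we wait for each event by polling its status
 * word with MI_SEMAPHORE_WAIT until it equals VK_EVENT_SET (gfx7 support is
 * still an anv_finishme), then hand the memory barriers off to
 * vkCmdPipelineBarrier.
 */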
gfx7");6915#endif69166917genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask,6918false, /* byRegion */6919memoryBarrierCount, pMemoryBarriers,6920bufferMemoryBarrierCount, pBufferMemoryBarriers,6921imageMemoryBarrierCount, pImageMemoryBarriers);6922}69236924VkResult genX(CmdSetPerformanceOverrideINTEL)(6925VkCommandBuffer commandBuffer,6926const VkPerformanceOverrideInfoINTEL* pOverrideInfo)6927{6928ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);69296930switch (pOverrideInfo->type) {6931case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {6932#if GFX_VER >= 96933anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {6934csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;6935csdm2.MediaInstructionDisable = pOverrideInfo->enable;6936csdm2._3DRenderingInstructionDisableMask = true;6937csdm2.MediaInstructionDisableMask = true;6938}6939#else6940anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {6941instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;6942instpm.MediaInstructionDisable = pOverrideInfo->enable;6943instpm._3DRenderingInstructionDisableMask = true;6944instpm.MediaInstructionDisableMask = true;6945}6946#endif6947break;6948}69496950case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:6951if (pOverrideInfo->enable) {6952/* FLUSH ALL THE THINGS! As requested by the MDAPI team. */6953anv_add_pending_pipe_bits(cmd_buffer,6954ANV_PIPE_FLUSH_BITS |6955ANV_PIPE_INVALIDATE_BITS,6956"perf counter isolation");6957genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);6958}6959break;69606961default:6962unreachable("Invalid override");6963}69646965return VK_SUCCESS;6966}69676968VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(6969VkCommandBuffer commandBuffer,6970const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo)6971{6972/* TODO: Waiting on the register to write, might depend on generation. */69736974return VK_SUCCESS;6975}69766977void genX(cmd_emit_timestamp)(struct anv_batch *batch,6978struct anv_bo *bo,6979uint32_t offset) {6980anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {6981pc.CommandStreamerStallEnable = true;6982pc.PostSyncOperation = WriteTimestamp;6983pc.Address = (struct anv_address) {bo, offset};6984anv_debug_dump_pc(pc);6985}6986}698769886989