Path: blob/21.2-virgl/src/freedreno/vulkan/tu_cmd_buffer.c
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"

#include "vk_format.h"
#include "vk_util.h"

#include "tu_cs.h"

void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     enum vgt_event_type event)
{
   bool need_seqno = false;
   switch (event) {
   case CACHE_FLUSH_TS:
   case WT_DONE_TS:
   case RB_DONE_TS:
   case PC_CCU_FLUSH_DEPTH_TS:
   case PC_CCU_FLUSH_COLOR_TS:
   case PC_CCU_RESOLVE_TS:
      need_seqno = true;
      break;
   default:
      break;
   }

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
   if (need_seqno) {
      tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
      tu_cs_emit(cs, 0);
   }
}

static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                 struct tu_cs *cs,
                 enum tu_cmd_flush_bits flushes)
{
   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
                  TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                  TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
   if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
   if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE)
      tu_cs_emit_wfi(cs);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}

/* "Normal" cache flushes, that don't require any special handling */

static void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
                    struct tu_cs *cs)
{
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
   cmd_buffer->state.cache.flush_bits = 0;
}

/* Renderpass cache flushes */

void
tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
                               struct tu_cs *cs)
{
   if (!cmd_buffer->state.renderpass_cache.flush_bits)
      return;
   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
   cmd_buffer->state.renderpass_cache.flush_bits = 0;
}

/* Cache flushes for things that use the color/depth read/write path (i.e.
 * blits and draws). This deals with changing CCU state as well as the usual
 * cache flushing.
 */

void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state)
{
   enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;

   assert(ccu_state != TU_CMD_CCU_UNKNOWN);

   /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
    * the CCU may also contain data that we haven't flushed out yet, so we
    * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
    * emit a WFI as it isn't pipelined.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
         flushes |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH;
         cmd_buffer->state.cache.pending_flush_bits &= ~(
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH);
      }
      flushes |=
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE;
      cmd_buffer->state.cache.pending_flush_bits &= ~(
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE);
   }

   tu6_emit_flushes(cmd_buffer, cs, flushes);
   cmd_buffer->state.cache.flush_bits = 0;

   if (ccu_state != cmd_buffer->state.ccu_state) {
      struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
      tu_cs_emit_regs(cs,
                      A6XX_RB_CCU_CNTL(.color_offset =
                                          ccu_state == TU_CMD_CCU_GMEM ?
                                          phys_dev->ccu_offset_gmem :
                                          phys_dev->ccu_offset_bypass,
                                       .gmem = ccu_state == TU_CMD_CCU_GMEM));
      cmd_buffer->state.ccu_state = ccu_state;
   }
}

static void
tu6_emit_zs(struct tu_cmd_buffer *cmd,
            const struct tu_subpass *subpass,
            struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   const uint32_t a = subpass->depth_stencil_attachment.attachment;
   if (a == VK_ATTACHMENT_UNUSED) {
      tu_cs_emit_regs(cs,
                      A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
                      A6XX_RB_DEPTH_BUFFER_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_BASE(0),
                      A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));

      return;
   }

   const struct tu_image_view *iview = fb->attachments[a].attachment;
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
   enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
   tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
   tu_cs_image_ref(cs, iview, 0);
   tu_cs_emit(cs, attachment->gmem_offset);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
   tu_cs_image_flag_ref(cs, iview, 0);

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo,
                                                 .bo_offset = iview->image->bo_offset + iview->image->lrz_offset),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
       attachment->format == VK_FORMAT_S8_UINT) {

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
      tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         tu_cs_image_stencil_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset_stencil);
      } else {
         tu_cs_image_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset);
      }
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_STENCIL_INFO(0));
   }
}

static void
tu6_emit_mrt(struct tu_cmd_buffer *cmd,
             const struct tu_subpass *subpass,
             struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = fb->attachments[a].attachment;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
      tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
      tu_cs_image_ref(cs, iview, 0);
      tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);

      tu_cs_emit_regs(cs,
                      A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
      tu_cs_image_flag_ref(cs, iview, 0);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
   tu_cs_emit_regs(cs,
                   A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));

   unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
   tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));
}

void
tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples)
{
   const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
   bool msaa_disable = samples == MSAA_ONE;

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_RAS_MSAA_CNTL(samples),
                   A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
                                             .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_RAS_MSAA_CNTL(samples),
                   A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
                                            .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_RAS_MSAA_CNTL(samples),
                   A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
                                          .msaa_disable = msaa_disable));

   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(samples));
}

static void
tu6_emit_bin_size(struct tu_cs *cs,
                  uint32_t bin_w, uint32_t bin_h, uint32_t flags)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
                                         .binh = bin_h,
                                         .dword = flags));

   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL(.binw = bin_w,
                                       .binh = bin_h,
                                       .dword = flags));

   /* no flag for RB_BIN_CONTROL2... */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL2(.binw = bin_w,
                                        .binh = bin_h));
}

static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
                     const struct tu_subpass *subpass,
                     struct tu_cs *cs,
                     bool binning)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   /* doesn't RB_RENDER_CNTL set differently for binning pass: */
   bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
   uint32_t cntl = 0;
   cntl |= A6XX_RB_RENDER_CNTL_UNK4;
   if (binning) {
      if (no_track)
         return;
      cntl |= A6XX_RB_RENDER_CNTL_BINNING;
   } else {
      uint32_t mrts_ubwc_enable = 0;
      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         const struct tu_image_view *iview = fb->attachments[a].attachment;
         if (iview->ubwc_enabled)
            mrts_ubwc_enable |= 1 << i;
      }

      cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_image_view *iview = fb->attachments[a].attachment;
         if (iview->ubwc_enabled)
            cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
      }

      if (no_track) {
         tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
         tu_cs_emit(cs, cntl);
         return;
      }

      /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
       * in order to set it correctly for the different subpasses. However,
       * that means the packets we're emitting also happen during binning. So
       * we need to guard the write on !BINNING at CP execution time.
       */
      tu_cs_reserve(cs, 3 + 4);
      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                     CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
   }

   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
   tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
   tu_cs_emit(cs, cntl);
}

static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Avoid assertion fails with an empty render area at (0, 0) where the
    * subtraction below wraps around. Empty render areas should be forced to
    * the sysmem path by use_sysmem_rendering(). It's not even clear whether
    * an empty scissor here works, and the blob seems to force sysmem too as
    * it sets something wrong (non-empty) for the scissor.
    */
   if (render_area->extent.width == 0 ||
       render_area->extent.height == 0)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width - 1;
   uint32_t y2 = y1 + render_area->extent.height - 1;

   if (align) {
      x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
      y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
      x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
      y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}

void
tu6_emit_window_scissor(struct tu_cs *cs,
                        uint32_t x1,
                        uint32_t y1,
                        uint32_t x2,
                        uint32_t y2)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
                   A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}

void
tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
}

static void
tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
{
   uint32_t enable_mask;
   switch (id) {
   case TU_DRAW_STATE_PROGRAM:
   case TU_DRAW_STATE_VI:
   case TU_DRAW_STATE_FS_CONST:
   /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
    * when resources would actually be used in the binning shader.
    * Presumably the overhead of prefetching the resources isn't
    * worth it.
    */
   case TU_DRAW_STATE_DESC_SETS_LOAD:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   case TU_DRAW_STATE_PROGRAM_BINNING:
   case TU_DRAW_STATE_VI_BINNING:
      enable_mask = CP_SET_DRAW_STATE__0_BINNING;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
      break;
   case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
      enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
      break;
   default:
      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                    CP_SET_DRAW_STATE__0_SYSMEM |
                    CP_SET_DRAW_STATE__0_BINNING;
      break;
   }

   STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);

   /* We need to reload the descriptors every time the descriptor sets
    * change. However, the commands we send only depend on the pipeline
    * because the whole point is to cache descriptors which are used by the
    * pipeline. There's a problem here, in that the firmware has an
    * "optimization" which skips executing groups that are set to the same
    * value as the last draw. This means that if the descriptor sets change
    * but not the pipeline, we'd try to re-execute the same buffer which
    * the firmware would ignore and we wouldn't pre-load the new
    * descriptors. Set the DIRTY bit to avoid this optimization
    */
   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;

   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
                  enable_mask |
                  CP_SET_DRAW_STATE__0_GROUP_ID(id) |
                  COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
   tu_cs_emit_qw(cs, state.iova);
}

static bool
use_hw_binning(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
    * with non-hw binning GMEM rendering. this is required because some of the
    * XFB commands need to only be executed once
    */
   if (cmd->state.xfb_used)
      return true;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
      return false;

   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
      return true;

   return (fb->tile_count.width * fb->tile_count.height) > 2;
}

static bool
use_sysmem_rendering(struct tu_cmd_buffer *cmd)
{
   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
      return true;

   /* can't fit attachments into gmem */
   if (!cmd->state.pass->gmem_pixels)
      return true;

   if (cmd->state.framebuffer->layers > 1)
      return true;

   /* Use sysmem for empty render areas */
   if (cmd->state.render_area.extent.width == 0 ||
       cmd->state.render_area.extent.height == 0)
      return true;

   if (cmd->state.has_tess)
      return true;

   return false;
}

static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));

   const uint32_t x1 = fb->tile0.width * tx;
   const uint32_t y1 = fb->tile0.height * ty;
   const uint32_t x2 = x1 + fb->tile0.width - 1;
   const uint32_t y2 = y1 + fb->tile0.height - 1;
   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
   tu6_emit_window_offset(cs, x1, y1);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
      tu_cs_emit(cs, fb->pipe_sizes[pipe] |
                     CP_SET_BIN_DATA5_0_VSC_N(slot));
      tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
      tu_cs_emit(cs, pipe * 4);
      tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);

      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   } else {
      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x1);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   }
}

static void
tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t layer_mask,
                        uint32_t a,
                        uint32_t gmem_a)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   struct tu_image_view *dst = fb->attachments[a].attachment;
   struct tu_image_view *src = fb->attachments[gmem_a].attachment;

   tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
}

static void
tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         const struct tu_subpass *subpass)
{
   if (subpass->resolve_attachments) {
      /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
       * Commands":
       *
       *    End-of-subpass multisample resolves are treated as color
       *    attachment writes for the purposes of synchronization.
       *    This applies to resolve operations for both color and
       *    depth/stencil attachments. That is, they are considered to
       *    execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
       *    pipeline stage and their writes are synchronized with
       *    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
       *    rendering within a subpass and any resolve operations at the end
       *    of the subpass occurs automatically, without need for explicit
       *    dependencies or pipeline barriers. However, if the resolve
       *    attachment is also used in a different subpass, an explicit
       *    dependency is needed.
       *
       * We use the CP_BLIT path for sysmem resolves, which is really a
       * transfer command, so we have to manually flush similar to the gmem
       * resolve case. However, a flush afterwards isn't needed because of the
       * last sentence and the fact that we're in sysmem mode.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      if (subpass->resolve_depth_stencil)
         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);

      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      /* Wait for the flushes to land before using the 2D engine */
      tu_cs_emit_wfi(cs);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
      }
   }
}

static void
tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_render_pass *pass = cmd->state.pass;
   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t a = 0; a < pass->attachment_count; ++a) {
      if (pass->attachments[a].gmem_offset >= 0)
         tu_store_gmem_attachment(cmd, cs, a, a);
   }

   if (subpass->resolve_attachments) {
      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a != VK_ATTACHMENT_UNUSED) {
            uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
         }
      }
   }
}

static void
tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                  CP_SET_DRAW_STATE__0_GROUP_ID(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
}

static void
tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_device *dev = cmd->device;
   const struct tu_physical_device *phys_dev = dev->physical_device;

   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .cs_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu_cs_emit_wfi(cs);

   cmd->state.cache.pending_flush_bits &=
      ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);

   tu_cs_emit_regs(cs,
                   A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
   cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B600, 0x100000);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);

   /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
   tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);

   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);

   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
   tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */

   tu_disable_draw_states(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                     .bo_offset = gb_offset(bcolor_builtin)));
   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
                                                        .bo_offset = gb_offset(bcolor_builtin)));

   /* VSC buffers:
    * use vsc pitches from the largest values used so far with this device
    * if there hasn't been overflow, there will already be a scratch bo
    * allocated for these sizes
    *
    * if overflow is detected, the stream size is increased by 2x
    */
   mtx_lock(&dev->mutex);

   struct tu6_global *global = dev->global_bo.map;

   uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
   uint32_t vsc_prim_overflow = global->vsc_prim_overflow;

   if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
      dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
      dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;

   cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
   cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;

   mtx_unlock(&dev->mutex);

   struct tu_bo *vsc_bo;
   uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
                    cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;

   tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
                                              .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));

   tu_cs_sanity_check(cs);
}

static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
                                     .height = fb->tile0.height));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
                                      .ny = fb->tile_count.height));

   tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
   tu_cs_emit_array(cs, fb->pipe_config, 32);

   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
                   A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
                   A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
}

static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const uint32_t used_pipe_count =
      fb->pipe_count.width * fb->pipe_count.height;

   for (int i = 0; i < used_pipe_count; i++) {
      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));

      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
                     CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
}

static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_regs(cs,
                   A6XX_VFD_MODE_CNTL(.binning_pass = true));

   update_vsc_pipe(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->info->a6xx.magic.PC_UNKNOWN_9805));

   tu_cs_emit_regs(cs,
                   A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->info->a6xx.magic.SP_UNKNOWN_A0F8));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2C);

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));

   /* emit IB to binning drawcmds: */
   tu_cs_emit_call(cs, &cmd->draw_cs);

   /* switching from binning pass to GMEM pass will cause a switch from
    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
    * so make sure these states are re-emitted
    * (eventually these states shouldn't exist at all with shader prologue)
    * only VS and GS are invalidated, as FS isn't emitted in binning pass,
    * and we don't use HW binning when tessellation is used
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE |
                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read the
    * visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
    * part of draws).
    */
   tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   emit_vsc_overflow_test(cmd, cs);

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);
}

static struct tu_draw_state
tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
                          const struct tu_subpass *subpass,
                          bool gmem)
{
   /* note: we can probably emit input attachments just once for the whole
    * renderpass, this would avoid emitting both sysmem/gmem versions
    *
    * emit two texture descriptors for each input, as a workaround for
    * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
    * in the pair
    * TODO: a smarter workaround
    */

   if (!subpass->input_count)
      return (struct tu_draw_state) {};

   struct tu_cs_memory texture;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return (struct tu_draw_state) {};
   }

   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
      uint32_t a = subpass->input_attachments[i / 2].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      struct tu_image_view *iview =
         cmd->state.framebuffer->attachments[a].attachment;
      const struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[a];
      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
      uint32_t gmem_offset = att->gmem_offset;
      uint32_t cpp = att->cpp;

      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);

      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
         /* note this works because spec says fb and input attachments
          * must use identity swizzle
          */
         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                     A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                     A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
         if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
                      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
                      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
                      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
                      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         } else {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
                      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
                      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
                      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
                      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         }
      }

      if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
         dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
         dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
         dst[3] = 0;
         dst[4] = iview->stencil_base_addr;
         dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;

         cpp = att->samples;
         gmem_offset = att->gmem_offset_stencil;
      }

      if (!gmem)
         continue;

      /* patched for gmem */
      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
      dst[2] =
         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
         A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
      dst[3] = 0;
      dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
         dst[i] = 0;
   }

   struct tu_cs cs;
   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);

   tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                   CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                   CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
   tu_cs_emit_qw(&cs, texture.iova);

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));

   assert(cs.cur == cs.end); /* validate draw state size */

   return ds;
}

static void
tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
                         tu_emit_input_attachments(cmd, subpass, true));
   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
                         tu_emit_input_attachments(cmd, subpass, false));
}

static void
tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
                         const VkRenderPassBeginInfo *info)
{
   struct tu_cs *cs = &cmd->draw_cs;

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

   tu6_emit_blit_scissor(cmd, cs, true);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_load_gmem_attachment(cmd, cs, i, false);

   tu6_emit_blit_scissor(cmd, cs, false);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_gmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);

   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
      tu_clear_sysmem_attachment(cmd, cs, i, info);

   tu_cond_exec_end(cs);
}

static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   assert(fb->width > 0 && fb->height > 0);
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
   tu6_emit_window_offset(cs, 0, 0);

   tu6_emit_bin_size(cs, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   /* enable stream-out, with sysmem there is only one pass: */
   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_sanity_check(cs);
}

static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_ib in the gmem path.
    */
   tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_sanity_check(cs);
}

static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);

   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   if (use_hw_binning(cmd)) {
      /* enable stream-out during binning pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);

      tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

      tu6_emit_binning_pass(cmd, cs);

      /* and disable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);

      tu_cs_emit_regs(cs,
                      A6XX_VFD_MODE_CNTL(0));

      tu_cs_emit_regs(cs, A6XX_PC_UNKNOWN_9805(.unknown = phys_dev->info->a6xx.magic.PC_UNKNOWN_9805));

      tu_cs_emit_regs(cs, A6XX_SP_UNKNOWN_A0F8(.unknown = phys_dev->info->a6xx.magic.SP_UNKNOWN_A0F8));

      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      tu_cs_emit(cs, 0x1);
   } else {
      /* no binning pass, so enable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height, 0x6000000);
   }

   tu_cs_sanity_check(cs);
}

static void
tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_call(cs, &cmd->draw_cs);

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
   }

   tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);

   tu_cs_sanity_check(cs);
}

static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_CNTL(0));

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);

   tu_cs_sanity_check(cs);
}

static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_tile_render_begin(cmd, &cmd->cs);

   uint32_t pipe = 0;
   for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
      for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) {
         uint32_t tx1 = px * fb->pipe0.width;
         uint32_t ty1 = py * fb->pipe0.height;
         uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
         uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
         uint32_t slot = 0;
         for (uint32_t ty = ty1; ty < ty2; ty++) {
            for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
               tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
               tu6_render_tile(cmd, &cmd->cs);
            }
         }
      }
   }

   tu6_tile_render_end(cmd, &cmd->cs);
}

static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
{
   tu6_sysmem_render_begin(cmd, &cmd->cs);

   tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);

   tu6_sysmem_render_end(cmd, &cmd->cs);
}

static void
tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
{
   const uint32_t tile_store_space = 7 + (35 * 2) * cmd->state.pass->attachment_count;
   struct tu_cs sub_cs;

   VkResult result =
      tu_cs_begin_sub_stream(&cmd->sub_cs, tile_store_space, &sub_cs);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   /* emit to tile-store sub_cs */
   tu6_emit_tile_store(cmd, &sub_cs);

   cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
}

static VkResult
tu_create_cmd_buffer(struct tu_device *device,
                     struct tu_cmd_pool *pool,
                     VkCommandBufferLevel level,
                     VkCommandBuffer *pCommandBuffer)
{
   struct tu_cmd_buffer *cmd_buffer;

   cmd_buffer = vk_object_zalloc(&device->vk, NULL, sizeof(*cmd_buffer),
                                 VK_OBJECT_TYPE_COMMAND_BUFFER);
   if (cmd_buffer == NULL)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   if (pool) {
      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
      cmd_buffer->queue_family_index = pool->queue_family_index;

   } else {
      /* Init the pool_link so we can safely call list_del when we destroy
       * the command buffer
       */
      list_inithead(&cmd_buffer->pool_link);
      cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
   }

   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);

   *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}

static void
tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   tu_cs_finish(&cmd_buffer->cs);
   tu_cs_finish(&cmd_buffer->draw_cs);
   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
   tu_cs_finish(&cmd_buffer->sub_cs);

   vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
{
   cmd_buffer->record_result = VK_SUCCESS;

   tu_cs_reset(&cmd_buffer->cs);
   tu_cs_reset(&cmd_buffer->draw_cs);
   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
   tu_cs_reset(&cmd_buffer->sub_cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
      memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
   }

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         struct tu_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = tu_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         vk_object_base_reset(&cmd_buffer->base);

         pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       *  command buffers. If the creation of any of those command
       *  buffers fails, the implementation must destroy all
       *  successfully created command buffer objects from this
       *  command, set all entries of the pCommandBuffers array to
       *  NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}

VKAPI_ATTR void VKAPI_CALL
tu_FreeCommandBuffers(VkDevice device,
                      VkCommandPool commandPool,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCommandBuffers)
{
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);

      if (cmd_buffer) {
         if (cmd_buffer->pool) {
            list_del(&cmd_buffer->pool_link);
            list_addtail(&cmd_buffer->pool_link,
                         &cmd_buffer->pool->free_cmd_buffers);
         } else
            tu_cmd_buffer_destroy(cmd_buffer);
      }
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                      VkCommandBufferResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   return tu_reset_cmd_buffer(cmd_buffer);
}

/* Initialize the cache, assuming all necessary flushes have happened but *not*
 * invalidations.
 */
static void
tu_cache_init(struct tu_cache_state *cache)
{
   cache->flush_bits = 0;
   cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   VkResult result = VK_SUCCESS;

   if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, no need to do it again.
       */
      result = tu_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
   cmd_buffer->state.index_size = 0xff; /* dirty restart index */

   cmd_buffer->state.last_vs_params.first_instance = -1;
   cmd_buffer->state.last_vs_params.params_offset = -1;
   cmd_buffer->state.last_vs_params.vertex_offset = -1;

   tu_cache_init(&cmd_buffer->state.cache);
   tu_cache_init(&cmd_buffer->state.renderpass_cache);
   cmd_buffer->usage_flags = pBeginInfo->flags;

   tu_cs_begin(&cmd_buffer->cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* setup initial configuration into command buffer */
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      switch (cmd_buffer->queue_family_index) {
      case TU_QUEUE_GENERAL:
         tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
         break;
      default:
         break;
      }
   } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      assert(pBeginInfo->pInheritanceInfo);

      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
         switch (ext->sType) {
         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
            break;
         default:
            break;
         }
         }
      }

      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
         cmd_buffer->state.subpass =
            &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
      } else {
         /* When executing in the middle of another command buffer, the CCU
          * state is unknown.
          */
         cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
      }
   }

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                        uint32_t firstBinding,
                        uint32_t bindingCount,
                        const VkBuffer *pBuffers,
                        const VkDeviceSize *pOffsets)
{
   tu_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
                               pBuffers, pOffsets, NULL, NULL);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
                            uint32_t firstBinding,
                            uint32_t bindingCount,
                            const VkBuffer* pBuffers,
                            const VkDeviceSize* pOffsets,
                            const VkDeviceSize* pSizes,
                            const VkDeviceSize* pStrides)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;
   /* TODO: track a "max_vb" value for the cmdbuf to save a bit of memory */
   cmd->state.vertex_buffers.iova = tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * MAX_VBS).iova;

   for (uint32_t i = 0; i < bindingCount; i++) {
      if (pBuffers[i] == VK_NULL_HANDLE) {
         cmd->state.vb[firstBinding + i].base = 0;
         cmd->state.vb[firstBinding + i].size = 0;
      } else {
         struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
         cmd->state.vb[firstBinding + i].base = tu_buffer_iova(buf) + pOffsets[i];
         cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->size - pOffsets[i]);
      }

      if (pStrides)
         cmd->state.vb[firstBinding + i].stride = pStrides[i];
   }

   for (uint32_t i = 0; i < MAX_VBS; i++) {
      tu_cs_emit_regs(&cs,
                      A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
                      A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;

   if (pStrides) {
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
         tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * MAX_VBS).iova;

      for (uint32_t i = 0; i < MAX_VBS; i++)
         tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));

      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                      VkBuffer buffer,
                      VkDeviceSize offset,
                      VkIndexType indexType)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, buffer);

   uint32_t index_size, index_shift, restart_index;

   switch (indexType) {
   case VK_INDEX_TYPE_UINT16:
      index_size = INDEX4_SIZE_16_BIT;
      index_shift = 1;
      restart_index = 0xffff;
      break;
   case VK_INDEX_TYPE_UINT32:
      index_size = INDEX4_SIZE_32_BIT;
      index_shift = 2;
      restart_index = 0xffffffff;
      break;
   case VK_INDEX_TYPE_UINT8_EXT:
      index_size = INDEX4_SIZE_8_BIT;
      index_shift = 0;
      restart_index = 0xff;
      break;
   default:
      unreachable("invalid VkIndexType");
   }

   /* initialize/update the restart index */
   if (cmd->state.index_size != index_size)
      tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));

   assert(buf->size >= offset);

   cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
   cmd->state.max_index_count = (buf->size - offset) >> index_shift;
   cmd->state.index_size = index_size;
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipelineLayout _layout,
                         uint32_t firstSet,
                         uint32_t descriptorSetCount,
                         const VkDescriptorSet *pDescriptorSets,
                         uint32_t dynamicOffsetCount,
                         const uint32_t *pDynamicOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   struct tu_descriptor_state *descriptors_state =
      tu_get_descriptors_state(cmd, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned idx = i + firstSet;
      TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);

      descriptors_state->sets[idx] = set;

      for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
         /* update the contents of the dynamic descriptor set */
         unsigned src_idx = j;
         unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
         assert(dyn_idx < dynamicOffsetCount);

         uint32_t *dst =
            &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t *src =
            &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t offset = pDynamicOffsets[dyn_idx];

         /* Patch the storage/uniform descriptors right away. */
         if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
            /* Note: we can assume here that the addition won't roll over and
             * change the SIZE field.
             */
            uint64_t va = src[0] | ((uint64_t)src[1] << 32);
            va += offset;
            dst[0] = va;
            dst[1] = va >> 32;
         } else {
            memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
            /* Note: A6XX_IBO_5_DEPTH is always 0 */
            uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
            va += offset;
            dst[4] = va;
            dst[5] = va >> 32;
         }
      }
   }
   assert(dyn_idx == dynamicOffsetCount);

   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
   uint64_t addr[MAX_SETS + 1] = {};
   struct tu_cs *cs, state_cs;

   for (uint32_t i = 0; i < MAX_SETS; i++) {
      struct tu_descriptor_set *set = descriptors_state->sets[i];
      if (set)
         addr[i] = set->va | 3;
   }

   if (layout->dynamic_offset_count) {
      /* allocate and fill out dynamic descriptor set */
      struct tu_cs_memory dynamic_desc_set;
      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
   }

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);

      cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
      cs = &state_cs;
   } else {
      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);

      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);

      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
      cs = &cmd->cs;
   }

   tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      assert(cs->cur == cs->end); /* validate draw state size */
      /* note: this also avoids emitting draw states before renderpass clears,
       * which may use the 3D clear path (for MSAA cases)
       */
      if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
         tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
         tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
                           VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout,
                           uint32_t _set,
                           uint32_t descriptorWriteCount,
                           const VkWriteDescriptorSet *pDescriptorWrites)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
   struct tu_descriptor_set *set =
      &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;

   struct tu_cs_memory set_mem;
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
                                 A6XX_TEX_CONST_DWORDS, &set_mem);
result = tu_cs_alloc(&cmd->sub_cs,1788DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),1789A6XX_TEX_CONST_DWORDS, &set_mem);1790if (result != VK_SUCCESS) {1791cmd->record_result = result;1792return;1793}17941795/* preserve previous content if the layout is the same: */1796if (set->layout == layout)1797memcpy(set_mem.map, set->mapped_ptr, layout->size);17981799set->layout = layout;1800set->mapped_ptr = set_mem.map;1801set->va = set_mem.iova;18021803tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),1804descriptorWriteCount, pDescriptorWrites, 0, NULL);18051806tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,18071, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },18080, NULL);1809}18101811VKAPI_ATTR void VKAPI_CALL1812tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,1813VkDescriptorUpdateTemplate descriptorUpdateTemplate,1814VkPipelineLayout _layout,1815uint32_t _set,1816const void* pData)1817{1818TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1819TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);1820TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);1821struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;1822struct tu_descriptor_set *set =1823&tu_get_descriptors_state(cmd, templ->bind_point)->push_set;18241825struct tu_cs_memory set_mem;1826VkResult result = tu_cs_alloc(&cmd->sub_cs,1827DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),1828A6XX_TEX_CONST_DWORDS, &set_mem);1829if (result != VK_SUCCESS) {1830cmd->record_result = result;1831return;1832}18331834/* preserve previous content if the layout is the same: */1835if (set->layout == layout)1836memcpy(set_mem.map, set->mapped_ptr, layout->size);18371838set->layout = layout;1839set->mapped_ptr = set_mem.map;1840set->va = set_mem.iova;18411842tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);18431844tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,18451, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },18460, NULL);1847}18481849VKAPI_ATTR void VKAPI_CALL1850tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,1851uint32_t firstBinding,1852uint32_t bindingCount,1853const VkBuffer *pBuffers,1854const VkDeviceSize *pOffsets,1855const VkDeviceSize *pSizes)1856{1857TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1858struct tu_cs *cs = &cmd->draw_cs;18591860/* using COND_REG_EXEC for xfb commands matches the blob behavior1861* presumably there isn't any benefit using a draw state when the1862* condition is (SYSMEM | BINNING)1863*/1864tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |1865CP_COND_REG_EXEC_0_SYSMEM |1866CP_COND_REG_EXEC_0_BINNING);18671868for (uint32_t i = 0; i < bindingCount; i++) {1869TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);1870uint64_t iova = buf->bo->iova + pOffsets[i];1871uint32_t size = buf->bo->size - pOffsets[i];1872uint32_t idx = i + firstBinding;18731874if (pSizes && pSizes[i] != VK_WHOLE_SIZE)1875size = pSizes[i];18761877/* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */1878uint32_t offset = iova & 0x1f;1879iova &= ~(uint64_t) 0x1f;18801881tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);1882tu_cs_emit_qw(cs, iova);1883tu_cs_emit(cs, size + offset);18841885cmd->state.streamout_offset[idx] = offset;1886}18871888tu_cond_exec_end(cs);1889}18901891VKAPI_ATTR void VKAPI_CALL1892tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,1893uint32_t 
firstCounterBuffer,1894uint32_t counterBufferCount,1895const VkBuffer *pCounterBuffers,1896const VkDeviceSize *pCounterBufferOffsets)1897{1898TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1899struct tu_cs *cs = &cmd->draw_cs;19001901tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |1902CP_COND_REG_EXEC_0_SYSMEM |1903CP_COND_REG_EXEC_0_BINNING);19041905/* TODO: only update offset for active buffers */1906for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)1907tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));19081909for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {1910uint32_t idx = firstCounterBuffer + i;1911uint32_t offset = cmd->state.streamout_offset[idx];1912uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;19131914if (!pCounterBuffers[i])1915continue;19161917TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);19181919tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);1920tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |1921CP_MEM_TO_REG_0_UNK31 |1922CP_MEM_TO_REG_0_CNT(1));1923tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);19241925if (offset) {1926tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);1927tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |1928CP_REG_RMW_0_SRC1_ADD);1929tu_cs_emit_qw(cs, 0xffffffff);1930tu_cs_emit_qw(cs, offset);1931}1932}19331934tu_cond_exec_end(cs);1935}19361937VKAPI_ATTR void VKAPI_CALL1938tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,1939uint32_t firstCounterBuffer,1940uint32_t counterBufferCount,1941const VkBuffer *pCounterBuffers,1942const VkDeviceSize *pCounterBufferOffsets)1943{1944TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1945struct tu_cs *cs = &cmd->draw_cs;19461947tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |1948CP_COND_REG_EXEC_0_SYSMEM |1949CP_COND_REG_EXEC_0_BINNING);19501951/* TODO: only flush buffers that need to be flushed */1952for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {1953/* note: FLUSH_BASE is always the same, so it could go in init_hw()? */1954tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);1955tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));1956tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);1957}19581959for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {1960uint32_t idx = firstCounterBuffer + i;1961uint32_t offset = cmd->state.streamout_offset[idx];1962uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;19631964if (!pCounterBuffers[i])1965continue;19661967TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);19681969/* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */1970tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);1971tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |1972CP_MEM_TO_REG_0_SHIFT_BY_2 |19730x40000 | /* ??? 
*/1974CP_MEM_TO_REG_0_UNK31 |1975CP_MEM_TO_REG_0_CNT(1));1976tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));19771978if (offset) {1979tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);1980tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |1981CP_REG_RMW_0_SRC1_ADD);1982tu_cs_emit_qw(cs, 0xffffffff);1983tu_cs_emit_qw(cs, -offset);1984}19851986tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);1987tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |1988CP_REG_TO_MEM_0_CNT(1));1989tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);1990}19911992tu_cond_exec_end(cs);19931994cmd->state.xfb_used = true;1995}19961997VKAPI_ATTR void VKAPI_CALL1998tu_CmdPushConstants(VkCommandBuffer commandBuffer,1999VkPipelineLayout layout,2000VkShaderStageFlags stageFlags,2001uint32_t offset,2002uint32_t size,2003const void *pValues)2004{2005TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2006memcpy((void*) cmd->push_constants + offset, pValues, size);2007cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;2008}20092010/* Flush everything which has been made available but we haven't actually2011* flushed yet.2012*/2013static void2014tu_flush_all_pending(struct tu_cache_state *cache)2015{2016cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;2017cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;2018}20192020VKAPI_ATTR VkResult VKAPI_CALL2021tu_EndCommandBuffer(VkCommandBuffer commandBuffer)2022{2023TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);20242025/* We currently flush CCU at the end of the command buffer, like2026* what the blob does. There's implicit synchronization around every2027* vkQueueSubmit, but the kernel only flushes the UCHE, and we don't2028* know yet if this command buffer will be the last in the submit so we2029* have to defensively flush everything else.2030*2031* TODO: We could definitely do better than this, since these flushes2032* aren't required by Vulkan, but we'd need kernel support to do that.2033* Ideally, we'd like the kernel to flush everything afterwards, so that we2034* wouldn't have to do any flushes here, and when submitting multiple2035* command buffers there wouldn't be any unnecessary flushes in between.2036*/2037if (cmd_buffer->state.pass) {2038tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);2039tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);2040} else {2041tu_flush_all_pending(&cmd_buffer->state.cache);2042cmd_buffer->state.cache.flush_bits |=2043TU_CMD_FLAG_CCU_FLUSH_COLOR |2044TU_CMD_FLAG_CCU_FLUSH_DEPTH;2045tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);2046}20472048tu_cs_end(&cmd_buffer->cs);2049tu_cs_end(&cmd_buffer->draw_cs);2050tu_cs_end(&cmd_buffer->draw_epilogue_cs);20512052cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;20532054return cmd_buffer->record_result;2055}20562057static struct tu_cs2058tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)2059{2060struct tu_cs cs;20612062assert(id < ARRAY_SIZE(cmd->state.dynamic_state));2063cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);20642065/* note: this also avoids emitting draw states before renderpass clears,2066* which may use the 3D clear path (for MSAA cases)2067*/2068if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)2069return cs;20702071tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);2072tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);20732074return cs;2075}20762077VKAPI_ATTR void VKAPI_CALL2078tu_CmdBindPipeline(VkCommandBuffer 
commandBuffer,2079VkPipelineBindPoint pipelineBindPoint,2080VkPipeline _pipeline)2081{2082TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2083TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);20842085if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {2086cmd->state.compute_pipeline = pipeline;2087tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);2088return;2089}20902091assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);20922093cmd->state.pipeline = pipeline;2094cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | TU_CMD_DIRTY_LRZ;20952096/* note: this also avoids emitting draw states before renderpass clears,2097* which may use the 3D clear path (for MSAA cases)2098*/2099if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {2100struct tu_cs *cs = &cmd->draw_cs;2101uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);21022103tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));2104tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);2105tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);2106tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);2107tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);2108tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);2109tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);2110tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);21112112u_foreach_bit(i, mask)2113tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);2114}21152116/* the vertex_buffers draw state always contains all the currently2117* bound vertex buffers. update its size to only emit the vbs which2118* are actually used by the pipeline2119* note there is a HW optimization which makes it so the draw state2120* is not re-executed completely when only the size changes2121*/2122if (cmd->state.vertex_buffers.size != pipeline->num_vbs * 4) {2123cmd->state.vertex_buffers.size = pipeline->num_vbs * 4;2124cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;2125}21262127if ((pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE)) &&2128cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != pipeline->num_vbs * 2) {2129cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = pipeline->num_vbs * 2;2130cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;2131}21322133#define UPDATE_REG(X, Y) { \2134/* note: would be better to have pipeline bits already masked */ \2135uint32_t pipeline_bits = pipeline->X & pipeline->X##_mask; \2136if ((cmd->state.X & pipeline->X##_mask) != pipeline_bits) { \2137cmd->state.X &= ~pipeline->X##_mask; \2138cmd->state.X |= pipeline_bits; \2139cmd->state.dirty |= TU_CMD_DIRTY_##Y; \2140} \2141if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y))) \2142cmd->state.dirty &= ~TU_CMD_DIRTY_##Y; \2143}21442145/* these registers can have bits set from both pipeline and dynamic state2146* this updates the bits set by the pipeline2147* if the pipeline doesn't use a dynamic state for the register, then2148* the relevant dirty bit is cleared to avoid overriding the non-dynamic2149* state with a dynamic state the next draw.2150*/2151UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL);2152UPDATE_REG(rb_depth_cntl, RB_DEPTH_CNTL);2153UPDATE_REG(rb_stencil_cntl, RB_STENCIL_CNTL);2154#undef UPDATE_REG21552156if (pipeline->rb_depth_cntl_disable)2157cmd->state.dirty |= 
TU_CMD_DIRTY_RB_DEPTH_CNTL;2158}21592160VKAPI_ATTR void VKAPI_CALL2161tu_CmdSetViewport(VkCommandBuffer commandBuffer,2162uint32_t firstViewport,2163uint32_t viewportCount,2164const VkViewport *pViewports)2165{2166TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2167struct tu_cs cs;21682169memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));2170cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);21712172cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);2173tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport);2174}21752176VKAPI_ATTR void VKAPI_CALL2177tu_CmdSetScissor(VkCommandBuffer commandBuffer,2178uint32_t firstScissor,2179uint32_t scissorCount,2180const VkRect2D *pScissors)2181{2182TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2183struct tu_cs cs;21842185memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));2186cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);21872188cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);2189tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);2190}21912192VKAPI_ATTR void VKAPI_CALL2193tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)2194{2195TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);21962197cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;2198cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);21992200cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;2201}22022203VKAPI_ATTR void VKAPI_CALL2204tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,2205float depthBiasConstantFactor,2206float depthBiasClamp,2207float depthBiasSlopeFactor)2208{2209TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2210struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);22112212tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);2213}22142215VKAPI_ATTR void VKAPI_CALL2216tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,2217const float blendConstants[4])2218{2219TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2220struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);22212222tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);2223tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);2224}22252226VKAPI_ATTR void VKAPI_CALL2227tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,2228float minDepthBounds,2229float maxDepthBounds)2230{2231TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2232struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);22332234tu_cs_emit_regs(&cs,2235A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),2236A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));2237}22382239void2240update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)2241{2242if (face & VK_STENCIL_FACE_FRONT_BIT)2243*value = (*value & 0xff00) | (mask & 0xff);2244if (face & VK_STENCIL_FACE_BACK_BIT)2245*value = (*value & 0xff) | (mask & 0xff) << 8;2246}22472248VKAPI_ATTR void VKAPI_CALL2249tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,2250VkStencilFaceFlags faceMask,2251uint32_t compareMask)2252{2253TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2254struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);22552256update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);22572258tu_cs_emit_regs(&cs, 
A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));2259}22602261VKAPI_ATTR void VKAPI_CALL2262tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,2263VkStencilFaceFlags faceMask,2264uint32_t writeMask)2265{2266TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2267struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);22682269update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);22702271tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));22722273cmd->state.dirty |= TU_CMD_DIRTY_LRZ;2274}22752276VKAPI_ATTR void VKAPI_CALL2277tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,2278VkStencilFaceFlags faceMask,2279uint32_t reference)2280{2281TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2282struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);22832284update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);22852286tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));2287}22882289VKAPI_ATTR void VKAPI_CALL2290tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,2291const VkSampleLocationsInfoEXT* pSampleLocationsInfo)2292{2293TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2294struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);22952296assert(pSampleLocationsInfo);22972298tu6_emit_sample_locations(&cs, pSampleLocationsInfo);2299}23002301VKAPI_ATTR void VKAPI_CALL2302tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)2303{2304TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23052306cmd->state.gras_su_cntl &=2307~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK);23082309if (cullMode & VK_CULL_MODE_FRONT_BIT)2310cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;2311if (cullMode & VK_CULL_MODE_BACK_BIT)2312cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;23132314cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;2315}23162317VKAPI_ATTR void VKAPI_CALL2318tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)2319{2320TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23212322cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;23232324if (frontFace == VK_FRONT_FACE_CLOCKWISE)2325cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;23262327cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;2328}23292330VKAPI_ATTR void VKAPI_CALL2331tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,2332VkPrimitiveTopology primitiveTopology)2333{2334TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23352336cmd->state.primtype = tu6_primtype(primitiveTopology);2337}23382339VKAPI_ATTR void VKAPI_CALL2340tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer,2341uint32_t viewportCount,2342const VkViewport* pViewports)2343{2344tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);2345}23462347VKAPI_ATTR void VKAPI_CALL2348tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer,2349uint32_t scissorCount,2350const VkRect2D* pScissors)2351{2352tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);2353}23542355VKAPI_ATTR void VKAPI_CALL2356tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer,2357VkBool32 depthTestEnable)2358{2359TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23602361cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_ENABLE;23622363if (depthTestEnable)2364cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_ENABLE;23652366cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;2367}23682369VKAPI_ATTR 
void VKAPI_CALL2370tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer,2371VkBool32 depthWriteEnable)2372{2373TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23742375cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;23762377if (depthWriteEnable)2378cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;23792380cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;2381}23822383VKAPI_ATTR void VKAPI_CALL2384tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer,2385VkCompareOp depthCompareOp)2386{2387TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);23882389cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;23902391cmd->state.rb_depth_cntl |=2392A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp));23932394cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;2395}23962397VKAPI_ATTR void VKAPI_CALL2398tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer,2399VkBool32 depthBoundsTestEnable)2400{2401TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);24022403cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;24042405if (depthBoundsTestEnable)2406cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;24072408cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;2409}24102411VKAPI_ATTR void VKAPI_CALL2412tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer,2413VkBool32 stencilTestEnable)2414{2415TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);24162417cmd->state.rb_stencil_cntl &= ~(2418A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |2419A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |2420A6XX_RB_STENCIL_CONTROL_STENCIL_READ);24212422if (stencilTestEnable) {2423cmd->state.rb_stencil_cntl |=2424A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |2425A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |2426A6XX_RB_STENCIL_CONTROL_STENCIL_READ;2427}24282429cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;2430}24312432VKAPI_ATTR void VKAPI_CALL2433tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer,2434VkStencilFaceFlags faceMask,2435VkStencilOp failOp,2436VkStencilOp passOp,2437VkStencilOp depthFailOp,2438VkCompareOp compareOp)2439{2440TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);24412442if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {2443cmd->state.rb_stencil_cntl &= ~(2444A6XX_RB_STENCIL_CONTROL_FUNC__MASK |2445A6XX_RB_STENCIL_CONTROL_FAIL__MASK |2446A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |2447A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);24482449cmd->state.rb_stencil_cntl |=2450A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |2451A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |2452A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |2453A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));2454}24552456if (faceMask & VK_STENCIL_FACE_BACK_BIT) {2457cmd->state.rb_stencil_cntl &= ~(2458A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |2459A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |2460A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |2461A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);24622463cmd->state.rb_stencil_cntl |=2464A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |2465A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |2466A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |2467A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));2468}24692470cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;2471}24722473static void2474tu_flush_for_access(struct tu_cache_state *cache,2475enum tu_cmd_access_mask src_mask,2476enum tu_cmd_access_mask dst_mask)2477{2478enum tu_cmd_flush_bits flush_bits = 0;24792480if (src_mask & TU_ACCESS_HOST_WRITE) {2481/* Host writes are 
always visible to CP, so only invalidate GPU caches */2482cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE;2483}24842485if (src_mask & TU_ACCESS_SYSMEM_WRITE) {2486/* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as2487* well.2488*/2489cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;2490}24912492if (src_mask & TU_ACCESS_CP_WRITE) {2493/* Flush the CP write queue. However a WFI shouldn't be necessary as2494* WAIT_MEM_WRITES should cover it.2495*/2496cache->pending_flush_bits |=2497TU_CMD_FLAG_WAIT_MEM_WRITES |2498TU_CMD_FLAG_GPU_INVALIDATE |2499TU_CMD_FLAG_WAIT_FOR_ME;2500}25012502#define SRC_FLUSH(domain, flush, invalidate) \2503if (src_mask & TU_ACCESS_##domain##_WRITE) { \2504cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \2505(TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \2506}25072508SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)2509SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)2510SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)25112512#undef SRC_FLUSH25132514#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \2515if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \2516flush_bits |= TU_CMD_FLAG_##flush; \2517cache->pending_flush_bits |= \2518(TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \2519}25202521SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)2522SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)25232524#undef SRC_INCOHERENT_FLUSH25252526/* Treat host & sysmem write accesses the same, since the kernel implicitly2527* drains the queue before signalling completion to the host.2528*/2529if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE |2530TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) {2531flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;2532}25332534#define DST_FLUSH(domain, flush, invalidate) \2535if (dst_mask & (TU_ACCESS_##domain##_READ | \2536TU_ACCESS_##domain##_WRITE)) { \2537flush_bits |= cache->pending_flush_bits & \2538(TU_CMD_FLAG_##invalidate | \2539(TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \2540}25412542DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)2543DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)2544DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)25452546#undef DST_FLUSH25472548#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \2549if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ | \2550TU_ACCESS_##domain##_INCOHERENT_WRITE)) { \2551flush_bits |= TU_CMD_FLAG_##invalidate | \2552(cache->pending_flush_bits & \2553(TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \2554}25552556DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)2557DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)25582559#undef DST_INCOHERENT_FLUSH25602561if (dst_mask & TU_ACCESS_WFI_READ) {2562flush_bits |= cache->pending_flush_bits &2563(TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE);2564}25652566if (dst_mask & TU_ACCESS_WFM_READ) {2567flush_bits |= cache->pending_flush_bits &2568(TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME);2569}25702571cache->flush_bits |= flush_bits;2572cache->pending_flush_bits &= ~flush_bits;2573}25742575static enum tu_cmd_access_mask2576vk2tu_access(VkAccessFlags flags, bool gmem)2577{2578enum tu_cmd_access_mask mask = 0;25792580/* If the GPU writes a buffer that is then read by an indirect draw2581* command, we theoretically need to emit a WFI to wait for any cache2582* flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI 
to2583* complete. Waiting for the WFI to complete is performed as part of the2584* draw by the firmware, so we just need to execute the WFI.2585*2586* Transform feedback counters are read via CP_MEM_TO_REG, which implicitly2587* does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.2588*2589* Currently we read the draw predicate using CP_MEM_TO_MEM, which2590* also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*2591* implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to2592* complete since it's written for DX11 where you can only predicate on the2593* result of a query object. So if we implement 64-bit comparisons in the2594* future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit2595* comparisons, then this will have to be dealt with.2596*/2597if (flags &2598(VK_ACCESS_INDIRECT_COMMAND_READ_BIT |2599VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |2600VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT |2601VK_ACCESS_MEMORY_READ_BIT)) {2602mask |= TU_ACCESS_WFI_READ;2603}26042605if (flags &2606(VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */2607VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */2608VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */2609VK_ACCESS_MEMORY_READ_BIT)) {2610mask |= TU_ACCESS_SYSMEM_READ;2611}26122613if (flags &2614(VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |2615VK_ACCESS_MEMORY_WRITE_BIT)) {2616mask |= TU_ACCESS_CP_WRITE;2617}26182619if (flags &2620(VK_ACCESS_HOST_READ_BIT |2621VK_ACCESS_MEMORY_WRITE_BIT)) {2622mask |= TU_ACCESS_HOST_READ;2623}26242625if (flags &2626(VK_ACCESS_HOST_WRITE_BIT |2627VK_ACCESS_MEMORY_WRITE_BIT)) {2628mask |= TU_ACCESS_HOST_WRITE;2629}26302631if (flags &2632(VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */2633VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */2634VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */2635/* TODO: Is there a no-cache bit for textures so that we can ignore2636* these?2637*/2638VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */2639VK_ACCESS_SHADER_READ_BIT | /* Read perfomed by SP/TP */2640VK_ACCESS_MEMORY_READ_BIT)) {2641mask |= TU_ACCESS_UCHE_READ;2642}26432644if (flags &2645(VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */2646VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */2647VK_ACCESS_MEMORY_WRITE_BIT)) {2648mask |= TU_ACCESS_UCHE_WRITE;2649}26502651/* When using GMEM, the CCU is always flushed automatically to GMEM, and2652* then GMEM is flushed to sysmem. Furthermore, we already had to flush any2653* previous writes in sysmem mode when transitioning to GMEM. 
Therefore we2654* can ignore CCU and pretend that color attachments and transfers use2655* sysmem directly.2656*/26572658if (flags &2659(VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |2660VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |2661VK_ACCESS_MEMORY_READ_BIT)) {2662if (gmem)2663mask |= TU_ACCESS_SYSMEM_READ;2664else2665mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;2666}26672668if (flags &2669(VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |2670VK_ACCESS_MEMORY_READ_BIT)) {2671if (gmem)2672mask |= TU_ACCESS_SYSMEM_READ;2673else2674mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;2675}26762677if (flags &2678(VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |2679VK_ACCESS_MEMORY_WRITE_BIT)) {2680if (gmem) {2681mask |= TU_ACCESS_SYSMEM_WRITE;2682} else {2683mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;2684}2685}26862687if (flags &2688(VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |2689VK_ACCESS_MEMORY_WRITE_BIT)) {2690if (gmem) {2691mask |= TU_ACCESS_SYSMEM_WRITE;2692} else {2693mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;2694}2695}26962697/* When the dst access is a transfer read/write, it seems we sometimes need2698* to insert a WFI after any flushes, to guarantee that the flushes finish2699* before the 2D engine starts. However the opposite (i.e. a WFI after2700* CP_BLIT and before any subsequent flush) does not seem to be needed, and2701* the blob doesn't emit such a WFI.2702*/27032704if (flags &2705(VK_ACCESS_TRANSFER_WRITE_BIT |2706VK_ACCESS_MEMORY_WRITE_BIT)) {2707if (gmem) {2708mask |= TU_ACCESS_SYSMEM_WRITE;2709} else {2710mask |= TU_ACCESS_CCU_COLOR_WRITE;2711}2712mask |= TU_ACCESS_WFI_READ;2713}27142715if (flags &2716(VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */2717VK_ACCESS_MEMORY_READ_BIT)) {2718mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ;2719}27202721return mask;2722}272327242725VKAPI_ATTR void VKAPI_CALL2726tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,2727uint32_t commandBufferCount,2728const VkCommandBuffer *pCmdBuffers)2729{2730TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2731VkResult result;27322733assert(commandBufferCount > 0);27342735/* Emit any pending flushes. 
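* Anything this command buffer has made available so far also has to be made actually visible before the secondary command streams execute, since their barriers were recorded without knowledge of the primary's pending flush state; the cache re-init after the loop below handles whatever the secondaries themselves may have flushed or invalidated.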
*/2736if (cmd->state.pass) {2737tu_flush_all_pending(&cmd->state.renderpass_cache);2738tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);2739} else {2740tu_flush_all_pending(&cmd->state.cache);2741tu_emit_cache_flush(cmd, &cmd->cs);2742}27432744for (uint32_t i = 0; i < commandBufferCount; i++) {2745TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);27462747if (secondary->usage_flags &2748VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {2749assert(tu_cs_is_empty(&secondary->cs));27502751result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);2752if (result != VK_SUCCESS) {2753cmd->record_result = result;2754break;2755}27562757result = tu_cs_add_entries(&cmd->draw_epilogue_cs,2758&secondary->draw_epilogue_cs);2759if (result != VK_SUCCESS) {2760cmd->record_result = result;2761break;2762}27632764if (secondary->state.has_tess)2765cmd->state.has_tess = true;2766if (secondary->state.has_subpass_predication)2767cmd->state.has_subpass_predication = true;2768} else {2769assert(tu_cs_is_empty(&secondary->draw_cs));2770assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));27712772tu_cs_add_entries(&cmd->cs, &secondary->cs);2773}27742775cmd->state.index_size = secondary->state.index_size; /* for restart index update */2776}2777cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */27782779if (cmd->state.pass) {2780/* After a secondary command buffer is executed, LRZ is not valid2781* until it is cleared again.2782*/2783cmd->state.lrz.valid = false;2784}27852786/* After executing secondary command buffers, there may have been arbitrary2787* flushes executed, so when we encounter a pipeline barrier with a2788* srcMask, we have to assume that we need to invalidate. Therefore we need2789* to re-initialize the cache with all pending invalidate bits set.2790*/2791if (cmd->state.pass) {2792tu_cache_init(&cmd->state.renderpass_cache);2793} else {2794tu_cache_init(&cmd->state.cache);2795}2796}27972798VKAPI_ATTR VkResult VKAPI_CALL2799tu_CreateCommandPool(VkDevice _device,2800const VkCommandPoolCreateInfo *pCreateInfo,2801const VkAllocationCallbacks *pAllocator,2802VkCommandPool *pCmdPool)2803{2804TU_FROM_HANDLE(tu_device, device, _device);2805struct tu_cmd_pool *pool;28062807pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),2808VK_OBJECT_TYPE_COMMAND_POOL);2809if (pool == NULL)2810return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);28112812if (pAllocator)2813pool->alloc = *pAllocator;2814else2815pool->alloc = device->vk.alloc;28162817list_inithead(&pool->cmd_buffers);2818list_inithead(&pool->free_cmd_buffers);28192820pool->queue_family_index = pCreateInfo->queueFamilyIndex;28212822*pCmdPool = tu_cmd_pool_to_handle(pool);28232824return VK_SUCCESS;2825}28262827VKAPI_ATTR void VKAPI_CALL2828tu_DestroyCommandPool(VkDevice _device,2829VkCommandPool commandPool,2830const VkAllocationCallbacks *pAllocator)2831{2832TU_FROM_HANDLE(tu_device, device, _device);2833TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);28342835if (!pool)2836return;28372838list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,2839&pool->cmd_buffers, pool_link)2840{2841tu_cmd_buffer_destroy(cmd_buffer);2842}28432844list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,2845&pool->free_cmd_buffers, pool_link)2846{2847tu_cmd_buffer_destroy(cmd_buffer);2848}28492850vk_object_free(&device->vk, pAllocator, pool);2851}28522853VKAPI_ATTR VkResult VKAPI_CALL2854tu_ResetCommandPool(VkDevice device,2855VkCommandPool commandPool,2856VkCommandPoolResetFlags flags)2857{2858TU_FROM_HANDLE(tu_cmd_pool, pool, 
commandPool);2859VkResult result;28602861list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,2862pool_link)2863{2864result = tu_reset_cmd_buffer(cmd_buffer);2865if (result != VK_SUCCESS)2866return result;2867}28682869return VK_SUCCESS;2870}28712872VKAPI_ATTR void VKAPI_CALL2873tu_TrimCommandPool(VkDevice device,2874VkCommandPool commandPool,2875VkCommandPoolTrimFlags flags)2876{2877TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);28782879if (!pool)2880return;28812882list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,2883&pool->free_cmd_buffers, pool_link)2884{2885tu_cmd_buffer_destroy(cmd_buffer);2886}2887}28882889static void2890tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,2891const struct tu_subpass_barrier *barrier,2892bool external)2893{2894/* Note: we don't know until the end of the subpass whether we'll use2895* sysmem, so assume sysmem here to be safe.2896*/2897struct tu_cache_state *cache =2898external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;2899enum tu_cmd_access_mask src_flags =2900vk2tu_access(barrier->src_access_mask, false);2901enum tu_cmd_access_mask dst_flags =2902vk2tu_access(barrier->dst_access_mask, false);29032904if (barrier->incoherent_ccu_color)2905src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;2906if (barrier->incoherent_ccu_depth)2907src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;29082909tu_flush_for_access(cache, src_flags, dst_flags);2910}29112912VKAPI_ATTR void VKAPI_CALL2913tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,2914const VkRenderPassBeginInfo *pRenderPassBegin,2915const VkSubpassBeginInfo *pSubpassBeginInfo)2916{2917TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2918TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);2919TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);29202921cmd->state.pass = pass;2922cmd->state.subpass = pass->subpasses;2923cmd->state.framebuffer = fb;2924cmd->state.render_area = pRenderPassBegin->renderArea;29252926tu_cmd_prepare_tile_store_ib(cmd);29272928/* Note: because this is external, any flushes will happen before draw_cs2929* gets called. 
However deferred flushes could have to happen later as part2930* of the subpass.2931*/2932tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);2933cmd->state.renderpass_cache.pending_flush_bits =2934cmd->state.cache.pending_flush_bits;2935cmd->state.renderpass_cache.flush_bits = 0;29362937/* Track LRZ valid state */2938uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;2939if (a != VK_ATTACHMENT_UNUSED) {2940const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];2941struct tu_image *image = fb->attachments[a].attachment->image;2942/* if image has lrz and it isn't a stencil-only clear: */2943if (image->lrz_height &&2944(att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {2945cmd->state.lrz.image = image;2946cmd->state.lrz.valid = true;2947cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;29482949tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);2950tu6_emit_event_write(cmd, &cmd->cs, PC_CCU_FLUSH_COLOR_TS);2951} else {2952cmd->state.lrz.valid = false;2953}2954cmd->state.dirty |= TU_CMD_DIRTY_LRZ;2955}29562957tu_emit_renderpass_begin(cmd, pRenderPassBegin);29582959tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);2960tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);2961if (cmd->state.subpass->samples)2962tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples);2963tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);29642965tu_set_input_attachments(cmd, cmd->state.subpass);2966}29672968VKAPI_ATTR void VKAPI_CALL2969tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,2970const VkSubpassBeginInfo *pSubpassBeginInfo,2971const VkSubpassEndInfo *pSubpassEndInfo)2972{2973TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2974const struct tu_render_pass *pass = cmd->state.pass;2975struct tu_cs *cs = &cmd->draw_cs;29762977const struct tu_subpass *subpass = cmd->state.subpass++;29782979/* Track LRZ valid state2980*2981* TODO: Improve this tracking for keeping the state of the past depth/stencil images,2982* so if they become active again, we reuse its old state.2983*/2984cmd->state.lrz.valid = false;2985cmd->state.dirty |= TU_CMD_DIRTY_LRZ;29862987tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);29882989if (subpass->resolve_attachments) {2990tu6_emit_blit_scissor(cmd, cs, true);29912992for (unsigned i = 0; i < subpass->resolve_count; i++) {2993uint32_t a = subpass->resolve_attachments[i].attachment;2994if (a == VK_ATTACHMENT_UNUSED)2995continue;29962997uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);29982999tu_store_gmem_attachment(cmd, cs, a, gmem_a);30003001if (pass->attachments[a].gmem_offset < 0)3002continue;30033004/* TODO:3005* check if the resolved attachment is needed by later subpasses,3006* if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..3007*/3008tu_finishme("missing GMEM->GMEM resolve path\n");3009tu_load_gmem_attachment(cmd, cs, a, true);3010}3011}30123013tu_cond_exec_end(cs);30143015tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);30163017tu6_emit_sysmem_resolves(cmd, cs, subpass);30183019tu_cond_exec_end(cs);30203021/* Handle dependencies for the next subpass */3022tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);30233024/* emit mrt/zs/msaa/ubwc state for the subpass that is starting */3025tu6_emit_zs(cmd, cmd->state.subpass, cs);3026tu6_emit_mrt(cmd, cmd->state.subpass, cs);3027if (cmd->state.subpass->samples)3028tu6_emit_msaa(cs, cmd->state.subpass->samples);3029tu6_emit_render_cntl(cmd, 
cmd->state.subpass, cs, false);30303031tu_set_input_attachments(cmd, cmd->state.subpass);3032}30333034static uint32_t3035tu6_user_consts_size(const struct tu_pipeline *pipeline,3036struct tu_descriptor_state *descriptors_state,3037gl_shader_stage type)3038{3039const struct tu_program_descriptor_linkage *link =3040&pipeline->program.link[type];3041const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;3042uint32_t dwords = 0;30433044if (link->push_consts.count > 0) {3045unsigned num_units = link->push_consts.count;3046dwords += 4 + num_units * 4;3047}30483049for (uint32_t i = 0; i < state->num_enabled; i++) {3050uint32_t size = state->range[i].end - state->range[i].start;30513052size = MIN2(size, (16 * link->constlen) - state->range[i].offset);30533054if (size == 0)3055continue;30563057if (!state->range[i].ubo.bindless)3058continue;30593060uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?3061descriptors_state->dynamic_descriptors :3062descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;3063unsigned block = state->range[i].ubo.block;3064uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;3065uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;3066desc_size = desc_size > state->range[i].start ?3067desc_size - state->range[i].start : 0;30683069if (desc_size < size) {3070uint32_t zero_size = size - desc_size;3071dwords += 4 + zero_size / 4;3072size = desc_size;3073}30743075if (size > 0) {3076dwords += 4;3077}3078}30793080return dwords;3081}30823083static void3084tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,3085struct tu_descriptor_state *descriptors_state,3086gl_shader_stage type,3087uint32_t *push_constants)3088{3089const struct tu_program_descriptor_linkage *link =3090&pipeline->program.link[type];3091const struct ir3_const_state *const_state = &link->const_state;3092const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;30933094if (link->push_consts.count > 0) {3095unsigned num_units = link->push_consts.count;3096unsigned offset = link->push_consts.lo;3097tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);3098tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |3099CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3100CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |3101CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |3102CP_LOAD_STATE6_0_NUM_UNIT(num_units));3103tu_cs_emit(cs, 0);3104tu_cs_emit(cs, 0);3105for (unsigned i = 0; i < num_units * 4; i++)3106tu_cs_emit(cs, push_constants[i + offset * 4]);3107}31083109for (uint32_t i = 0; i < state->num_enabled; i++) {3110uint32_t size = state->range[i].end - state->range[i].start;3111uint32_t offset = state->range[i].start;31123113/* and even if the start of the const buffer is before3114* first_immediate, the end may not be:3115*/3116size = MIN2(size, (16 * link->constlen) - state->range[i].offset);31173118if (size == 0)3119continue;31203121/* things should be aligned to vec4: */3122debug_assert((state->range[i].offset % 16) == 0);3123debug_assert((size % 16) == 0);3124debug_assert((offset % 16) == 0);31253126/* Dig out the descriptor from the descriptor state and read the VA from3127* it. 
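* A bindless_base of MAX_SETS selects the internal dynamic descriptor set that tu_CmdBindDescriptorSets assembles for dynamic UBO/SSBO bindings, which is why dynamic_descriptors is read instead of a bound set in that case.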
All our UBOs are bindless with the exception of the NIR3128* constant_data, which is uploaded once in the pipeline.3129*/3130if (!state->range[i].ubo.bindless) {3131assert(state->range[i].ubo.block == const_state->constant_data_ubo);3132continue;3133}31343135uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?3136descriptors_state->dynamic_descriptors :3137descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;3138unsigned block = state->range[i].ubo.block;3139uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;3140uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);3141uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;3142desc_size = desc_size > state->range[i].start ?3143desc_size - state->range[i].start : 0;31443145/* Handle null UBO descriptors and out-of-range UBO reads by filling the3146* rest with 0, simulating what reading with ldc would do. This behavior3147* is required by VK_EXT_robustness2.3148*/3149if (desc_size < size) {3150uint32_t zero_size = size - desc_size;3151uint32_t zero_offset = state->range[i].offset + desc_size;3152tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);3153tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |3154CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3155CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |3156CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |3157CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));3158tu_cs_emit_qw(cs, 0);3159for (unsigned i = 0; i < zero_size / 4; i++) {3160tu_cs_emit(cs, 0);3161}3162size = desc_size;3163}31643165if (size > 0) {3166assert(va);3167tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);3168tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |3169CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3170CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |3171CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |3172CP_LOAD_STATE6_0_NUM_UNIT(size / 16));3173tu_cs_emit_qw(cs, va + offset);3174}3175}3176}31773178static struct tu_draw_state3179tu6_emit_consts(struct tu_cmd_buffer *cmd,3180const struct tu_pipeline *pipeline,3181struct tu_descriptor_state *descriptors_state,3182gl_shader_stage type)3183{3184uint32_t dwords = tu6_user_consts_size(pipeline, descriptors_state, type);3185if (dwords == 0)3186return (struct tu_draw_state) {};31873188struct tu_cs cs;3189tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);31903191tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);31923193return tu_cs_end_draw_state(&cmd->sub_cs, &cs);3194}31953196static struct tu_draw_state3197tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,3198const struct tu_pipeline *pipeline,3199struct tu_descriptor_state *descriptors_state)3200{3201uint32_t dwords = 0;32023203for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)3204dwords += tu6_user_consts_size(pipeline, descriptors_state, type);32053206if (dwords == 0)3207return (struct tu_draw_state) {};32083209struct tu_cs cs;3210tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);32113212for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)3213tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);32143215return tu_cs_end_draw_state(&cmd->sub_cs, &cs);3216}32173218static uint64_t3219get_tess_param_bo_size(const struct tu_pipeline *pipeline,3220uint32_t draw_count)3221{3222/* TODO: For indirect draws, we can't compute the BO size ahead of time.3223* Still not sure what to do here, so just allocate a reasonably large3224* BO and 
hope for the best for now. */3225if (!draw_count)3226draw_count = 2048;32273228/* the tess param BO is pipeline->tess.param_stride bytes per patch,3229* which includes both the per-vertex outputs and per-patch outputs3230* build_primitive_map in ir3 calculates this stride3231*/3232uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;3233uint32_t num_patches = draw_count / verts_per_patch;3234return num_patches * pipeline->tess.param_stride;3235}32363237static uint64_t3238get_tess_factor_bo_size(const struct tu_pipeline *pipeline,3239uint32_t draw_count)3240{3241/* TODO: For indirect draws, we can't compute the BO size ahead of time.3242* Still not sure what to do here, so just allocate a reasonably large3243* BO and hope for the best for now. */3244if (!draw_count)3245draw_count = 2048;32463247/* Each distinct patch gets its own tess factor output. */3248uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;3249uint32_t num_patches = draw_count / verts_per_patch;3250uint32_t factor_stride;3251switch (pipeline->tess.patch_type) {3252case IR3_TESS_ISOLINES:3253factor_stride = 12;3254break;3255case IR3_TESS_TRIANGLES:3256factor_stride = 20;3257break;3258case IR3_TESS_QUADS:3259factor_stride = 28;3260break;3261default:3262unreachable("bad tessmode");3263}3264return factor_stride * num_patches;3265}32663267static VkResult3268tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,3269uint32_t draw_count,3270const struct tu_pipeline *pipeline,3271struct tu_draw_state *state,3272uint64_t *factor_iova)3273{3274struct tu_cs cs;3275VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);3276if (result != VK_SUCCESS)3277return result;32783279const struct tu_program_descriptor_linkage *hs_link =3280&pipeline->program.link[MESA_SHADER_TESS_CTRL];3281bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;32823283const struct tu_program_descriptor_linkage *ds_link =3284&pipeline->program.link[MESA_SHADER_TESS_EVAL];3285bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;32863287uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);3288uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);3289uint64_t tess_bo_size = tess_factor_size + tess_param_size;3290if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {3291struct tu_bo *tess_bo;3292result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);3293if (result != VK_SUCCESS)3294return result;32953296uint64_t tess_factor_iova = tess_bo->iova;3297uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;32983299if (hs_uses_bo) {3300tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);3301tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |3302CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3303CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |3304CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |3305CP_LOAD_STATE6_0_NUM_UNIT(1));3306tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));3307tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));3308tu_cs_emit_qw(&cs, tess_param_iova);3309tu_cs_emit_qw(&cs, tess_factor_iova);3310}33113312if (ds_uses_bo) {3313tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);3314tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |3315CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3316CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |3317CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |3318CP_LOAD_STATE6_0_NUM_UNIT(1));3319tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));3320tu_cs_emit(&cs, 
CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));3321tu_cs_emit_qw(&cs, tess_param_iova);3322tu_cs_emit_qw(&cs, tess_factor_iova);3323}33243325*factor_iova = tess_factor_iova;3326}3327*state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);3328return VK_SUCCESS;3329}33303331static enum tu_lrz_direction3332tu6_lrz_depth_mode(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,3333VkCompareOp depthCompareOp,3334bool *invalidate_lrz)3335{3336enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;33373338/* LRZ does not support some depth modes. */3339switch (depthCompareOp) {3340case VK_COMPARE_OP_ALWAYS:3341case VK_COMPARE_OP_NOT_EQUAL:3342*invalidate_lrz = true;3343gras_lrz_cntl->lrz_write = false;3344break;3345case VK_COMPARE_OP_EQUAL:3346case VK_COMPARE_OP_NEVER:3347gras_lrz_cntl->lrz_write = false;3348break;3349case VK_COMPARE_OP_GREATER:3350case VK_COMPARE_OP_GREATER_OR_EQUAL:3351lrz_direction = TU_LRZ_GREATER;3352gras_lrz_cntl->greater = true;3353break;3354case VK_COMPARE_OP_LESS:3355case VK_COMPARE_OP_LESS_OR_EQUAL:3356lrz_direction = TU_LRZ_LESS;3357break;3358default:3359unreachable("bad VK_COMPARE_OP value or uninitialized");3360break;3361};33623363return lrz_direction;3364}33653366/* update lrz state based on stencil-test func:3367*3368* Conceptually the order of the pipeline is:3369*3370*3371* FS -> Alpha-Test -> Stencil-Test -> Depth-Test3372* | |3373* if wrmask != 0 if wrmask != 03374* | |3375* v v3376* Stencil-Write Depth-Write3377*3378* Because Stencil-Test can have side effects (Stencil-Write) prior3379* to depth test, in this case we potentially need to disable early3380* lrz-test. See:3381*3382* https://www.khronos.org/opengl/wiki/Per-Sample_Processing3383*/3384static void3385tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,3386VkCompareOp func,3387bool stencil_write,3388bool *invalidate_lrz)3389{3390switch (func) {3391case VK_COMPARE_OP_ALWAYS:3392/* nothing to do for LRZ, but for stencil test when stencil-3393* write is enabled, we need to disable lrz-test, since3394* conceptually stencil test and write happens before depth-test.3395*/3396if (stencil_write) {3397gras_lrz_cntl->enable = false;3398gras_lrz_cntl->z_test_enable = false;3399*invalidate_lrz = true;3400}3401break;3402case VK_COMPARE_OP_NEVER:3403/* fragment never passes, disable lrz_write for this draw. 
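* Since no fragment can reach the depth test, the depth buffer is left untouched and the existing LRZ data stays valid, so no invalidate is needed here.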
*/3404gras_lrz_cntl->lrz_write = false;3405break;3406default:3407/* whether the fragment passes or not depends on result3408* of stencil test, which we cannot know when doing binning3409* pass.3410*/3411gras_lrz_cntl->lrz_write = false;3412/* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-3413* effects from stencil test we need to disable lrz-test.3414*/3415if (stencil_write) {3416gras_lrz_cntl->enable = false;3417gras_lrz_cntl->z_test_enable = false;3418*invalidate_lrz = true;3419}3420break;3421}3422}34233424static struct A6XX_GRAS_LRZ_CNTL3425tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,3426const uint32_t a)3427{3428struct tu_pipeline *pipeline = cmd->state.pipeline;3429struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };3430bool invalidate_lrz = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ;3431bool force_disable_write = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE;3432enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;34333434gras_lrz_cntl.enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE;3435gras_lrz_cntl.lrz_write = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;3436gras_lrz_cntl.z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;3437gras_lrz_cntl.z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;34383439VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;3440lrz_direction = tu6_lrz_depth_mode(&gras_lrz_cntl, depth_compare_op, &invalidate_lrz);34413442/* LRZ doesn't transition properly between GREATER* and LESS* depth compare ops */3443if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&3444lrz_direction != TU_LRZ_UNKNOWN &&3445cmd->state.lrz.prev_direction != lrz_direction) {3446invalidate_lrz = true;3447}34483449cmd->state.lrz.prev_direction = lrz_direction;34503451/* Invalidate LRZ and disable write if stencil test is enabled */3452bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;3453if (stencil_test_enable) {3454bool stencil_front_writemask =3455(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?3456(cmd->state.dynamic_stencil_wrmask & 0xff) :3457(pipeline->stencil_wrmask & 0xff);34583459bool stencil_back_writemask =3460(pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?3461((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :3462(pipeline->stencil_wrmask & 0xff00) >> 8;34633464VkCompareOp stencil_front_compare_op =3465(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;34663467VkCompareOp stencil_back_compare_op =3468(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;34693470tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,3471stencil_front_writemask, &invalidate_lrz);34723473tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,3474stencil_back_writemask, &invalidate_lrz);3475}34763477if (force_disable_write)3478gras_lrz_cntl.lrz_write = false;34793480if (invalidate_lrz) {3481cmd->state.lrz.valid = false;3482}34833484/* In case no depth attachment or invalid, we clear the gras_lrz_cntl register */3485if (a == VK_ATTACHMENT_UNUSED || !cmd->state.lrz.valid)3486memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));34873488return gras_lrz_cntl;3489}34903491static struct tu_draw_state3492tu6_build_lrz(struct tu_cmd_buffer *cmd)3493{3494const uint32_t a = 
cmd->state.subpass->depth_stencil_attachment.attachment;3495struct tu_cs lrz_cs;3496struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &lrz_cs, 4);34973498struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);34993500tu_cs_emit_regs(&lrz_cs, A6XX_GRAS_LRZ_CNTL(3501.enable = gras_lrz_cntl.enable,3502.greater = gras_lrz_cntl.greater,3503.lrz_write = gras_lrz_cntl.lrz_write,3504.z_test_enable = gras_lrz_cntl.z_test_enable,3505.z_bounds_enable = gras_lrz_cntl.z_bounds_enable));3506tu_cs_emit_regs(&lrz_cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));35073508return ds;3509}35103511static bool3512tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)3513{3514bool depth_write_enable =3515cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;35163517VkCompareOp depth_compare_op =3518(cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;35193520bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;35213522return depth_test_enable && depth_write_enable && depth_compare_op_writes;3523}35243525static bool3526tu6_writes_stencil(struct tu_cmd_buffer *cmd)3527{3528bool stencil_test_enable =3529cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;35303531bool stencil_front_writemask =3532(cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?3533(cmd->state.dynamic_stencil_wrmask & 0xff) :3534(cmd->state.pipeline->stencil_wrmask & 0xff);35353536bool stencil_back_writemask =3537(cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?3538((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :3539(cmd->state.pipeline->stencil_wrmask & 0xff00) >> 8;35403541VkStencilOp front_fail_op =3542(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT;3543VkStencilOp front_pass_op =3544(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT;3545VkStencilOp front_depth_fail_op =3546(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT;3547VkStencilOp back_fail_op =3548(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT;3549VkStencilOp back_pass_op =3550(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT;3551VkStencilOp back_depth_fail_op =3552(cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT;35533554bool stencil_front_op_writes =3555front_pass_op != VK_STENCIL_OP_KEEP &&3556front_fail_op != VK_STENCIL_OP_KEEP &&3557front_depth_fail_op != VK_STENCIL_OP_KEEP;35583559bool stencil_back_op_writes =3560back_pass_op != VK_STENCIL_OP_KEEP &&3561back_fail_op != VK_STENCIL_OP_KEEP &&3562back_depth_fail_op != VK_STENCIL_OP_KEEP;35633564return stencil_test_enable &&3565((stencil_front_writemask && stencil_front_op_writes) ||3566(stencil_back_writemask && stencil_back_op_writes));3567}35683569static struct tu_draw_state3570tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd)3571{3572struct tu_cs cs;3573struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 4);35743575enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;3576bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE;3577bool depth_write = tu6_writes_depth(cmd, depth_test_enable);3578bool stencil_write = tu6_writes_stencil(cmd);35793580if 
(cmd->state.pipeline->lrz.fs_has_kill &&3581(depth_write || stencil_write)) {3582zmode = cmd->state.lrz.valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;3583}35843585if (cmd->state.pipeline->lrz.force_late_z || !depth_test_enable)3586zmode = A6XX_LATE_Z;35873588/* User defined early tests take precedence above all else */3589if (cmd->state.pipeline->lrz.early_fragment_tests)3590zmode = A6XX_EARLY_Z;35913592tu_cs_emit_pkt4(&cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);3593tu_cs_emit(&cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));35943595tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);3596tu_cs_emit(&cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));3597return ds;3598}35993600static VkResult3601tu6_draw_common(struct tu_cmd_buffer *cmd,3602struct tu_cs *cs,3603bool indexed,3604/* note: draw_count is 0 for indirect */3605uint32_t draw_count)3606{3607const struct tu_pipeline *pipeline = cmd->state.pipeline;3608VkResult result;36093610tu_emit_cache_flush_renderpass(cmd, cs);36113612tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(3613.primitive_restart =3614pipeline->ia.primitive_restart && indexed,3615.provoking_vtx_last = pipeline->provoking_vertex_last,3616.tess_upper_left_domain_origin =3617pipeline->tess.upper_left_domain_origin));36183619bool has_tess =3620pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;36213622/* Early exit if there is nothing to emit, saves CPU cycles */3623if (!(cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) &&3624!has_tess)3625return VK_SUCCESS;36263627bool dirty_lrz = cmd->state.dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | TU_CMD_DIRTY_RB_STENCIL_CNTL);36283629struct tu_descriptor_state *descriptors_state =3630&cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];36313632if (dirty_lrz) {3633cmd->state.lrz.state = tu6_build_lrz(cmd);3634cmd->state.depth_plane_state = tu6_build_depth_plane_z_mode(cmd);3635}36363637if (cmd->state.dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {3638struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);3639tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));3640}36413642if (cmd->state.dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {3643struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);3644uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;36453646if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_ENABLE) ||3647(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))3648rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;36493650if (pipeline->rb_depth_cntl_disable)3651rb_depth_cntl = 0;36523653tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));3654}36553656if (cmd->state.dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {3657struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);3658tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));3659}36603661if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {3662cmd->state.shader_const[0] =3663tu6_emit_consts_geom(cmd, pipeline, descriptors_state);3664cmd->state.shader_const[1] =3665tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);3666}36673668struct tu_draw_state tess_consts = {};3669if (has_tess) {3670uint64_t tess_factor_iova = 0;36713672cmd->state.has_tess = true;3673result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);3674if (result != VK_SUCCESS)3675return result;36763677/* this sequence matches what the blob does before every tess draw3678* PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi3679* 
before writing to it
       */
      tu_cs_emit_wfi(cs);

      tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));

      tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
      tu_cs_emit(cs, draw_count);
   }

   /* for the first draw in a renderpass, re-emit all the draw states
    *
    * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
    * used, then draw states must be re-emitted. note however this only happens
    * in the sysmem path, so this can be skipped for the gmem path (TODO)
    *
    * the two input attachment states are excluded because secondary command
    * buffer doesn't have a state ib to restore it, and not re-emitting them
    * is OK since CmdClearAttachments won't disable/overwrite them
    */
   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));

      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);

      for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
                               ((pipeline->dynamic_state_mask & BIT(i)) ?
                                cmd->state.dynamic_state[i] :
                                pipeline->dynamic_state[i]));
      }
   } else {
      /* emit draw states that were just updated
       * note we eventually don't want to have to emit anything here
       */
      bool emit_binding_stride = false;
      uint32_t draw_state_count =
         has_tess +
         ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
         (dirty_lrz ? 2 : 0);

      if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
          (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
         emit_binding_stride = true;
         draw_state_count += 1;
      }

      if (draw_state_count > 0)
         tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);

      /* We may need to re-emit tess consts if the current draw call is
       * sufficiently larger than the last draw call.
*/3749if (has_tess)3750tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);3751if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {3752tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);3753tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);3754}3755if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)3756tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);3757if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)3758tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);3759if (emit_binding_stride) {3760tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,3761cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);3762}3763if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)3764tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);37653766if (dirty_lrz) {3767tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);3768tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);3769}3770}37713772tu_cs_sanity_check(cs);37733774/* There are too many graphics dirty bits to list here, so just list the3775* bits to preserve instead. The only things not emitted here are3776* compute-related state.3777*/3778cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;3779return VK_SUCCESS;3780}37813782static uint32_t3783tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)3784{3785const struct tu_pipeline *pipeline = cmd->state.pipeline;3786enum pc_di_primtype primtype = pipeline->ia.primtype;37873788if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY))3789primtype = cmd->state.primtype;37903791uint32_t initiator =3792CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |3793CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |3794CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |3795CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);37963797if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)3798initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;37993800switch (pipeline->tess.patch_type) {3801case IR3_TESS_TRIANGLES:3802initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |3803CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;3804break;3805case IR3_TESS_ISOLINES:3806initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |3807CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;3808break;3809case IR3_TESS_NONE:3810initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);3811break;3812case IR3_TESS_QUADS:3813initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |3814CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;3815break;3816}3817return initiator;3818}381938203821static uint32_t3822vs_params_offset(struct tu_cmd_buffer *cmd)3823{3824const struct tu_program_descriptor_linkage *link =3825&cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];3826const struct ir3_const_state *const_state = &link->const_state;38273828if (const_state->offsets.driver_param >= link->constlen)3829return 0;38303831/* this layout is required by CP_DRAW_INDIRECT_MULTI */3832STATIC_ASSERT(IR3_DP_DRAWID == 0);3833STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);3834STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);38353836/* 0 means disabled for CP_DRAW_INDIRECT_MULTI */3837assert(const_state->offsets.driver_param != 0);38383839return const_state->offsets.driver_param;3840}38413842static void3843tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)3844{3845if (cmd->state.vs_params.iova) {3846cmd->state.vs_params = (struct tu_draw_state) {};3847cmd->state.dirty |= 
TU_CMD_DIRTY_VS_PARAMS;3848}3849}38503851static void3852tu6_emit_vs_params(struct tu_cmd_buffer *cmd,3853uint32_t vertex_offset,3854uint32_t first_instance)3855{3856uint32_t offset = vs_params_offset(cmd);38573858if (offset == cmd->state.last_vs_params.params_offset &&3859vertex_offset == cmd->state.last_vs_params.vertex_offset &&3860first_instance == cmd->state.last_vs_params.first_instance) {3861return;3862}38633864struct tu_cs cs;3865VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);3866if (result != VK_SUCCESS) {3867cmd->record_result = result;3868return;3869}38703871tu_cs_emit_regs(&cs,3872A6XX_VFD_INDEX_OFFSET(vertex_offset),3873A6XX_VFD_INSTANCE_START_OFFSET(first_instance));38743875if (offset) {3876tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);3877tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |3878CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |3879CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |3880CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |3881CP_LOAD_STATE6_0_NUM_UNIT(1));3882tu_cs_emit(&cs, 0);3883tu_cs_emit(&cs, 0);38843885tu_cs_emit(&cs, 0);3886tu_cs_emit(&cs, vertex_offset);3887tu_cs_emit(&cs, first_instance);3888tu_cs_emit(&cs, 0);3889}38903891cmd->state.last_vs_params.params_offset = offset;3892cmd->state.last_vs_params.vertex_offset = vertex_offset;3893cmd->state.last_vs_params.first_instance = first_instance;38943895struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);3896cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};38973898cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;3899}39003901VKAPI_ATTR void VKAPI_CALL3902tu_CmdDraw(VkCommandBuffer commandBuffer,3903uint32_t vertexCount,3904uint32_t instanceCount,3905uint32_t firstVertex,3906uint32_t firstInstance)3907{3908TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);3909struct tu_cs *cs = &cmd->draw_cs;39103911tu6_emit_vs_params(cmd, firstVertex, firstInstance);39123913tu6_draw_common(cmd, cs, false, vertexCount);39143915tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);3916tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));3917tu_cs_emit(cs, instanceCount);3918tu_cs_emit(cs, vertexCount);3919}39203921VKAPI_ATTR void VKAPI_CALL3922tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,3923uint32_t indexCount,3924uint32_t instanceCount,3925uint32_t firstIndex,3926int32_t vertexOffset,3927uint32_t firstInstance)3928{3929TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);3930struct tu_cs *cs = &cmd->draw_cs;39313932tu6_emit_vs_params(cmd, vertexOffset, firstInstance);39333934tu6_draw_common(cmd, cs, true, indexCount);39353936tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);3937tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));3938tu_cs_emit(cs, instanceCount);3939tu_cs_emit(cs, indexCount);3940tu_cs_emit(cs, firstIndex);3941tu_cs_emit_qw(cs, cmd->state.index_va);3942tu_cs_emit(cs, cmd->state.max_index_count);3943}39443945/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes3946* do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if3947* pending for these opcodes. 
This may result in a few extra WAIT_FOR_ME's3948* with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's3949* before draw opcodes that don't need it.3950*/3951static void3952draw_wfm(struct tu_cmd_buffer *cmd)3953{3954cmd->state.renderpass_cache.flush_bits |=3955cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;3956cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;3957}39583959VKAPI_ATTR void VKAPI_CALL3960tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,3961VkBuffer _buffer,3962VkDeviceSize offset,3963uint32_t drawCount,3964uint32_t stride)3965{3966TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);3967TU_FROM_HANDLE(tu_buffer, buf, _buffer);3968struct tu_cs *cs = &cmd->draw_cs;39693970tu6_emit_empty_vs_params(cmd);39713972if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)3973draw_wfm(cmd);39743975tu6_draw_common(cmd, cs, false, 0);39763977tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);3978tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));3979tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |3980A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));3981tu_cs_emit(cs, drawCount);3982tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);3983tu_cs_emit(cs, stride);3984}39853986VKAPI_ATTR void VKAPI_CALL3987tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,3988VkBuffer _buffer,3989VkDeviceSize offset,3990uint32_t drawCount,3991uint32_t stride)3992{3993TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);3994TU_FROM_HANDLE(tu_buffer, buf, _buffer);3995struct tu_cs *cs = &cmd->draw_cs;39963997tu6_emit_empty_vs_params(cmd);39983999if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)4000draw_wfm(cmd);40014002tu6_draw_common(cmd, cs, true, 0);40034004tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);4005tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));4006tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |4007A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));4008tu_cs_emit(cs, drawCount);4009tu_cs_emit_qw(cs, cmd->state.index_va);4010tu_cs_emit(cs, cmd->state.max_index_count);4011tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);4012tu_cs_emit(cs, stride);4013}40144015VKAPI_ATTR void VKAPI_CALL4016tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,4017VkBuffer _buffer,4018VkDeviceSize offset,4019VkBuffer countBuffer,4020VkDeviceSize countBufferOffset,4021uint32_t drawCount,4022uint32_t stride)4023{4024TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4025TU_FROM_HANDLE(tu_buffer, buf, _buffer);4026TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);4027struct tu_cs *cs = &cmd->draw_cs;40284029tu6_emit_empty_vs_params(cmd);40304031/* It turns out that the firmware we have for a650 only partially fixed the4032* problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete4033* before reading indirect parameters. 
It waits for WFI's before reading4034* the draw parameters, but after reading the indirect count :(.4035*/4036draw_wfm(cmd);40374038tu6_draw_common(cmd, cs, false, 0);40394040tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);4041tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));4042tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |4043A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));4044tu_cs_emit(cs, drawCount);4045tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);4046tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);4047tu_cs_emit(cs, stride);4048}40494050VKAPI_ATTR void VKAPI_CALL4051tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,4052VkBuffer _buffer,4053VkDeviceSize offset,4054VkBuffer countBuffer,4055VkDeviceSize countBufferOffset,4056uint32_t drawCount,4057uint32_t stride)4058{4059TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4060TU_FROM_HANDLE(tu_buffer, buf, _buffer);4061TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);4062struct tu_cs *cs = &cmd->draw_cs;40634064tu6_emit_empty_vs_params(cmd);40654066draw_wfm(cmd);40674068tu6_draw_common(cmd, cs, true, 0);40694070tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);4071tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));4072tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |4073A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));4074tu_cs_emit(cs, drawCount);4075tu_cs_emit_qw(cs, cmd->state.index_va);4076tu_cs_emit(cs, cmd->state.max_index_count);4077tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);4078tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);4079tu_cs_emit(cs, stride);4080}40814082VKAPI_ATTR void VKAPI_CALL4083tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,4084uint32_t instanceCount,4085uint32_t firstInstance,4086VkBuffer _counterBuffer,4087VkDeviceSize counterBufferOffset,4088uint32_t counterOffset,4089uint32_t vertexStride)4090{4091TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4092TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);4093struct tu_cs *cs = &cmd->draw_cs;40944095/* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.4096* Plus, for the common case where the counter buffer is written by4097* vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to4098* complete which means we need a WAIT_FOR_ME anyway.4099*/4100draw_wfm(cmd);41014102tu6_emit_vs_params(cmd, 0, firstInstance);41034104tu6_draw_common(cmd, cs, false, 0);41054106tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);4107tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));4108tu_cs_emit(cs, instanceCount);4109tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);4110tu_cs_emit(cs, counterOffset);4111tu_cs_emit(cs, vertexStride);4112}41134114struct tu_dispatch_info4115{4116/**4117* Determine the layout of the grid (in block units) to be used.4118*/4119uint32_t blocks[3];41204121/**4122* A starting offset for the grid. 
If unaligned is set, the offset4123* must still be aligned.4124*/4125uint32_t offsets[3];4126/**4127* Whether it's an unaligned compute dispatch.4128*/4129bool unaligned;41304131/**4132* Indirect compute parameters resource.4133*/4134struct tu_buffer *indirect;4135uint64_t indirect_offset;4136};41374138static void4139tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,4140struct tu_cs *cs, struct tu_pipeline *pipeline,4141const struct tu_dispatch_info *info)4142{4143gl_shader_stage type = MESA_SHADER_COMPUTE;4144const struct tu_program_descriptor_linkage *link =4145&pipeline->program.link[type];4146const struct ir3_const_state *const_state = &link->const_state;4147uint32_t offset = const_state->offsets.driver_param;4148unsigned subgroup_size = pipeline->compute.subgroup_size;4149unsigned subgroup_shift = util_logbase2(subgroup_size);41504151if (link->constlen <= offset)4152return;41534154uint32_t num_consts = MIN2(const_state->num_driver_params,4155(link->constlen - offset) * 4);41564157if (!info->indirect) {4158uint32_t driver_params[12] = {4159[IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],4160[IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],4161[IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],4162[IR3_DP_BASE_GROUP_X] = info->offsets[0],4163[IR3_DP_BASE_GROUP_Y] = info->offsets[1],4164[IR3_DP_BASE_GROUP_Z] = info->offsets[2],4165[IR3_DP_SUBGROUP_SIZE] = subgroup_size,4166[IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,4167};41684169assert(num_consts <= ARRAY_SIZE(driver_params));41704171/* push constants */4172tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);4173tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |4174CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |4175CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |4176CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |4177CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));4178tu_cs_emit(cs, 0);4179tu_cs_emit(cs, 0);4180uint32_t i;4181for (i = 0; i < num_consts; i++)4182tu_cs_emit(cs, driver_params[i]);4183} else if (!(info->indirect_offset & 0xf)) {4184tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);4185tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |4186CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |4187CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |4188CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |4189CP_LOAD_STATE6_0_NUM_UNIT(1));4190tu_cs_emit_qw(cs, tu_buffer_iova(info->indirect) + info->indirect_offset);4191} else {4192/* Vulkan guarantees only 4 byte alignment for indirect_offset.4193* However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.4194*/41954196uint64_t indirect_iova = tu_buffer_iova(info->indirect) + info->indirect_offset;41974198for (uint32_t i = 0; i < 3; i++) {4199tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);4200tu_cs_emit(cs, 0);4201tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i]));4202tu_cs_emit_qw(cs, indirect_iova + i * 4);4203}42044205tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);4206tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);42074208tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);4209tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |4210CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |4211CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |4212CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |4213CP_LOAD_STATE6_0_NUM_UNIT(1));4214tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));4215}42164217/* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect4218* dispatch.4219*/4220if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {4221tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);4222tu_cs_emit(cs, 
CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |4223CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |4224CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |4225CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |4226CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));4227tu_cs_emit_qw(cs, 0);4228tu_cs_emit(cs, 0); /* BASE_GROUP_X */4229tu_cs_emit(cs, 0); /* BASE_GROUP_Y */4230tu_cs_emit(cs, 0); /* BASE_GROUP_Z */4231tu_cs_emit(cs, subgroup_size);4232if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {4233assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));4234tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */4235tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */4236tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */4237tu_cs_emit(cs, subgroup_shift);4238}4239}4240}42414242static void4243tu_dispatch(struct tu_cmd_buffer *cmd,4244const struct tu_dispatch_info *info)4245{4246struct tu_cs *cs = &cmd->cs;4247struct tu_pipeline *pipeline = cmd->state.compute_pipeline;4248struct tu_descriptor_state *descriptors_state =4249&cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];42504251/* TODO: We could probably flush less if we add a compute_flush_bits4252* bitfield.4253*/4254tu_emit_cache_flush(cmd, cs);42554256/* note: no reason to have this in a separate IB */4257tu_cs_emit_state_ib(cs,4258tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE));42594260tu_emit_compute_driver_params(cmd, cs, pipeline, info);42614262if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)4263tu_cs_emit_state_ib(cs, pipeline->load_state);42644265cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;42664267tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);4268tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));42694270const uint32_t *local_size = pipeline->compute.local_size;4271const uint32_t *num_groups = info->blocks;4272tu_cs_emit_regs(cs,4273A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,4274.localsizex = local_size[0] - 1,4275.localsizey = local_size[1] - 1,4276.localsizez = local_size[2] - 1),4277A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),4278A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),4279A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),4280A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),4281A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),4282A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));42834284tu_cs_emit_regs(cs,4285A6XX_HLSQ_CS_KERNEL_GROUP_X(1),4286A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),4287A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));42884289if (info->indirect) {4290uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;42914292tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);4293tu_cs_emit(cs, 0x00000000);4294tu_cs_emit_qw(cs, iova);4295tu_cs_emit(cs,4296A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |4297A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |4298A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));4299} else {4300tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);4301tu_cs_emit(cs, 0x00000000);4302tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));4303tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));4304tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));4305}43064307tu_cs_emit_wfi(cs);4308}43094310VKAPI_ATTR void VKAPI_CALL4311tu_CmdDispatchBase(VkCommandBuffer commandBuffer,4312uint32_t base_x,4313uint32_t base_y,4314uint32_t base_z,4315uint32_t x,4316uint32_t y,4317uint32_t z)4318{4319TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);4320struct tu_dispatch_info info = {};43214322info.blocks[0] = x;4323info.blocks[1] = 
y;4324info.blocks[2] = z;43254326info.offsets[0] = base_x;4327info.offsets[1] = base_y;4328info.offsets[2] = base_z;4329tu_dispatch(cmd_buffer, &info);4330}43314332VKAPI_ATTR void VKAPI_CALL4333tu_CmdDispatch(VkCommandBuffer commandBuffer,4334uint32_t x,4335uint32_t y,4336uint32_t z)4337{4338tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);4339}43404341VKAPI_ATTR void VKAPI_CALL4342tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,4343VkBuffer _buffer,4344VkDeviceSize offset)4345{4346TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);4347TU_FROM_HANDLE(tu_buffer, buffer, _buffer);4348struct tu_dispatch_info info = {};43494350info.indirect = buffer;4351info.indirect_offset = offset;43524353tu_dispatch(cmd_buffer, &info);4354}43554356VKAPI_ATTR void VKAPI_CALL4357tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,4358const VkSubpassEndInfoKHR *pSubpassEndInfo)4359{4360TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);43614362tu_cs_end(&cmd_buffer->draw_cs);4363tu_cs_end(&cmd_buffer->draw_epilogue_cs);43644365if (use_sysmem_rendering(cmd_buffer))4366tu_cmd_render_sysmem(cmd_buffer);4367else4368tu_cmd_render_tiles(cmd_buffer);43694370/* outside of renderpasses we assume all draw states are disabled4371* we can do this in the main cs because no resolve/store commands4372* should use a draw command (TODO: this will change if unaligned4373* GMEM stores are supported)4374*/4375tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);43764377/* discard draw_cs and draw_epilogue_cs entries now that the tiles are4378rendered */4379tu_cs_discard_entries(&cmd_buffer->draw_cs);4380tu_cs_begin(&cmd_buffer->draw_cs);4381tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);4382tu_cs_begin(&cmd_buffer->draw_epilogue_cs);43834384cmd_buffer->state.cache.pending_flush_bits |=4385cmd_buffer->state.renderpass_cache.pending_flush_bits;4386tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);43874388cmd_buffer->state.pass = NULL;4389cmd_buffer->state.subpass = NULL;4390cmd_buffer->state.framebuffer = NULL;4391cmd_buffer->state.has_tess = false;4392cmd_buffer->state.has_subpass_predication = false;43934394/* LRZ is not valid next time we use it */4395cmd_buffer->state.lrz.valid = false;4396cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;4397}43984399struct tu_barrier_info4400{4401uint32_t eventCount;4402const VkEvent *pEvents;4403VkPipelineStageFlags srcStageMask;4404};44054406static void4407tu_barrier(struct tu_cmd_buffer *cmd,4408uint32_t memoryBarrierCount,4409const VkMemoryBarrier *pMemoryBarriers,4410uint32_t bufferMemoryBarrierCount,4411const VkBufferMemoryBarrier *pBufferMemoryBarriers,4412uint32_t imageMemoryBarrierCount,4413const VkImageMemoryBarrier *pImageMemoryBarriers,4414const struct tu_barrier_info *info)4415{4416struct tu_cs *cs = cmd->state.pass ? 
&cmd->draw_cs : &cmd->cs;4417VkAccessFlags srcAccessMask = 0;4418VkAccessFlags dstAccessMask = 0;44194420for (uint32_t i = 0; i < memoryBarrierCount; i++) {4421srcAccessMask |= pMemoryBarriers[i].srcAccessMask;4422dstAccessMask |= pMemoryBarriers[i].dstAccessMask;4423}44244425for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {4426srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;4427dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;4428}44294430enum tu_cmd_access_mask src_flags = 0;4431enum tu_cmd_access_mask dst_flags = 0;44324433for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {4434VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;4435if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {4436/* The underlying memory for this image may have been used earlier4437* within the same queue submission for a different image, which4438* means that there may be old, stale cache entries which are in the4439* "wrong" location, which could cause problems later after writing4440* to the image. We don't want these entries being flushed later and4441* overwriting the actual image, so we need to flush the CCU.4442*/4443src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;4444}4445srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;4446dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;4447}44484449/* Inside a renderpass, we don't know yet whether we'll be using sysmem4450* so we have to use the sysmem flushes.4451*/4452bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&4453!cmd->state.pass;4454src_flags |= vk2tu_access(srcAccessMask, gmem);4455dst_flags |= vk2tu_access(dstAccessMask, gmem);44564457struct tu_cache_state *cache =4458cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;4459tu_flush_for_access(cache, src_flags, dst_flags);44604461for (uint32_t i = 0; i < info->eventCount; i++) {4462TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);44634464tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);4465tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |4466CP_WAIT_REG_MEM_0_POLL_MEMORY);4467tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */4468tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));4469tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));4470tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));4471}4472}44734474VKAPI_ATTR void VKAPI_CALL4475tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,4476VkPipelineStageFlags srcStageMask,4477VkPipelineStageFlags dstStageMask,4478VkDependencyFlags dependencyFlags,4479uint32_t memoryBarrierCount,4480const VkMemoryBarrier *pMemoryBarriers,4481uint32_t bufferMemoryBarrierCount,4482const VkBufferMemoryBarrier *pBufferMemoryBarriers,4483uint32_t imageMemoryBarrierCount,4484const VkImageMemoryBarrier *pImageMemoryBarriers)4485{4486TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);4487struct tu_barrier_info info;44884489info.eventCount = 0;4490info.pEvents = NULL;4491info.srcStageMask = srcStageMask;44924493tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,4494bufferMemoryBarrierCount, pBufferMemoryBarriers,4495imageMemoryBarrierCount, pImageMemoryBarriers, &info);4496}44974498static void4499write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,4500VkPipelineStageFlags stageMask, unsigned value)4501{4502struct tu_cs *cs = &cmd->cs;45034504/* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */4505assert(!cmd->state.pass);45064507tu_emit_cache_flush(cmd, cs);45084509/* Flags that only require a top-of-pipe event. 
DrawIndirect parameters are4510* read by the CP, so the draw indirect stage counts as top-of-pipe too.4511*/4512VkPipelineStageFlags top_of_pipe_flags =4513VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |4514VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;45154516if (!(stageMask & ~top_of_pipe_flags)) {4517tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);4518tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */4519tu_cs_emit(cs, value);4520} else {4521/* Use a RB_DONE_TS event to wait for everything to complete. */4522tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);4523tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));4524tu_cs_emit_qw(cs, event->bo.iova);4525tu_cs_emit(cs, value);4526}4527}45284529VKAPI_ATTR void VKAPI_CALL4530tu_CmdSetEvent(VkCommandBuffer commandBuffer,4531VkEvent _event,4532VkPipelineStageFlags stageMask)4533{4534TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4535TU_FROM_HANDLE(tu_event, event, _event);45364537write_event(cmd, event, stageMask, 1);4538}45394540VKAPI_ATTR void VKAPI_CALL4541tu_CmdResetEvent(VkCommandBuffer commandBuffer,4542VkEvent _event,4543VkPipelineStageFlags stageMask)4544{4545TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4546TU_FROM_HANDLE(tu_event, event, _event);45474548write_event(cmd, event, stageMask, 0);4549}45504551VKAPI_ATTR void VKAPI_CALL4552tu_CmdWaitEvents(VkCommandBuffer commandBuffer,4553uint32_t eventCount,4554const VkEvent *pEvents,4555VkPipelineStageFlags srcStageMask,4556VkPipelineStageFlags dstStageMask,4557uint32_t memoryBarrierCount,4558const VkMemoryBarrier *pMemoryBarriers,4559uint32_t bufferMemoryBarrierCount,4560const VkBufferMemoryBarrier *pBufferMemoryBarriers,4561uint32_t imageMemoryBarrierCount,4562const VkImageMemoryBarrier *pImageMemoryBarriers)4563{4564TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);4565struct tu_barrier_info info;45664567info.eventCount = eventCount;4568info.pEvents = pEvents;4569info.srcStageMask = 0;45704571tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,4572bufferMemoryBarrierCount, pBufferMemoryBarriers,4573imageMemoryBarrierCount, pImageMemoryBarriers, &info);4574}45754576VKAPI_ATTR void VKAPI_CALL4577tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)4578{4579/* No-op */4580}458145824583VKAPI_ATTR void VKAPI_CALL4584tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,4585const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)4586{4587TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);45884589cmd->state.predication_active = true;4590if (cmd->state.pass)4591cmd->state.has_subpass_predication = true;45924593struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;45944595tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);4596tu_cs_emit(cs, 1);45974598/* Wait for any writes to the predicate to land */4599if (cmd->state.pass)4600tu_emit_cache_flush_renderpass(cmd, cs);4601else4602tu_emit_cache_flush(cmd, cs);46034604TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);4605uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;46064607/* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan4608* mandates 32-bit comparisons. 
Our workaround is to copy the reference
    * value to the low 32-bits of a location where the high 32 bits are known
    * to be 0 and then compare that.
    */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
   tu_cs_emit_qw(cs, iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.predication_active = false;

   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0);
}
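
/* Illustration only, not part of the driver: a minimal sketch of how an
 * application might drive the VK_EXT_conditional_rendering path implemented
 * above, assuming an already-recording command buffer and a 4-byte-aligned
 * 32-bit predicate value in `predicate_buf`. The helper name and handles are
 * hypothetical; a real application would normally load the
 * vkCmd*ConditionalRenderingEXT entry points via vkGetDeviceProcAddr.
 */
#if 0
static void
example_conditional_draw(VkCommandBuffer cmd_buf, VkBuffer predicate_buf)
{
   /* The driver copies the 32-bit predicate at this buffer offset into its
    * global BO (see tu_CmdBeginConditionalRenderingEXT above) so the CP can
    * do a 64-bit compare against a value whose high 32 bits are known to be
    * zero.
    */
   const VkConditionalRenderingBeginInfoEXT begin_info = {
      .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
      .buffer = predicate_buf,
      .offset = 0,
      .flags = 0, /* or VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT */
   };

   vkCmdBeginConditionalRenderingEXT(cmd_buf, &begin_info);
   vkCmdDraw(cmd_buf, 3, 1, 0, 0); /* skipped when the predicate reads as 0 */
   vkCmdEndConditionalRenderingEXT(cmd_buf);
}
#endif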