Path: blob/21.2-virgl/src/gallium/drivers/radeonsi/si_gfx_cs.c
/*
 * Copyright 2010 Jerome Glisse <[email protected]>
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_pipe.h"
#include "sid.h"
#include "util/os_time.h"
#include "util/u_log.h"
#include "util/u_upload_mgr.h"
#include "ac_debug.h"

/* Make sure there is enough memory and CS space for num_draws draw calls; flush if not. */
void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws)
{
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;

   /* There are two memory usage counters in the winsys for all buffers
    * that have been added (cs_add_buffer) and two counters in the pipe
    * driver for those that haven't been added yet.
    */
   if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, ctx->vram_kb, ctx->gtt_kb))) {
      ctx->gtt_kb = 0;
      ctx->vram_kb = 0;
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      return;
   }
   ctx->gtt_kb = 0;
   ctx->vram_kb = 0;

   unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx, num_draws);
   if (!ctx->ws->cs_check_space(cs, need_dwords, false))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}

void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = &ctx->gfx_cs;
   struct radeon_winsys *ws = ctx->ws;
   struct si_screen *sscreen = ctx->screen;
   const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   unsigned wait_flags = 0;

   if (ctx->gfx_flush_in_progress)
      return;

   /* The amdgpu kernel driver synchronizes execution for shared DMABUFs between
    * processes on DRM >= 3.39.0, so we don't have to wait at the end of IBs to
    * make sure everything is idle.
    *
    * The amdgpu winsys synchronizes execution for buffers shared by different
    * contexts within the same process.
    *
    * Interop with AMDVLK, RADV, or OpenCL within the same process requires
    * explicit fences or glFinish.
    */
   if (sscreen->info.is_amdgpu && sscreen->info.drm_minor >= 39)
      flags |= RADEON_FLUSH_START_NEXT_GFX_IB_NOW;

   if (!sscreen->info.kernel_flushes_tc_l2_after_ib) {
      wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
   } else if (ctx->chip_class == GFX6) {
      /* The kernel flushes L2 before shaders are finished. */
      wait_flags |= wait_ps_cs;
   } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW) ||
              ((flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION) &&
               !ws->cs_is_secure(cs))) {
      /* TODO: this workaround fixes subtitles rendering with mpv -vo=vaapi and
       * tmz but shouldn't be necessary.
       */
      wait_flags |= wait_ps_cs;
   }

   /* Drop this flush if it's a no-op. */
   if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
       (!wait_flags || !ctx->gfx_last_ib_is_busy) &&
       !(flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)) {
      tc_driver_internal_flush_notify(ctx->tc);
      return;
   }

   /* Non-aux contexts must set up no-op API dispatch on GPU resets. This is
    * similar to si_get_reset_status but here we can ignore soft-recoveries,
    * while si_get_reset_status can't. */
   if (!(ctx->context_flags & SI_CONTEXT_FLAG_AUX) &&
       ctx->device_reset_callback.reset) {
      enum pipe_reset_status status = ctx->ws->ctx_query_reset_status(ctx->ctx, true, NULL);
      if (status != PIPE_NO_RESET)
         ctx->device_reset_callback.reset(ctx->device_reset_callback.data, status);
   }

   if (sscreen->debug_flags & DBG(CHECK_VM))
      flags &= ~PIPE_FLUSH_ASYNC;

   ctx->gfx_flush_in_progress = true;

   if (radeon_emitted(&ctx->prim_discard_compute_cs, 0))
      si_compute_signal_gfx(ctx);

   if (ctx->has_graphics) {
      if (!list_is_empty(&ctx->active_queries))
         si_suspend_queries(ctx);

      ctx->streamout.suspended = false;
      if (ctx->streamout.begin_emitted) {
         si_emit_streamout_end(ctx);
         ctx->streamout.suspended = true;

         /* Since NGG streamout uses GDS, we need to make GDS
          * idle when we leave the IB, otherwise another process
          * might overwrite it while our shaders are busy.
          */
         if (sscreen->use_ngg_streamout)
            wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
      }
   }

   /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
    * because the kernel doesn't wait for it. */
   if (ctx->chip_class >= GFX7)
      si_cp_dma_wait_for_idle(ctx, &ctx->gfx_cs);

   /* Wait for draw calls to finish if needed. */
   if (wait_flags) {
      ctx->flags |= wait_flags;
      ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
   }
   ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;

   if (ctx->current_saved_cs) {
      si_trace_emit(ctx);

      /* Save the IB for debug contexts. */
      si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
      ctx->current_saved_cs->flushed = true;
      ctx->current_saved_cs->time_flush = os_time_get_nano();

      si_log_hw_flush(ctx);
   }

   if (si_compute_prim_discard_enabled(ctx)) {
      /* The compute IB can start after the previous gfx IB starts. */
      if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
         ctx->ws->cs_add_fence_dependency(
            &ctx->gfx_cs, ctx->last_gfx_fence,
            RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
      }

      /* Remember the last execution barrier. It's in the IB.
       * It will signal the start of the next compute IB.
       */
      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
         *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
         ctx->last_pkt3_write_data = NULL;

         si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
         ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
         si_resource_reference(&ctx->barrier_buf, NULL);

         ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
      }
   }

   if (ctx->is_noop)
      flags |= RADEON_FLUSH_NOOP;

   /* Flush the CS. */
   ws->cs_flush(cs, flags, &ctx->last_gfx_fence);

   tc_driver_internal_flush_notify(ctx->tc);
   if (fence)
      ws->fence_reference(fence, ctx->last_gfx_fence);

   ctx->num_gfx_cs_flushes++;

   if (si_compute_prim_discard_enabled(ctx)) {
      /* Remember the last execution barrier, which is the last fence
       * in this case.
       */
      if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
         ctx->last_pkt3_write_data = NULL;
         si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
         ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
      }
   }

   /* Check VM faults if needed. */
   if (sscreen->debug_flags & DBG(CHECK_VM)) {
      /* Use a conservative timeout of 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
   }

   if (unlikely(ctx->thread_trace &&
                (flags & PIPE_FLUSH_END_OF_FRAME))) {
      si_handle_thread_trace(ctx, &ctx->gfx_cs);
   }

   if (ctx->current_saved_cs)
      si_saved_cs_reference(&ctx->current_saved_cs, NULL);

   si_begin_new_gfx_cs(ctx, false);
   ctx->gfx_flush_in_progress = false;
}

static void si_begin_gfx_cs_debug(struct si_context *ctx)
{
   static const uint32_t zeros[1];
   assert(!ctx->current_saved_cs);

   ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
   if (!ctx->current_saved_cs)
      return;

   pipe_reference_init(&ctx->current_saved_cs->reference, 1);

   ctx->current_saved_cs->trace_buf =
      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
   if (!ctx->current_saved_cs->trace_buf) {
      free(ctx->current_saved_cs);
      ctx->current_saved_cs = NULL;
      return;
   }

   pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
                               zeros);
   ctx->current_saved_cs->trace_id = 0;

   si_trace_emit(ctx);

   radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
}

static void si_add_gds_to_buffer_list(struct si_context *sctx)
{
   if (sctx->gds) {
      sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
      if (sctx->gds_oa) {
         sctx->ws->cs_add_buffer(&sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
      }
   }
}

void si_allocate_gds(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   if (sctx->gds)
      return;

   assert(sctx->screen->use_ngg_streamout);

   /* 4 streamout GDS counters.
    * We need 256B (64 dw) of GDS, otherwise streamout hangs.
    */
   sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, RADEON_FLAG_DRIVER_INTERNAL);
   sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, RADEON_FLAG_DRIVER_INTERNAL);

   assert(sctx->gds && sctx->gds_oa);
   si_add_gds_to_buffer_list(sctx);
}

void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
{
   STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);

   ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
   ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
   ctx->tracked_regs.reg_value[SI_TRACKED_DB_VRS_OVERRIDE_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
   ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */

   /* Set all cleared context registers to saved. */
   ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
   ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
}

void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper)
{
   if (wrapper) {
      if (wrapper != sctx->b.draw_vbo) {
         assert(!sctx->real_draw_vbo);
         sctx->real_draw_vbo = sctx->b.draw_vbo;
         sctx->b.draw_vbo = wrapper;
      }
   } else if (sctx->real_draw_vbo) {
      sctx->real_draw_vbo = NULL;
      si_select_draw_vbo(sctx);
   }
}

static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx,
                                     const struct pipe_draw_info *info,
                                     unsigned drawid_offset,
                                     const struct pipe_draw_indirect_info *indirect,
                                     const struct pipe_draw_start_count_bias *draws,
                                     unsigned num_draws) {
   struct si_context *sctx = (struct si_context *)ctx;

   bool secure = si_gfx_resources_check_encrypted(sctx);
   if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
                            RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
   }

   sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws);
}

void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
{
   bool is_secure = false;

   if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
      /* Disable features that don't work with TMZ:
       * - primitive discard
       */
      ctx->prim_discard_vertex_count_threshold = UINT_MAX;

      is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);

      si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble);
   }

   if (ctx->is_debug)
      si_begin_gfx_cs_debug(ctx);

   si_add_gds_to_buffer_list(ctx);

   /* Always invalidate caches at the beginning of IBs, because external
    * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
    * buffers.
    *
    * Note that the cache flush done by the kernel at the end of GFX IBs
    * isn't useful here, because that flush can finish after the following
    * IB starts drawing.
    *
    * TODO: Do we also need to invalidate CB & DB caches?
    */
   ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
                 SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
   ctx->pipeline_stats_enabled = -1;

   /* We don't know if the last draw call used GS fast launch, so assume it didn't. */
   if (ctx->chip_class == GFX10 && ctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
      ctx->flags |= SI_CONTEXT_VGT_FLUSH;

   if (ctx->border_color_buffer) {
      radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->border_color_buffer,
                                RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
   }
   if (ctx->shadowed_regs) {
      radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->shadowed_regs,
                                RADEON_USAGE_READWRITE,
                                RADEON_PRIO_DESCRIPTORS);
   }

   si_add_all_descriptors_to_bo_list(ctx);

   if (first_cs || !ctx->shadowed_regs) {
      si_shader_pointers_mark_dirty(ctx);
      ctx->cs_shader_state.initialized = false;
   }

   if (!ctx->has_graphics) {
      ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
      return;
   }

   if (ctx->tess_rings) {
      radeon_add_to_buffer_list(ctx, &ctx->gfx_cs,
                                unlikely(is_secure) ? si_resource(ctx->tess_rings_tmz)
                                                    : si_resource(ctx->tess_rings),
                                RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS);
   }

   /* Set all valid groups as dirty so they get re-emitted on the
    * next draw command.
    */
   si_pm4_reset_emitted(ctx, first_cs);

   /* The CS initialization should be emitted before everything else. */
   if (ctx->cs_preamble_state)
      si_pm4_emit(ctx, ctx->cs_preamble_state);
   if (ctx->cs_preamble_tess_rings)
      si_pm4_emit(ctx, unlikely(is_secure) ? ctx->cs_preamble_tess_rings_tmz
                                           : ctx->cs_preamble_tess_rings);
   if (ctx->cs_preamble_gs_rings)
      si_pm4_emit(ctx, ctx->cs_preamble_gs_rings);

   if (ctx->queued.named.ls)
      ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
   if (ctx->queued.named.hs)
      ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
   if (ctx->queued.named.es)
      ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
   if (ctx->queued.named.gs)
      ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
   if (ctx->queued.named.vs)
      ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
   if (ctx->queued.named.ps)
      ctx->prefetch_L2_mask |= SI_PREFETCH_PS;

   /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
   bool has_clear_state = ctx->screen->info.has_clear_state;
   if (has_clear_state || ctx->shadowed_regs) {
      ctx->framebuffer.dirty_cbufs =
         u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
      /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
      ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
   } else {
      ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
      ctx->framebuffer.dirty_zsbuf = true;
   }

   /* Even with shadowed registers, we have to add buffers to the buffer list.
    * These atoms are the only ones that add buffers.
    */
   si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
   si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
   if (ctx->screen->use_ngg_culling)
      si_mark_atom_dirty(ctx, &ctx->atoms.s.ngg_cull_state);

   if (first_cs || !ctx->shadowed_regs) {
      /* These don't add any buffers, so skip them with shadowing. */
      si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
      /* CLEAR_STATE sets zeros. */
      if (!has_clear_state || ctx->clip_state_any_nonzeros)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
      ctx->sample_locs_num_samples = 0;
      si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
      /* CLEAR_STATE sets 0xffff. */
      if (!has_clear_state || ctx->sample_mask != 0xffff)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
      /* CLEAR_STATE sets zeros. */
      if (!has_clear_state || ctx->blend_color_any_nonzeros)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
      if (ctx->chip_class >= GFX9)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
      if (!ctx->screen->use_ngg_streamout)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
      /* CLEAR_STATE disables all window rectangles. */
      if (!has_clear_state || ctx->num_window_rectangles > 0)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);

      /* Invalidate various draw states so that they are emitted before
       * the first draw call. */
      si_invalidate_draw_constants(ctx);
      ctx->last_index_size = -1;
      ctx->last_primitive_restart_en = -1;
      ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
      ctx->last_prim = -1;
      ctx->last_multi_vgt_param = -1;
      ctx->last_vs_state = ~0;
      ctx->last_ls = NULL;
      ctx->last_tcs = NULL;
      ctx->last_tes_sh_base = -1;
      ctx->last_num_tcs_input_cp = -1;
      ctx->last_ls_hs_config = -1; /* impossible value */
      ctx->last_binning_enabled = -1;

      if (has_clear_state) {
         si_set_tracked_regs_to_clear_state(ctx);
      } else {
         /* Set all register values to unknown. */
         ctx->tracked_regs.reg_saved = 0;
         ctx->last_gs_out_prim = -1; /* unknown */
      }

      /* 0xffffffff is an impossible value for the SPI_PS_INPUT_CNTL_n registers. */
      memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
   }

   si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
   if (ctx->scratch_buffer) {
      si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
   }

   if (ctx->streamout.suspended) {
      ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
      si_streamout_buffers_dirty(ctx);
   }

   if (!list_is_empty(&ctx->active_queries))
      si_resume_queries(ctx);

   assert(!ctx->gfx_cs.prev_dw);
   ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
   ctx->prim_discard_compute_ib_initialized = false;

   /* Compute-based primitive discard:
    * The index ring is divided into 2 halves. Switch between the halves
    * in the same fashion as double buffering.
    */
   if (ctx->index_ring_base)
      ctx->index_ring_base = 0;
   else
      ctx->index_ring_base = ctx->index_ring_size_per_ib;

   ctx->index_ring_offset = 0;

   /* All buffer references are removed on a flush, so si_check_needs_implicit_sync
    * cannot determine if si_make_CB_shader_coherent() needs to be called.
    * ctx->force_cb_shader_coherent will be cleared by the first call to
    * si_make_CB_shader_coherent.
    */
   ctx->force_cb_shader_coherent = true;
}

void si_trace_emit(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   uint32_t trace_id = ++sctx->current_saved_cs->trace_id;

   si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id);

   radeon_begin(cs);
   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
   radeon_end();

   if (sctx->log)
      u_log_flush(sctx->log);
}

void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
   if (!si_compute_prim_discard_enabled(sctx))
      return;

   if (!sctx->barrier_buf) {
      u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset,
                           (struct pipe_resource **)&sctx->barrier_buf);
   }

   /* Emit a placeholder to signal the next compute IB to start.
    * See si_compute_prim_discard.c for explanation.
    */
   uint32_t signal = 1;
   si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME,
                    &signal);

   sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5];

   /* Only the last occurrence of WRITE_DATA will be executed.
    * The packet will be enabled in si_flush_gfx_cs.
    */
   *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}

void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
   bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;

   assert(sctx->chip_class <= GFX9);

   /* This seems problematic with GFX7 (see #4764) */
   if (sctx->chip_class != GFX7)
      cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */

   radeon_begin(cs);

   if (sctx->chip_class == GFX9 || compute_ib) {
      /* Flush caches and wait for the caches to assert idle. */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   } else {
      /* ACQUIRE_MEM is only required on a compute ring. */
      radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   }
   radeon_end();

   /* ACQUIRE_MEM has an implicit context roll if the current context
    * is busy. */
   if (!compute_ib)
      sctx->context_roll = true;
}

void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   uint32_t gcr_cntl = 0;
   unsigned cb_db_event = 0;
   unsigned flags = ctx->flags;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* We don't need these. */
   assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));

   radeon_begin(cs);

   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      ctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      ctx->num_db_cache_flushes++;

   if (flags & SI_CONTEXT_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_CONTEXT_INV_SCACHE) {
      /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
       * to FORWARD when both L1 and L2 are written out (WB or INV).
       */
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   }
   if (flags & SI_CONTEXT_INV_VCACHE)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_CONTEXT_INV_L2) {
      /* Writeback and invalidate everything in L2. */
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
      ctx->num_L2_invalidates++;
   } else if (flags & SI_CONTEXT_WB_L2) {
      gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
   } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
   }

   if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         /* Flush HTILE. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
      }

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
          (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      } else {
         assert(0);
      }
   } else {
      /* Wait for graphics shaders to go idle if requested. */
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones. */
         ctx->num_vs_flushes++;
         ctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         ctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
      ctx->num_cs_flushes++;
      ctx->compute_is_busy = false;
   }
   radeon_end();

   if (cb_db_event) {
      struct si_resource *wait_mem_scratch =
         unlikely(ctx->ws->cs_is_secure(cs)) ? ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
      /* CB/DB flush and invalidate (or possibly just a wait for a
       * meta flush) via RELEASE_MEM.
       *
       * Combine this with other cache flushes when possible; this
       * requires affected shaders to be idle, so do it after the
       * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
       * implied).
       */
      uint64_t va;

      /* Do the flush (enqueue the event and wait for it). */
      va = wait_mem_scratch->gpu_address;
      ctx->wait_mem_number++;

      /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
      unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
      unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
      unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
      unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
      assert(G_586_GL2_US(gcr_cntl) == 0);
      assert(G_586_GL2_RANGE(gcr_cntl) == 0);
      assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
      unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
      unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
      unsigned gcr_seq = G_586_SEQ(gcr_cntl);

      gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                  C_586_GL2_WB; /* keep SEQ */

      si_cp_release_mem(ctx, cs, cb_db_event,
                        S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                        EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                        EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                        SI_NOT_QUERY);

      if (unlikely(ctx->thread_trace_enabled)) {
         si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
      }

      si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

      if (unlikely(ctx->thread_trace_enabled)) {
         si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
      }
   }

   radeon_begin_again(cs);

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;

      /* Flush caches and wait for the caches to assert idle.
       * The cache flush is executed in the ME, but the PFP waits
       * for completion.
       */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
      radeon_emit(cs, gcr_cntl);      /* GCR_CNTL */
   } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
      /* Synchronize PFP with ME. (this stalls PFP) */
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }

   if (flags & SI_CONTEXT_START_PIPELINE_STATS && ctx->pipeline_stats_enabled != 1) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
      ctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && ctx->pipeline_stats_enabled != 0) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
      ctx->pipeline_stats_enabled = 0;
   }
   radeon_end();

   ctx->flags = 0;
}

void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   uint32_t flags = sctx->flags;

   if (!sctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
   const bool is_barrier =
      flush_cb_db ||
      /* INV_ICACHE == beginning of gfx IB. Checking
       * INV_ICACHE fixes corruption for DeusExMD with
       * compute-based culling, but I don't know why.
       */
      flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
      (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);

   assert(sctx->chip_class <= GFX9);

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      sctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      sctx->num_db_cache_flushes++;

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_CONTEXT_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_CONTEXT_INV_SCACHE)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->chip_class <= GFX8) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->chip_class == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   radeon_begin(cs);

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
   }
   if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
      /* Flush HTILE. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
   }

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    */
   if (!flush_cb_db) {
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones
          * done by SURFACE_SYNC.
          */
         sctx->num_vs_flushes++;
         sctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         sctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
      sctx->num_cs_flushes++;
      sctx->compute_is_busy = false;
   }

   /* VGT state synchronization. */
   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }
   if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
   }

   radeon_end();

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->chip_class == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_CONTEXT_FLUSH_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_CONTEXT_FLUSH_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC | TC_WB = writeback & invalidate L2 & L1
       * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       * TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC | TC_NC = invalidate L2 for MTYPE == NC
       * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1 = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_CONTEXT_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush TC together with CB/DB. */
      if (flags & SI_CONTEXT_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
         sctx->num_L2_invalidates++;
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource *wait_mem_scratch =
         unlikely(sctx->ws->cs_is_secure(cs)) ? sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);

      if (unlikely(sctx->thread_trace_enabled)) {
         si_sqtt_describe_barrier_start(sctx, &sctx->gfx_cs);
      }

      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);

      if (unlikely(sctx->thread_trace_enabled)) {
         si_sqtt_describe_barrier_end(sctx, &sctx->gfx_cs, sctx->flags);
      }
   }

   /* GFX6-GFX8 only:
    * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
    * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
    *
    * cp_coher_cntl should contain all necessary flags except TC and PFP flags
    * at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
      /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
       * WB must be set on GFX8+ when TC_ACTION is set.
       */
      si_emit_surface_sync(sctx, cs,
                           cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                              S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
      cp_coher_cntl = 0;
      sctx->num_L2_invalidates++;
   } else {
      /* L1 invalidation and L2 writeback must be done separately,
       * because both operations can't be done together.
       */
      if (flags & SI_CONTEXT_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          */
         si_emit_surface_sync(
            sctx, cs,
            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
         cp_coher_cntl = 0;
         sctx->num_L2_writebacks++;
      }
      if (flags & SI_CONTEXT_INV_VCACHE) {
         /* Invalidate per-CU VMEM L1. */
         si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
         cp_coher_cntl = 0;
      }
   }

   /* If TC flushes haven't cleared this... */
   if (cp_coher_cntl)
      si_emit_surface_sync(sctx, cs, cp_coher_cntl);

   if (flags & SI_CONTEXT_PFP_SYNC_ME) {
      radeon_begin(cs);
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
      radeon_end();
   }

   if (is_barrier)
      si_prim_discard_signal_next_compute_ib_start(sctx);

   if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
      radeon_begin(cs);
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
      radeon_end();
      sctx->pipeline_stats_enabled = 1;
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && sctx->pipeline_stats_enabled != 0) {
      radeon_begin(cs);
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
      radeon_end();
      sctx->pipeline_stats_enabled = 0;
   }

   sctx->flags = 0;
}