/* Path: blob/21.2-virgl/src/freedreno/vulkan/tu_clear_blit.c */
/* 4565 views */
/*1* Copyright 2019-2020 Valve Corporation2* SPDX-License-Identifier: MIT3*4* Authors:5* Jonathan Marek <[email protected]>6*/78#include "tu_private.h"910#include "tu_cs.h"11#include "vk_format.h"1213#include "util/format_r11g11b10f.h"14#include "util/format_rgb9e5.h"15#include "util/format_srgb.h"16#include "util/half_float.h"1718static uint32_t19tu_pack_float32_for_unorm(float val, int bits)20{21return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));22}2324/* r2d_ = BLIT_OP_SCALE operations */2526static enum a6xx_2d_ifmt27format_to_ifmt(VkFormat format)28{29if (format == VK_FORMAT_D24_UNORM_S8_UINT ||30format == VK_FORMAT_X8_D24_UNORM_PACK32)31return R2D_UNORM8;3233/* get_component_bits doesn't work with depth/stencil formats: */34if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)35return R2D_FLOAT32;36if (format == VK_FORMAT_S8_UINT)37return R2D_INT8;3839/* use the size of the red channel to find the corresponding "ifmt" */40bool is_int = vk_format_is_int(format);41switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {42case 4: case 5: case 8:43return is_int ? R2D_INT8 : R2D_UNORM8;44case 10: case 11:45return is_int ? R2D_INT16 : R2D_FLOAT16;46case 16:47if (vk_format_is_float(format))48return R2D_FLOAT16;49return is_int ? R2D_INT16 : R2D_FLOAT32;50case 32:51return is_int ? 
R2D_INT32 : R2D_FLOAT32;52default:53unreachable("bad format");54return 0;55}56}5758static void59r2d_coords(struct tu_cs *cs,60const VkOffset2D *dst,61const VkOffset2D *src,62const VkExtent2D *extent)63{64tu_cs_emit_regs(cs,65A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),66A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));6768if (!src)69return;7071tu_cs_emit_regs(cs,72A6XX_GRAS_2D_SRC_TL_X(src->x),73A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),74A6XX_GRAS_2D_SRC_TL_Y(src->y),75A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));76}7778static void79r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)80{81uint32_t clear_value[4] = {};8283switch (format) {84case VK_FORMAT_X8_D24_UNORM_PACK32:85case VK_FORMAT_D24_UNORM_S8_UINT:86/* cleared as r8g8b8a8_unorm using special format */87clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);88clear_value[1] = clear_value[0] >> 8;89clear_value[2] = clear_value[0] >> 16;90clear_value[3] = val->depthStencil.stencil;91break;92case VK_FORMAT_D16_UNORM:93case VK_FORMAT_D32_SFLOAT:94/* R2D_FLOAT32 */95clear_value[0] = fui(val->depthStencil.depth);96break;97case VK_FORMAT_S8_UINT:98clear_value[0] = val->depthStencil.stencil;99break;100case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:101/* cleared as UINT32 */102clear_value[0] = float3_to_rgb9e5(val->color.float32);103break;104default:105assert(!vk_format_is_depth_or_stencil(format));106const struct util_format_description *desc = vk_format_description(format);107enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);108109assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||110format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));111112for (unsigned i = 0; i < desc->nr_channels; i++) {113const struct util_format_channel_description *ch = &desc->channel[i];114if (ifmt == R2D_UNORM8) {115float linear = val->color.float32[i];116if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)117linear = 
util_format_linear_to_srgb_float(val->color.float32[i]);118119if (ch->type == UTIL_FORMAT_TYPE_SIGNED)120clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);121else122clear_value[i] = tu_pack_float32_for_unorm(linear, 8);123} else if (ifmt == R2D_FLOAT16) {124clear_value[i] = _mesa_float_to_half(val->color.float32[i]);125} else {126assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||127ifmt == R2D_INT16 || ifmt == R2D_INT8);128clear_value[i] = val->color.uint32[i];129}130}131break;132}133134tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);135tu_cs_emit_array(cs, clear_value, 4);136}137138static void139r2d_src(struct tu_cmd_buffer *cmd,140struct tu_cs *cs,141const struct tu_image_view *iview,142uint32_t layer,143VkFilter filter)144{145uint32_t src_info = iview->SP_PS_2D_SRC_INFO;146if (filter != VK_FILTER_NEAREST)147src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;148149tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);150tu_cs_emit(cs, src_info);151tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);152tu_cs_image_ref_2d(cs, iview, layer, true);153154tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);155tu_cs_image_flag_ref(cs, iview, layer);156}157158static void159r2d_src_stencil(struct tu_cmd_buffer *cmd,160struct tu_cs *cs,161const struct tu_image_view *iview,162uint32_t layer,163VkFilter filter)164{165tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);166tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);167tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);168tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);169/* SP_PS_2D_SRC_PITCH has shifted pitch field */170tu_cs_emit(cs, iview->stencil_PITCH << 9);171}172173static void174r2d_src_buffer(struct tu_cmd_buffer *cmd,175struct tu_cs *cs,176VkFormat vk_format,177uint64_t va, uint32_t pitch,178uint32_t width, uint32_t height)179{180struct tu_native_format format = tu6_format_texture(vk_format, 
TILE6_LINEAR);181182tu_cs_emit_regs(cs,183A6XX_SP_PS_2D_SRC_INFO(184.color_format = format.fmt,185.color_swap = format.swap,186.srgb = vk_format_is_srgb(vk_format),187.unk20 = 1,188.unk22 = 1),189A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),190A6XX_SP_PS_2D_SRC(.qword = va),191A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));192}193194static void195r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)196{197tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);198tu_cs_emit(cs, iview->RB_2D_DST_INFO);199tu_cs_image_ref_2d(cs, iview, layer, false);200201tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);202tu_cs_image_flag_ref(cs, iview, layer);203}204205static void206r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)207{208tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);209tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);210tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);211tu_cs_emit(cs, iview->stencil_PITCH);212}213214static void215r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)216{217struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);218219tu_cs_emit_regs(cs,220A6XX_RB_2D_DST_INFO(221.color_format = format.fmt,222.color_swap = format.swap,223.srgb = vk_format_is_srgb(vk_format)),224A6XX_RB_2D_DST(.qword = va),225A6XX_RB_2D_DST_PITCH(pitch));226}227228static void229r2d_setup_common(struct tu_cmd_buffer *cmd,230struct tu_cs *cs,231VkFormat vk_format,232VkImageAspectFlags aspect_mask,233unsigned blit_param,234bool clear,235bool ubwc,236bool scissor)237{238enum a6xx_format format = tu6_base_format(vk_format);239enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);240uint32_t unknown_8c01 = 0;241242if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||243vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {244format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;245}246247/* note: the only format with 
partial clearing is D24S8 */248if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {249/* preserve stencil channel */250if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)251unknown_8c01 = 0x08000041;252/* preserve depth channels */253if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)254unknown_8c01 = 0x00084001;255}256257tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);258tu_cs_emit(cs, unknown_8c01);259260uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(261.scissor = scissor,262.rotate = blit_param,263.solid_color = clear,264.d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,265.color_format = format,266.mask = 0xf,267.ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,268).value;269270tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);271tu_cs_emit(cs, blit_cntl);272273tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);274tu_cs_emit(cs, blit_cntl);275276if (format == FMT6_10_10_10_2_UNORM_DEST)277format = FMT6_16_16_16_16_FLOAT;278279tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(280.sint = vk_format_is_sint(vk_format),281.uint = vk_format_is_uint(vk_format),282.color_format = format,283.srgb = vk_format_is_srgb(vk_format),284.mask = 0xf));285}286287static void288r2d_setup(struct tu_cmd_buffer *cmd,289struct tu_cs *cs,290VkFormat vk_format,291VkImageAspectFlags aspect_mask,292unsigned blit_param,293bool clear,294bool ubwc)295{296tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);297298r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false);299}300301static void302r2d_teardown(struct tu_cmd_buffer *cmd,303struct tu_cs *cs)304{305/* nothing to do here */306}307308static void309r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)310{311tu_cs_emit_pkt7(cs, CP_BLIT, 1);312tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));313}314315/* r3d_ = shader path operations */316317void318tu_init_clear_blit_shaders(struct tu6_global *global)319{320#define MOV(args...) 
{ .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }321#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }322#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }323324static const instr_t vs_code[] = {325/* r0.xyz = r0.w ? c1.xyz : c0.xyz326* r1.xy = r0.w ? c1.zw : c0.zw327* r1.z = c2.x (for z_scale path)328* r0.w = 1.0f329*/330CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,331.c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,332.src2 = 3,333.c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),334CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,335.c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,336.src2 = 3,337.c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),338MOV(.dst = 6, .src_c = 1, .src = 8 ),339MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),340{ .cat0 = { .opc = OPC_END } },341};342343static const instr_t fs_blit[] = {344/* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its345* blit path (its not clear what allows it to not have it)346*/347CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),348{ .cat0 = { .opc = OPC_END } },349};350351static const instr_t fs_blit_zscale[] = {352/* (rpt2)bary.f (ei)r0.x, (r)0, r0.x353* (rpt5)nop354* sam.3d (s32)(xyzw)r0.x, r0.x, s#0, t#0355*/356CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 0, .src1_im = 1, .src1 = 0, .repeat = 2, .src1_r = 1),357{ .cat0 = { .repeat = 5 } },358{ .cat5 = { .opc_cat = 5, .opc = OPC_SAM & 31, .dst = 0, .wrmask = 0xf, .type = TYPE_S32,359.is_3d = 1, .norm = { .full = 1, .src1 = 0 } } },360{ .cat0 = { .opc = OPC_END } },361};362363memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));364memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));365memcpy(&global->shaders[GLOBAL_SH_FS_BLIT_ZSCALE], fs_blit_zscale, sizeof(fs_blit_zscale));366367for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {368instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];369for (uint32_t i = 0; i < num_rts; 
i++) {370/* (rpt3)mov.s32s32 r0.x, (r)c[i].x */371*code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);372}373*code++ = (instr_t) { .cat0 = { .opc = OPC_END } };374}375}376377static void378r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,379bool layered_clear, bool z_scale)380{381struct ir3_const_state dummy_const_state = {};382struct ir3_shader dummy_shader = {383.compiler = cmd->device->compiler,384};385386struct ir3_shader_variant vs = {387.type = MESA_SHADER_VERTEX,388.instrlen = 1,389.constlen = 4,390.info.max_reg = 1,391.inputs_count = 1,392.inputs[0] = {393.slot = SYSTEM_VALUE_VERTEX_ID,394.regid = regid(0, 3),395.sysval = true,396},397.outputs_count = blit ? 2 : 1,398.outputs[0] = {399.slot = VARYING_SLOT_POS,400.regid = regid(0, 0),401},402.outputs[1] = {403.slot = VARYING_SLOT_VAR0,404.regid = regid(1, 0),405},406.shader = &dummy_shader,407.const_state = &dummy_const_state,408};409if (layered_clear) {410vs.outputs[1].slot = VARYING_SLOT_LAYER;411vs.outputs[1].regid = regid(1, 1);412vs.outputs_count = 2;413}414415struct ir3_shader_variant fs = {416.type = MESA_SHADER_FRAGMENT,417.instrlen = 1, /* max of 9 instructions with num_rts = 8 */418.constlen = align(num_rts, 4),419.info.max_reg = MAX2(num_rts, 1) - 1,420.total_in = blit ? 2 : 0,421.num_samp = blit ? 1 : 0,422.inputs_count = blit ? 2 : 0,423.inputs[0] = {424.slot = VARYING_SLOT_VAR0,425.inloc = 0,426.compmask = 3,427.bary = true,428},429.inputs[1] = {430.slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,431.regid = regid(0, 0),432.sysval = 1,433},434.num_sampler_prefetch = blit ? 
1 : 0,435.sampler_prefetch[0] = {436.src = 0,437.wrmask = 0xf,438.cmd = 4,439},440.shader = &dummy_shader,441.const_state = &dummy_const_state,442};443444enum global_shader fs_id = GLOBAL_SH_FS_BLIT;445446if (!blit)447fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;448449/* z_scale blit path has an extra varying and doesn't use prefetch */450if (z_scale) {451assert(blit);452fs.total_in = 3;453fs.num_sampler_prefetch = 0;454fs.inputs[0].compmask = 7;455fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;456}457458tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(459.vs_state = true,460.hs_state = true,461.ds_state = true,462.gs_state = true,463.fs_state = true,464.cs_state = true,465.gfx_ibo = true,466.cs_ibo = true,467.gfx_shared_const = true,468.gfx_bindless = 0x1f,469.cs_bindless = 0x1f));470471tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs);472tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);473tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);474tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);475tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs);476477struct tu_pvtmem_config pvtmem = {};478tu6_emit_xs(cs, MESA_SHADER_VERTEX, &vs, &pvtmem, global_iova(cmd, shaders[GLOBAL_SH_VS]));479tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, &fs, &pvtmem, global_iova(cmd, shaders[fs_id]));480481tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());482tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());483484if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {485/* Copy what the blob does here. This will emit an extra 0x3f486* CP_EVENT_WRITE when multiview is disabled. 
I'm not exactly sure what487* this is working around yet.488*/489tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);490tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));491tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);492tu_cs_emit(cs, 0);493} else {494tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());495}496tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());497498tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0);499500/* REPL_MODE for varying with RECTLIST (2 vertices only) */501tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));502tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));503504tu6_emit_fs_inputs(cs, &fs);505506tu_cs_emit_regs(cs,507A6XX_GRAS_CL_CNTL(508.persp_division_disable = 1,509.vp_xform_disable = 1,510.vp_clip_code_ignore = 1,511.clip_disable = 1));512tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?513514tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());515tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());516517tu_cs_emit_regs(cs,518A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),519A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));520tu_cs_emit_regs(cs,521A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),522A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));523524tu_cs_emit_regs(cs,525A6XX_VFD_INDEX_OFFSET(),526A6XX_VFD_INSTANCE_START_OFFSET());527}528529static void530r3d_coords_raw(struct tu_cs *cs, const float *coords)531{532tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);533tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |534CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |535CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |536CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |537CP_LOAD_STATE6_0_NUM_UNIT(2));538tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));539tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));540tu_cs_emit_array(cs, (const uint32_t *) coords, 8);541}542543/* z coordinate for "z scale" blit path which uses a 3d texture */544static void545r3d_coord_z(struct tu_cs *cs, float z)546{547tu_cs_emit_pkt7(cs, 
CP_LOAD_STATE6_GEOM, 3 + 4);548tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |549CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |550CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |551CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |552CP_LOAD_STATE6_0_NUM_UNIT(1));553tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));554tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));555tu_cs_emit(cs, fui(z));556tu_cs_emit(cs, 0);557tu_cs_emit(cs, 0);558tu_cs_emit(cs, 0);559}560561static void562r3d_coords(struct tu_cs *cs,563const VkOffset2D *dst,564const VkOffset2D *src,565const VkExtent2D *extent)566{567int32_t src_x1 = src ? src->x : 0;568int32_t src_y1 = src ? src->y : 0;569r3d_coords_raw(cs, (float[]) {570dst->x, dst->y,571src_x1, src_y1,572dst->x + extent->width, dst->y + extent->height,573src_x1 + extent->width, src_y1 + extent->height,574});575}576577static void578r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)579{580tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);581tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |582CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |583CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |584CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |585CP_LOAD_STATE6_0_NUM_UNIT(1));586tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));587tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));588switch (format) {589case VK_FORMAT_X8_D24_UNORM_PACK32:590case VK_FORMAT_D24_UNORM_S8_UINT: {591/* cleared as r8g8b8a8_unorm using special format */592uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);593tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));594tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));595tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));596tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));597} break;598case VK_FORMAT_D16_UNORM:599case VK_FORMAT_D32_SFLOAT:600tu_cs_emit(cs, fui(val->depthStencil.depth));601tu_cs_emit(cs, 0);602tu_cs_emit(cs, 0);603tu_cs_emit(cs, 0);604break;605case VK_FORMAT_S8_UINT:606tu_cs_emit(cs, 
val->depthStencil.stencil & 0xff);607tu_cs_emit(cs, 0);608tu_cs_emit(cs, 0);609tu_cs_emit(cs, 0);610break;611default:612/* as color formats use clear value as-is */613assert(!vk_format_is_depth_or_stencil(format));614tu_cs_emit_array(cs, val->color.uint32, 4);615break;616}617}618619static void620r3d_src_common(struct tu_cmd_buffer *cmd,621struct tu_cs *cs,622const uint32_t *tex_const,623uint32_t offset_base,624uint32_t offset_ubwc,625VkFilter filter)626{627struct tu_cs_memory texture = { };628VkResult result = tu_cs_alloc(&cmd->sub_cs,6292, /* allocate space for a sampler too */630A6XX_TEX_CONST_DWORDS, &texture);631if (result != VK_SUCCESS) {632cmd->record_result = result;633return;634}635636memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);637638/* patch addresses for layer offset */639*(uint64_t*) (texture.map + 4) += offset_base;640uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;641texture.map[7] = ubwc_addr;642texture.map[8] = ubwc_addr >> 32;643644texture.map[A6XX_TEX_CONST_DWORDS + 0] =645A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |646A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |647A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |648A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |649A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |6500x60000; /* XXX used by blob, doesn't seem necessary */651texture.map[A6XX_TEX_CONST_DWORDS + 1] =6520x1 | /* XXX used by blob, doesn't seem necessary */653A6XX_TEX_SAMP_1_UNNORM_COORDS |654A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;655texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;656texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;657658tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);659tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |660CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |661CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |662CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |663CP_LOAD_STATE6_0_NUM_UNIT(1));664tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);665666tu_cs_emit_regs(cs, 
A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));667668tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);669tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |670CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |671CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |672CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |673CP_LOAD_STATE6_0_NUM_UNIT(1));674tu_cs_emit_qw(cs, texture.iova);675676tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));677tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));678}679680static void681r3d_src(struct tu_cmd_buffer *cmd,682struct tu_cs *cs,683const struct tu_image_view *iview,684uint32_t layer,685VkFilter filter)686{687r3d_src_common(cmd, cs, iview->descriptor,688iview->layer_size * layer,689iview->ubwc_layer_size * layer,690filter);691}692693static void694r3d_src_buffer(struct tu_cmd_buffer *cmd,695struct tu_cs *cs,696VkFormat vk_format,697uint64_t va, uint32_t pitch,698uint32_t width, uint32_t height)699{700uint32_t desc[A6XX_TEX_CONST_DWORDS];701702struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);703704desc[0] =705COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |706A6XX_TEX_CONST_0_FMT(format.fmt) |707A6XX_TEX_CONST_0_SWAP(format.swap) |708A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |709// XXX to swizzle into .w for stencil buffer_to_image710A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |711A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |712A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? 
A6XX_TEX_X : A6XX_TEX_W);713desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);714desc[2] =715A6XX_TEX_CONST_2_PITCH(pitch) |716A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);717desc[3] = 0;718desc[4] = va;719desc[5] = va >> 32;720for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)721desc[i] = 0;722723r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);724}725726static void727r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)728{729tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */730731tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);732tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);733tu_cs_image_ref(cs, iview, layer);734tu_cs_emit(cs, 0);735736tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);737tu_cs_image_flag_ref(cs, iview, layer);738739tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));740}741742static void743r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)744{745tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */746747tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);748tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));749tu_cs_image_stencil_ref(cs, iview, layer);750tu_cs_emit(cs, 0);751752tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());753}754755static void756r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)757{758struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);759760tu6_emit_msaa(cs, 1); /* TODO: move to setup */761762tu_cs_emit_regs(cs,763A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),764A6XX_RB_MRT_PITCH(0, pitch),765A6XX_RB_MRT_ARRAY_PITCH(0, 0),766A6XX_RB_MRT_BASE(0, .qword = va),767A6XX_RB_MRT_BASE_GMEM(0, 0));768769tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());770}771772static uint8_t773aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)774{775uint8_t mask = 
0xf;776assert(aspect_mask);777/* note: the only format with partial writing is D24S8,778* clear/blit uses the _AS_R8G8B8A8 format to access it779*/780if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {781if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)782mask = 0x7;783if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)784mask = 0x8;785}786return mask;787}788789static void790r3d_setup(struct tu_cmd_buffer *cmd,791struct tu_cs *cs,792VkFormat vk_format,793VkImageAspectFlags aspect_mask,794unsigned blit_param,795bool clear,796bool ubwc)797{798enum a6xx_format format = tu6_base_format(vk_format);799800if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||801vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {802format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;803}804805if (!cmd->state.pass) {806tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);807tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);808}809810tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));811tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));812813r3d_common(cmd, cs, !clear, clear ? 
1 : 0, false, blit_param);814815tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);816tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |817A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |8180xfc000000);819tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));820821tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);822tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));823824tu_cs_emit_regs(cs,825A6XX_RB_FS_OUTPUT_CNTL0(),826A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));827828tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());829tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));830831tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());832tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());833tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());834tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());835tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());836tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());837tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());838839tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));840tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));841842tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,843.color_format = format,844.color_sint = vk_format_is_sint(vk_format),845.color_uint = vk_format_is_uint(vk_format)));846847tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,848.component_enable = aspect_write_mask(vk_format, aspect_mask)));849tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));850tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));851852tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));853tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));854855if (cmd->state.predication_active) {856tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);857tu_cs_emit(cs, 0);858}859}860861static void862r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)863{864tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);865tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |866CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) 
|867CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));868tu_cs_emit(cs, 1); /* instance count */869tu_cs_emit(cs, 2); /* vertex count */870}871872static void873r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)874{875if (cmd->state.predication_active) {876tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);877tu_cs_emit(cs, 1);878}879}880881/* blit ops - common interface for 2d/shader paths */882883struct blit_ops {884void (*coords)(struct tu_cs *cs,885const VkOffset2D *dst,886const VkOffset2D *src,887const VkExtent2D *extent);888void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);889void (*src)(890struct tu_cmd_buffer *cmd,891struct tu_cs *cs,892const struct tu_image_view *iview,893uint32_t layer,894VkFilter filter);895void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,896VkFormat vk_format,897uint64_t va, uint32_t pitch,898uint32_t width, uint32_t height);899void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);900void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);901void (*setup)(struct tu_cmd_buffer *cmd,902struct tu_cs *cs,903VkFormat vk_format,904VkImageAspectFlags aspect_mask,905unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */906bool clear,907bool ubwc);908void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);909void (*teardown)(struct tu_cmd_buffer *cmd,910struct tu_cs *cs);911};912913static const struct blit_ops r2d_ops = {914.coords = r2d_coords,915.clear_value = r2d_clear_value,916.src = r2d_src,917.src_buffer = r2d_src_buffer,918.dst = r2d_dst,919.dst_buffer = r2d_dst_buffer,920.setup = r2d_setup,921.run = r2d_run,922.teardown = r2d_teardown,923};924925static const struct blit_ops r3d_ops = {926.coords = r3d_coords,927.clear_value = r3d_clear_value,928.src = r3d_src,929.src_buffer = r3d_src_buffer,930.dst = r3d_dst,931.dst_buffer = r3d_dst_buffer,932.setup = r3d_setup,933.run = r3d_run,934.teardown = 
r3d_teardown,935};936937/* passthrough set coords from 3D extents */938static void939coords(const struct blit_ops *ops,940struct tu_cs *cs,941const VkOffset3D *dst,942const VkOffset3D *src,943const VkExtent3D *extent)944{945ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);946}947948/* Decides the VK format to treat our data as for a memcpy-style blit. We have949* to be a bit careful because we have to pick a format with matching UBWC950* compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for951* everything.952*/953static VkFormat954copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)955{956if (vk_format_is_compressed(format)) {957switch (vk_format_get_blocksize(format)) {958case 1: return VK_FORMAT_R8_UINT;959case 2: return VK_FORMAT_R16_UINT;960case 4: return VK_FORMAT_R32_UINT;961case 8: return VK_FORMAT_R32G32_UINT;962case 16:return VK_FORMAT_R32G32B32A32_UINT;963default:964unreachable("unhandled format size");965}966}967968switch (format) {969/* For SNORM formats, copy them as the equivalent UNORM format. If we treat970* them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81971* (also -1.0), when we're supposed to be memcpying the bits. 
See972* https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.973*/974case VK_FORMAT_R8_SNORM:975return VK_FORMAT_R8_UNORM;976case VK_FORMAT_R8G8_SNORM:977return VK_FORMAT_R8G8_UNORM;978case VK_FORMAT_R8G8B8_SNORM:979return VK_FORMAT_R8G8B8_UNORM;980case VK_FORMAT_B8G8R8_SNORM:981return VK_FORMAT_B8G8R8_UNORM;982case VK_FORMAT_R8G8B8A8_SNORM:983return VK_FORMAT_R8G8B8A8_UNORM;984case VK_FORMAT_B8G8R8A8_SNORM:985return VK_FORMAT_B8G8R8A8_UNORM;986case VK_FORMAT_A8B8G8R8_SNORM_PACK32:987return VK_FORMAT_A8B8G8R8_UNORM_PACK32;988case VK_FORMAT_A2R10G10B10_SNORM_PACK32:989return VK_FORMAT_A2R10G10B10_UNORM_PACK32;990case VK_FORMAT_A2B10G10R10_SNORM_PACK32:991return VK_FORMAT_A2B10G10R10_UNORM_PACK32;992case VK_FORMAT_R16_SNORM:993return VK_FORMAT_R16_UNORM;994case VK_FORMAT_R16G16_SNORM:995return VK_FORMAT_R16G16_UNORM;996case VK_FORMAT_R16G16B16_SNORM:997return VK_FORMAT_R16G16B16_UNORM;998case VK_FORMAT_R16G16B16A16_SNORM:999return VK_FORMAT_R16G16B16A16_UNORM;10001001case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:1002return VK_FORMAT_R32_UINT;10031004case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:1005if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)1006return VK_FORMAT_R8G8_UNORM;1007else1008return VK_FORMAT_R8_UNORM;1009case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:1010return VK_FORMAT_R8_UNORM;10111012case VK_FORMAT_D24_UNORM_S8_UINT:1013if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)1014return VK_FORMAT_R8_UNORM;1015else1016return format;10171018case VK_FORMAT_D32_SFLOAT_S8_UINT:1019if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)1020return VK_FORMAT_S8_UINT;1021assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);1022return VK_FORMAT_D32_SFLOAT;10231024default:1025return format;1026}1027}10281029void1030tu6_clear_lrz(struct tu_cmd_buffer *cmd,1031struct tu_cs *cs,1032struct tu_image *image,1033const VkClearValue *value)1034{1035const struct blit_ops *ops = &r2d_ops;10361037ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, 
false);1038ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);1039ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,1040image->bo->iova + image->bo_offset + image->lrz_offset,1041image->lrz_pitch * 2);1042ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});1043ops->run(cmd, cs);1044ops->teardown(cmd, cs);1045}10461047static void1048tu_image_view_copy_blit(struct tu_image_view *iview,1049struct tu_image *image,1050VkFormat format,1051const VkImageSubresourceLayers *subres,1052uint32_t layer,1053bool stencil_read,1054bool z_scale)1055{1056VkImageAspectFlags aspect_mask = subres->aspectMask;10571058/* always use the AS_R8G8B8A8 format for these */1059if (format == VK_FORMAT_D24_UNORM_S8_UINT ||1060format == VK_FORMAT_X8_D24_UNORM_PACK32) {1061aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;1062}10631064tu_image_view_init(iview, &(VkImageViewCreateInfo) {1065.image = tu_image_to_handle(image),1066.viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D,1067.format = format,1068/* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */1069.components.r = stencil_read ? 
VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,1070.subresourceRange = {1071.aspectMask = aspect_mask,1072.baseMipLevel = subres->mipLevel,1073.levelCount = 1,1074.baseArrayLayer = subres->baseArrayLayer + layer,1075.layerCount = 1,1076},1077}, false);1078}10791080static void1081tu_image_view_copy(struct tu_image_view *iview,1082struct tu_image *image,1083VkFormat format,1084const VkImageSubresourceLayers *subres,1085uint32_t layer,1086bool stencil_read)1087{1088format = copy_format(format, subres->aspectMask, false);1089tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false);1090}10911092static void1093tu_image_view_blit(struct tu_image_view *iview,1094struct tu_image *image,1095const VkImageSubresourceLayers *subres,1096uint32_t layer)1097{1098tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false);1099}11001101static void1102tu6_blit_image(struct tu_cmd_buffer *cmd,1103struct tu_image *src_image,1104struct tu_image *dst_image,1105const VkImageBlit *info,1106VkFilter filter)1107{1108const struct blit_ops *ops = &r2d_ops;1109struct tu_cs *cs = &cmd->cs;1110bool z_scale = false;1111uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;11121113/* 2D blit can't do rotation mirroring from just coordinates */1114static const enum a6xx_rotation rotate[2][2] = {1115{ROTATE_0, ROTATE_HFLIP},1116{ROTATE_VFLIP, ROTATE_180},1117};11181119bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=1120(info->dstOffsets[1].x < info->dstOffsets[0].x);1121bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=1122(info->dstOffsets[1].y < info->dstOffsets[0].y);11231124int32_t src0_z = info->srcOffsets[0].z;1125int32_t src1_z = info->srcOffsets[1].z;11261127if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=1128info->dstOffsets[1].z - info->dstOffsets[0].z) ||1129info->srcOffsets[1].z < info->srcOffsets[0].z) {1130z_scale = true;1131}11321133if (info->dstOffsets[1].z < info->dstOffsets[0].z) 
{1134layers = info->dstOffsets[0].z - info->dstOffsets[1].z;1135src0_z = info->srcOffsets[1].z;1136src1_z = info->srcOffsets[0].z;1137}11381139if (info->dstSubresource.layerCount > 1) {1140assert(layers <= 1);1141layers = info->dstSubresource.layerCount;1142}11431144/* BC1_RGB_* formats need to have their last components overriden with 11145* when sampling, which is normally handled with the texture descriptor1146* swizzle. The 2d path can't handle that, so use the 3d path.1147*1148* TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with1149* the 2d path.1150*/11511152unsigned blit_param = rotate[mirror_y][mirror_x];1153if (dst_image->layout[0].nr_samples > 1 ||1154src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||1155src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||1156filter == VK_FILTER_CUBIC_EXT ||1157z_scale) {1158ops = &r3d_ops;1159blit_param = z_scale;1160}11611162/* use the right format in setup() for D32_S81163* TODO: this probably should use a helper1164*/1165VkFormat format = dst_image->vk_format;1166if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1167if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)1168format = VK_FORMAT_D32_SFLOAT;1169else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)1170format = VK_FORMAT_S8_UINT;1171else1172unreachable("unexpected D32_S8 aspect mask in blit_image");1173}11741175ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,1176blit_param, false, dst_image->layout[0].ubwc);11771178if (ops == &r3d_ops) {1179r3d_coords_raw(cs, (float[]) {1180info->dstOffsets[0].x, info->dstOffsets[0].y,1181info->srcOffsets[0].x, info->srcOffsets[0].y,1182info->dstOffsets[1].x, info->dstOffsets[1].y,1183info->srcOffsets[1].x, info->srcOffsets[1].y1184});1185} else {1186tu_cs_emit_regs(cs,1187A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),1188.y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),1189A6XX_GRAS_2D_DST_BR(.x = 
MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,1190.y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));1191tu_cs_emit_regs(cs,1192A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),1193A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),1194A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),1195A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));1196}11971198struct tu_image_view dst, src;1199tu_image_view_blit(&dst, dst_image, &info->dstSubresource,1200MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));12011202if (z_scale) {1203tu_image_view_copy_blit(&src, src_image, src_image->vk_format,1204&info->srcSubresource, 0, false, true);1205ops->src(cmd, cs, &src, 0, filter);1206} else {1207tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);1208}12091210for (uint32_t i = 0; i < layers; i++) {1211if (z_scale) {1212float t = ((float) i + 0.5f) / (float) layers;1213r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);1214} else {1215ops->src(cmd, cs, &src, i, filter);1216}1217ops->dst(cs, &dst, i);1218ops->run(cmd, cs);1219}12201221ops->teardown(cmd, cs);1222}12231224VKAPI_ATTR void VKAPI_CALL1225tu_CmdBlitImage(VkCommandBuffer commandBuffer,1226VkImage srcImage,1227VkImageLayout srcImageLayout,1228VkImage dstImage,1229VkImageLayout dstImageLayout,1230uint32_t regionCount,1231const VkImageBlit *pRegions,1232VkFilter filter)12331234{1235TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1236TU_FROM_HANDLE(tu_image, src_image, srcImage);1237TU_FROM_HANDLE(tu_image, dst_image, dstImage);12381239for (uint32_t i = 0; i < regionCount; ++i) {1240/* can't blit both depth and stencil at once with D32_S81241* TODO: more advanced 3D blit path to support it instead?1242*/1243if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||1244dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1245VkImageBlit region = 
pRegions[i];1246u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {1247region.srcSubresource.aspectMask = BIT(b);1248region.dstSubresource.aspectMask = BIT(b);1249tu6_blit_image(cmd, src_image, dst_image, ®ion, filter);1250}1251continue;1252}1253tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);1254}1255}12561257static void1258copy_compressed(VkFormat format,1259VkOffset3D *offset,1260VkExtent3D *extent,1261uint32_t *width,1262uint32_t *height)1263{1264if (!vk_format_is_compressed(format))1265return;12661267uint32_t block_width = vk_format_get_blockwidth(format);1268uint32_t block_height = vk_format_get_blockheight(format);12691270offset->x /= block_width;1271offset->y /= block_height;12721273if (extent) {1274extent->width = DIV_ROUND_UP(extent->width, block_width);1275extent->height = DIV_ROUND_UP(extent->height, block_height);1276}1277if (width)1278*width = DIV_ROUND_UP(*width, block_width);1279if (height)1280*height = DIV_ROUND_UP(*height, block_height);1281}12821283static void1284tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,1285struct tu_buffer *src_buffer,1286struct tu_image *dst_image,1287const VkBufferImageCopy *info)1288{1289struct tu_cs *cs = &cmd->cs;1290uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);1291VkFormat src_format =1292copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);1293const struct blit_ops *ops = &r2d_ops;12941295/* special case for buffer to stencil */1296if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&1297info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {1298ops = &r3d_ops;1299}13001301/* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,1302* which matters for UBWC. 
buffer_to_image/etc can fail because of this1303*/13041305VkOffset3D offset = info->imageOffset;1306VkExtent3D extent = info->imageExtent;1307uint32_t src_width = info->bufferRowLength ?: extent.width;1308uint32_t src_height = info->bufferImageHeight ?: extent.height;13091310copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);13111312uint32_t pitch = src_width * vk_format_get_blocksize(src_format);1313uint32_t layer_size = src_height * pitch;13141315ops->setup(cmd, cs,1316copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),1317info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc);13181319struct tu_image_view dst;1320tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);13211322for (uint32_t i = 0; i < layers; i++) {1323ops->dst(cs, &dst, i);13241325uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;1326if ((src_va & 63) || (pitch & 63)) {1327for (uint32_t y = 0; y < extent.height; y++) {1328uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);1329ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,1330x + extent.width, 1);1331ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},1332&(VkExtent2D) {extent.width, 1});1333ops->run(cmd, cs);1334src_va += pitch;1335}1336} else {1337ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);1338coords(ops, cs, &offset, &(VkOffset3D){}, &extent);1339ops->run(cmd, cs);1340}1341}13421343ops->teardown(cmd, cs);1344}13451346VKAPI_ATTR void VKAPI_CALL1347tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,1348VkBuffer srcBuffer,1349VkImage dstImage,1350VkImageLayout dstImageLayout,1351uint32_t regionCount,1352const VkBufferImageCopy *pRegions)1353{1354TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1355TU_FROM_HANDLE(tu_image, dst_image, dstImage);1356TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);13571358for (unsigned 
i = 0; i < regionCount; ++i)1359tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);1360}13611362static void1363tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,1364struct tu_image *src_image,1365struct tu_buffer *dst_buffer,1366const VkBufferImageCopy *info)1367{1368struct tu_cs *cs = &cmd->cs;1369uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);1370VkFormat dst_format =1371copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);1372bool stencil_read = false;13731374if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&1375info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {1376stencil_read = true;1377}13781379const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;1380VkOffset3D offset = info->imageOffset;1381VkExtent3D extent = info->imageExtent;1382uint32_t dst_width = info->bufferRowLength ?: extent.width;1383uint32_t dst_height = info->bufferImageHeight ?: extent.height;13841385copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);13861387uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);1388uint32_t layer_size = pitch * dst_height;13891390ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false);13911392struct tu_image_view src;1393tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);13941395for (uint32_t i = 0; i < layers; i++) {1396ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);13971398uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;1399if ((dst_va & 63) || (pitch & 63)) {1400for (uint32_t y = 0; y < extent.height; y++) {1401uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);1402ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);1403ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},1404&(VkExtent2D) {extent.width, 1});1405ops->run(cmd, cs);1406dst_va += pitch;1407}1408} else 
{1409ops->dst_buffer(cs, dst_format, dst_va, pitch);1410coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);1411ops->run(cmd, cs);1412}1413}14141415ops->teardown(cmd, cs);1416}14171418VKAPI_ATTR void VKAPI_CALL1419tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,1420VkImage srcImage,1421VkImageLayout srcImageLayout,1422VkBuffer dstBuffer,1423uint32_t regionCount,1424const VkBufferImageCopy *pRegions)1425{1426TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1427TU_FROM_HANDLE(tu_image, src_image, srcImage);1428TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);14291430for (unsigned i = 0; i < regionCount; ++i)1431tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);1432}14331434/* Tiled formats don't support swapping, which means that we can't support1435* formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some1436* formats like B5G5R5A1 have a separate linear-only format when sampling.1437* Currently we fake support for tiled swapped formats and use the unswapped1438* format instead, but this means that reinterpreting copies to and from1439* swapped formats can't be performed correctly unless we can swizzle the1440* components by reinterpreting the other image as the "correct" swapped1441* format, i.e. only when the other image is linear.1442*/14431444static bool1445is_swapped_format(VkFormat format)1446{1447struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);1448struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);1449return linear.fmt != tiled.fmt || linear.swap != tiled.swap;1450}14511452/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and1453* therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice1454* versa). 
This should mirror the logic in fdl6_layout.1455*/1456static bool1457image_is_r8g8(struct tu_image *image)1458{1459return image->layout[0].cpp == 2 &&1460vk_format_get_nr_components(image->vk_format) == 2;1461}14621463static void1464tu_copy_image_to_image(struct tu_cmd_buffer *cmd,1465struct tu_image *src_image,1466struct tu_image *dst_image,1467const VkImageCopy *info)1468{1469const struct blit_ops *ops = &r2d_ops;1470struct tu_cs *cs = &cmd->cs;14711472if (dst_image->layout[0].nr_samples > 1)1473ops = &r3d_ops;14741475VkFormat format = VK_FORMAT_UNDEFINED;1476VkOffset3D src_offset = info->srcOffset;1477VkOffset3D dst_offset = info->dstOffset;1478VkExtent3D extent = info->extent;1479uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);14801481/* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between1482* Images":1483*1484* When copying between compressed and uncompressed formats the extent1485* members represent the texel dimensions of the source image and not1486* the destination. When copying from a compressed image to an1487* uncompressed image the image texel dimensions written to the1488* uncompressed image will be source extent divided by the compressed1489* texel block dimensions. 
When copying from an uncompressed image to a1490* compressed image the image texel dimensions written to the compressed1491* image will be the source extent multiplied by the compressed texel1492* block dimensions.1493*1494* This means we only have to adjust the extent if the source image is1495* compressed.1496*/1497copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);1498copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);14991500VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);1501VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);15021503bool use_staging_blit = false;15041505if (src_format == dst_format) {1506/* Images that share a format can always be copied directly because it's1507* the same as a blit.1508*/1509format = src_format;1510} else if (!src_image->layout[0].tile_mode) {1511/* If an image is linear, we can always safely reinterpret it with the1512* other image's format and then do a regular blit.1513*/1514format = dst_format;1515} else if (!dst_image->layout[0].tile_mode) {1516format = src_format;1517} else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {1518/* We can't currently copy r8g8 images to/from other cpp=2 images,1519* due to the different tile layout.1520*/1521use_staging_blit = true;1522} else if (is_swapped_format(src_format) ||1523is_swapped_format(dst_format)) {1524/* If either format has a non-identity swap, then we can't copy1525* to/from it.1526*/1527use_staging_blit = true;1528} else if (!src_image->layout[0].ubwc) {1529format = dst_format;1530} else if (!dst_image->layout[0].ubwc) {1531format = src_format;1532} else {1533/* Both formats use UBWC and so neither can be reinterpreted.1534* TODO: We could do an in-place decompression of the dst instead.1535*/1536use_staging_blit = true;1537}15381539struct tu_image_view dst, src;15401541if (use_staging_blit) {1542tu_image_view_copy(&dst, dst_image, 
dst_format, &info->dstSubresource, dst_offset.z, false);1543tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);15441545struct tu_image staging_image = {1546.vk_format = src_format,1547.level_count = 1,1548.layer_count = info->srcSubresource.layerCount,1549.bo_offset = 0,1550};15511552VkImageSubresourceLayers staging_subresource = {1553.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,1554.mipLevel = 0,1555.baseArrayLayer = 0,1556.layerCount = info->srcSubresource.layerCount,1557};15581559VkOffset3D staging_offset = { 0 };15601561staging_image.layout[0].tile_mode = TILE6_LINEAR;1562staging_image.layout[0].ubwc = false;15631564fdl6_layout(&staging_image.layout[0],1565vk_format_to_pipe_format(staging_image.vk_format),1566src_image->layout[0].nr_samples,1567extent.width,1568extent.height,1569extent.depth,1570staging_image.level_count,1571staging_image.layer_count,1572extent.depth > 1,1573NULL);15741575VkResult result = tu_get_scratch_bo(cmd->device,1576staging_image.layout[0].size,1577&staging_image.bo);1578if (result != VK_SUCCESS) {1579cmd->record_result = result;1580return;1581}15821583struct tu_image_view staging;1584tu_image_view_copy(&staging, &staging_image, src_format,1585&staging_subresource, 0, false);15861587ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false);1588coords(ops, cs, &staging_offset, &src_offset, &extent);15891590for (uint32_t i = 0; i < layers_to_copy; i++) {1591ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);1592ops->dst(cs, &staging, i);1593ops->run(cmd, cs);1594}15951596/* When executed by the user there has to be a pipeline barrier here,1597* but since we're doing it manually we'll have to flush ourselves.1598*/1599tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);1600tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);16011602tu_image_view_copy(&staging, &staging_image, dst_format,1603&staging_subresource, 0, false);16041605ops->setup(cmd, cs, dst_format, 
info->dstSubresource.aspectMask,16060, false, dst_image->layout[0].ubwc);1607coords(ops, cs, &dst_offset, &staging_offset, &extent);16081609for (uint32_t i = 0; i < layers_to_copy; i++) {1610ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);1611ops->dst(cs, &dst, i);1612ops->run(cmd, cs);1613}1614} else {1615tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);1616tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);16171618ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,16190, false, dst_image->layout[0].ubwc);1620coords(ops, cs, &dst_offset, &src_offset, &extent);16211622for (uint32_t i = 0; i < layers_to_copy; i++) {1623ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);1624ops->dst(cs, &dst, i);1625ops->run(cmd, cs);1626}1627}16281629ops->teardown(cmd, cs);1630}16311632VKAPI_ATTR void VKAPI_CALL1633tu_CmdCopyImage(VkCommandBuffer commandBuffer,1634VkImage srcImage,1635VkImageLayout srcImageLayout,1636VkImage destImage,1637VkImageLayout destImageLayout,1638uint32_t regionCount,1639const VkImageCopy *pRegions)1640{1641TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1642TU_FROM_HANDLE(tu_image, src_image, srcImage);1643TU_FROM_HANDLE(tu_image, dst_image, destImage);16441645for (uint32_t i = 0; i < regionCount; ++i) {1646if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1647VkImageCopy info = pRegions[i];1648u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {1649info.srcSubresource.aspectMask = BIT(b);1650info.dstSubresource.aspectMask = BIT(b);1651tu_copy_image_to_image(cmd, src_image, dst_image, &info);1652}1653continue;1654}16551656tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);1657}1658}16591660static void1661copy_buffer(struct tu_cmd_buffer *cmd,1662uint64_t dst_va,1663uint64_t src_va,1664uint64_t size,1665uint32_t block_size)1666{1667const struct blit_ops *ops = &r2d_ops;1668struct tu_cs *cs = &cmd->cs;1669VkFormat format = block_size == 4 ? 
VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;1670uint64_t blocks = size / block_size;16711672ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false);16731674while (blocks) {1675uint32_t src_x = (src_va & 63) / block_size;1676uint32_t dst_x = (dst_va & 63) / block_size;1677uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);16781679ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);1680ops->dst_buffer( cs, format, dst_va & ~63, 0);1681ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});1682ops->run(cmd, cs);16831684src_va += width * block_size;1685dst_va += width * block_size;1686blocks -= width;1687}16881689ops->teardown(cmd, cs);1690}16911692VKAPI_ATTR void VKAPI_CALL1693tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,1694VkBuffer srcBuffer,1695VkBuffer dstBuffer,1696uint32_t regionCount,1697const VkBufferCopy *pRegions)1698{1699TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1700TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);1701TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);17021703for (unsigned i = 0; i < regionCount; ++i) {1704copy_buffer(cmd,1705tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,1706tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,1707pRegions[i].size, 1);1708}1709}17101711VKAPI_ATTR void VKAPI_CALL1712tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,1713VkBuffer dstBuffer,1714VkDeviceSize dstOffset,1715VkDeviceSize dataSize,1716const void *pData)1717{1718TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1719TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);17201721struct tu_cs_memory tmp;1722VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);1723if (result != VK_SUCCESS) {1724cmd->record_result = result;1725return;1726}17271728memcpy(tmp.map, pData, dataSize);1729copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);1730}17311732VKAPI_ATTR void VKAPI_CALL1733tu_CmdFillBuffer(VkCommandBuffer 
commandBuffer,1734VkBuffer dstBuffer,1735VkDeviceSize dstOffset,1736VkDeviceSize fillSize,1737uint32_t data)1738{1739TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1740TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);1741const struct blit_ops *ops = &r2d_ops;1742struct tu_cs *cs = &cmd->cs;17431744if (fillSize == VK_WHOLE_SIZE)1745fillSize = buffer->size - dstOffset;17461747uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;1748uint32_t blocks = fillSize / 4;17491750ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false);1751ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});17521753while (blocks) {1754uint32_t dst_x = (dst_va & 63) / 4;1755uint32_t width = MIN2(blocks, 0x4000 - dst_x);17561757ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);1758ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});1759ops->run(cmd, cs);17601761dst_va += width * 4;1762blocks -= width;1763}17641765ops->teardown(cmd, cs);1766}17671768VKAPI_ATTR void VKAPI_CALL1769tu_CmdResolveImage(VkCommandBuffer commandBuffer,1770VkImage srcImage,1771VkImageLayout srcImageLayout,1772VkImage dstImage,1773VkImageLayout dstImageLayout,1774uint32_t regionCount,1775const VkImageResolve *pRegions)1776{1777TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1778TU_FROM_HANDLE(tu_image, src_image, srcImage);1779TU_FROM_HANDLE(tu_image, dst_image, dstImage);1780const struct blit_ops *ops = &r2d_ops;1781struct tu_cs *cs = &cmd->cs;17821783ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,17840, false, dst_image->layout[0].ubwc);17851786for (uint32_t i = 0; i < regionCount; ++i) {1787const VkImageResolve *info = &pRegions[i];1788uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);17891790assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);1791/* TODO: aspect masks possible ? 
*/17921793coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);17941795struct tu_image_view dst, src;1796tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);1797tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);17981799for (uint32_t i = 0; i < layers; i++) {1800ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);1801ops->dst(cs, &dst, i);1802ops->run(cmd, cs);1803}1804}18051806ops->teardown(cmd, cs);1807}18081809#define for_each_layer(layer, layer_mask, layers) \1810for (uint32_t layer = 0; \1811layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \1812layer++) \1813if (!layer_mask || (layer_mask & BIT(layer)))18141815static void1816resolve_sysmem(struct tu_cmd_buffer *cmd,1817struct tu_cs *cs,1818VkFormat format,1819struct tu_image_view *src,1820struct tu_image_view *dst,1821uint32_t layer_mask,1822uint32_t layers,1823const VkRect2D *rect,1824bool separate_stencil)1825{1826const struct blit_ops *ops = &r2d_ops;18271828ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,18290, false, dst->ubwc_enabled);1830ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);18311832for_each_layer(i, layer_mask, layers) {1833if (separate_stencil) {1834r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);1835r2d_dst_stencil(cs, dst, i);1836} else {1837ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);1838ops->dst(cs, dst, i);1839}1840ops->run(cmd, cs);1841}18421843ops->teardown(cmd, cs);1844}18451846void1847tu_resolve_sysmem(struct tu_cmd_buffer *cmd,1848struct tu_cs *cs,1849struct tu_image_view *src,1850struct tu_image_view *dst,1851uint32_t layer_mask,1852uint32_t layers,1853const VkRect2D *rect)1854{1855assert(src->image->vk_format == dst->image->vk_format);18561857if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1858resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,1859src, dst, layer_mask, layers, rect, false);1860resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,1861src, dst, layer_mask, layers, 
rect, true);1862} else {1863resolve_sysmem(cmd, cs, dst->image->vk_format,1864src, dst, layer_mask, layers, rect, false);1865}1866}18671868static void1869clear_image(struct tu_cmd_buffer *cmd,1870struct tu_image *image,1871const VkClearValue *clear_value,1872const VkImageSubresourceRange *range,1873VkImageAspectFlags aspect_mask)1874{1875uint32_t level_count = tu_get_levelCount(image, range);1876uint32_t layer_count = tu_get_layerCount(image, range);1877struct tu_cs *cs = &cmd->cs;1878VkFormat format = image->vk_format;1879if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)1880format = copy_format(format, aspect_mask, false);18811882if (image->layout[0].depth0 > 1) {1883assert(layer_count == 1);1884assert(range->baseArrayLayer == 0);1885}18861887const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;18881889ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc);1890if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)1891ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);1892else1893ops->clear_value(cs, format, clear_value);18941895for (unsigned j = 0; j < level_count; j++) {1896if (image->layout[0].depth0 > 1)1897layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);18981899ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {1900u_minify(image->layout[0].width0, range->baseMipLevel + j),1901u_minify(image->layout[0].height0, range->baseMipLevel + j)1902});19031904struct tu_image_view dst;1905tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {1906.aspectMask = aspect_mask,1907.mipLevel = range->baseMipLevel + j,1908.baseArrayLayer = range->baseArrayLayer,1909.layerCount = 1,1910}, 0, false, false);19111912for (uint32_t i = 0; i < layer_count; i++) {1913ops->dst(cs, &dst, i);1914ops->run(cmd, cs);1915}1916}19171918ops->teardown(cmd, cs);1919}19201921VKAPI_ATTR void VKAPI_CALL1922tu_CmdClearColorImage(VkCommandBuffer 
commandBuffer,1923VkImage image_h,1924VkImageLayout imageLayout,1925const VkClearColorValue *pColor,1926uint32_t rangeCount,1927const VkImageSubresourceRange *pRanges)1928{1929TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1930TU_FROM_HANDLE(tu_image, image, image_h);19311932for (unsigned i = 0; i < rangeCount; i++)1933clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);1934}19351936VKAPI_ATTR void VKAPI_CALL1937tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,1938VkImage image_h,1939VkImageLayout imageLayout,1940const VkClearDepthStencilValue *pDepthStencil,1941uint32_t rangeCount,1942const VkImageSubresourceRange *pRanges)1943{1944TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);1945TU_FROM_HANDLE(tu_image, image, image_h);19461947for (unsigned i = 0; i < rangeCount; i++) {1948const VkImageSubresourceRange *range = &pRanges[i];19491950if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1951/* can't clear both depth and stencil at once, split up the aspect mask */1952u_foreach_bit(b, range->aspectMask)1953clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));1954continue;1955}19561957clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);1958}1959}19601961static void1962tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,1963uint32_t attachment_count,1964const VkClearAttachment *attachments,1965uint32_t rect_count,1966const VkClearRect *rects)1967{1968/* the shader path here is special, it avoids changing MRT/etc state */1969const struct tu_render_pass *pass = cmd->state.pass;1970const struct tu_subpass *subpass = cmd->state.subpass;1971const uint32_t mrt_count = subpass->color_count;1972struct tu_cs *cs = &cmd->draw_cs;1973uint32_t clear_value[MAX_RTS][4];1974float z_clear_val = 0.0f;1975uint8_t s_clear_val = 0;1976uint32_t clear_rts = 0, clear_components = 0, num_rts = 0;1977bool z_clear = false;1978bool s_clear = false;1979bool layered_clear = 
false;1980uint32_t max_samples = 1;19811982for (uint32_t i = 0; i < attachment_count; i++) {1983uint32_t a;1984if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1985uint32_t c = attachments[i].colorAttachment;1986a = subpass->color_attachments[c].attachment;1987if (a == VK_ATTACHMENT_UNUSED)1988continue;19891990clear_rts |= 1 << c;1991clear_components |= 0xf << (c * 4);1992memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));1993} else {1994a = subpass->depth_stencil_attachment.attachment;1995if (a == VK_ATTACHMENT_UNUSED)1996continue;19971998if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {1999z_clear = true;2000z_clear_val = attachments[i].clearValue.depthStencil.depth;2001}20022003if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {2004s_clear = true;2005s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;2006}2007}20082009max_samples = MAX2(max_samples, pass->attachments[a].samples);2010}20112012/* disable all draw states so they don't interfere2013* TODO: use and re-use draw states2014* we have to disable draw states individually to preserve2015* input attachment states, because a secondary command buffer2016* won't be able to restore them2017*/2018tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));2019for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {2020if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||2021i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)2022continue;2023tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |2024CP_SET_DRAW_STATE__0_DISABLE);2025tu_cs_emit_qw(cs, 0);2026}2027cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;20282029tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);2030tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |2031A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |20320xfc000000);2033tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));20342035tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);2036for (uint32_t i = 0; i < mrt_count; 
i++) {2037if (clear_rts & (1 << i))2038tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));2039else2040tu_cs_emit(cs, 0);2041}20422043for (uint32_t i = 0; i < rect_count; i++) {2044if (rects[i].baseArrayLayer || rects[i].layerCount > 1)2045layered_clear = true;2046}20472048/* a630 doesn't support multiview masks, which means that we can't use the2049* normal multiview path without potentially recompiling a shader on-demand2050* or using a more complicated variant that takes the mask as a const. Just2051* use the layered path instead, since it shouldn't be much worse.2052*/2053if (subpass->multiview_mask) {2054layered_clear = true;2055}20562057r3d_common(cmd, cs, false, num_rts, layered_clear, false);20582059tu_cs_emit_regs(cs,2060A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));2061tu_cs_emit_regs(cs,2062A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));20632064tu_cs_emit_regs(cs,2065A6XX_RB_FS_OUTPUT_CNTL0(),2066A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));20672068tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());2069tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));2070for (uint32_t i = 0; i < mrt_count; i++) {2071tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,2072.component_enable = COND(clear_rts & (1 << i), 0xf)));2073}20742075tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));2076tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));20772078tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());2079tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(2080.z_enable = z_clear,2081.z_write_enable = z_clear,2082.zfunc = FUNC_ALWAYS));2083tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());2084tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(2085.stencil_enable = s_clear,2086.func = FUNC_ALWAYS,2087.zpass = STENCIL_REPLACE));2088tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));2089tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));2090tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));20912092tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * 
num_rts);2093tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |2094CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |2095CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |2096CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |2097CP_LOAD_STATE6_0_NUM_UNIT(num_rts));2098tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));2099tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));2100u_foreach_bit(b, clear_rts)2101tu_cs_emit_array(cs, clear_value[b], 4);21022103for (uint32_t i = 0; i < rect_count; i++) {2104/* This should be true because of this valid usage for2105* vkCmdClearAttachments:2106*2107* "If the render pass instance this is recorded in uses multiview,2108* then baseArrayLayer must be zero and layerCount must be one"2109*/2110assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);21112112for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {2113r3d_coords_raw(cs, (float[]) {2114rects[i].rect.offset.x, rects[i].rect.offset.y,2115z_clear_val, uif(rects[i].baseArrayLayer + layer),2116rects[i].rect.offset.x + rects[i].rect.extent.width,2117rects[i].rect.offset.y + rects[i].rect.extent.height,2118z_clear_val, 1.0f,2119});2120r3d_run(cmd, cs);2121}2122}2123}21242125static void2126pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])2127{2128switch (format) {2129case VK_FORMAT_X8_D24_UNORM_PACK32:2130case VK_FORMAT_D24_UNORM_S8_UINT:2131clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |2132val->depthStencil.stencil << 24;2133return;2134case VK_FORMAT_D16_UNORM:2135clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);2136return;2137case VK_FORMAT_D32_SFLOAT:2138clear_value[0] = fui(val->depthStencil.depth);2139return;2140case VK_FORMAT_S8_UINT:2141clear_value[0] = val->depthStencil.stencil;2142return;2143default:2144break;2145}21462147float tmp[4];2148memcpy(tmp, val->color.float32, 4 * sizeof(float));2149if (vk_format_is_srgb(format)) {2150for (int i = 0; i < 3; i++)2151tmp[i] = 
util_format_linear_to_srgb_float(tmp[i]);2152}21532154#define PACK_F(type) util_format_##type##_pack_rgba_float \2155( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)2156switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {2157case 4:2158PACK_F(r4g4b4a4_unorm);2159break;2160case 5:2161if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)2162PACK_F(r5g6b5_unorm);2163else2164PACK_F(r5g5b5a1_unorm);2165break;2166case 8:2167if (vk_format_is_snorm(format))2168PACK_F(r8g8b8a8_snorm);2169else if (vk_format_is_unorm(format))2170PACK_F(r8g8b8a8_unorm);2171else2172pack_int8(clear_value, val->color.uint32);2173break;2174case 10:2175if (vk_format_is_int(format))2176pack_int10_2(clear_value, val->color.uint32);2177else2178PACK_F(r10g10b10a2_unorm);2179break;2180case 11:2181clear_value[0] = float3_to_r11g11b10f(val->color.float32);2182break;2183case 16:2184if (vk_format_is_snorm(format))2185PACK_F(r16g16b16a16_snorm);2186else if (vk_format_is_unorm(format))2187PACK_F(r16g16b16a16_unorm);2188else if (vk_format_is_float(format))2189PACK_F(r16g16b16a16_float);2190else2191pack_int16(clear_value, val->color.uint32);2192break;2193case 32:2194memcpy(clear_value, val->color.float32, 4 * sizeof(float));2195break;2196default:2197unreachable("unexpected channel size");2198}2199#undef PACK_F2200}22012202static void2203clear_gmem_attachment(struct tu_cmd_buffer *cmd,2204struct tu_cs *cs,2205VkFormat format,2206uint8_t clear_mask,2207uint32_t gmem_offset,2208const VkClearValue *value)2209{2210tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);2211tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));22122213tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));22142215tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);2216tu_cs_emit(cs, gmem_offset);22172218tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);2219tu_cs_emit(cs, 0);22202221uint32_t clear_vals[4] = 
{};2222pack_gmem_clear_value(value, format, clear_vals);22232224tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);2225tu_cs_emit_array(cs, clear_vals, 4);22262227tu6_emit_event_write(cmd, cs, BLIT);2228}22292230static void2231tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,2232struct tu_cs *cs,2233uint32_t attachment,2234VkImageAspectFlags mask,2235const VkClearValue *value)2236{2237const struct tu_render_pass_attachment *att =2238&cmd->state.pass->attachments[attachment];22392240if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {2241if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)2242clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);2243if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)2244clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);2245return;2246}22472248clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);2249}22502251static void2252tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,2253uint32_t attachment_count,2254const VkClearAttachment *attachments,2255uint32_t rect_count,2256const VkClearRect *rects)2257{2258const struct tu_subpass *subpass = cmd->state.subpass;2259struct tu_cs *cs = &cmd->draw_cs;22602261/* TODO: swap the loops for smaller cmdstream */2262for (unsigned i = 0; i < rect_count; i++) {2263unsigned x1 = rects[i].rect.offset.x;2264unsigned y1 = rects[i].rect.offset.y;2265unsigned x2 = x1 + rects[i].rect.extent.width - 1;2266unsigned y2 = y1 + rects[i].rect.extent.height - 1;22672268tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);2269tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));2270tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));22712272for (unsigned j = 0; j < attachment_count; j++) {2273uint32_t a;2274if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)2275a = subpass->color_attachments[attachments[j].colorAttachment].attachment;2276else2277a = 
subpass->depth_stencil_attachment.attachment;22782279if (a == VK_ATTACHMENT_UNUSED)2280continue;22812282tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,2283&attachments[j].clearValue);2284}2285}2286}22872288VKAPI_ATTR void VKAPI_CALL2289tu_CmdClearAttachments(VkCommandBuffer commandBuffer,2290uint32_t attachmentCount,2291const VkClearAttachment *pAttachments,2292uint32_t rectCount,2293const VkClearRect *pRects)2294{2295TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);2296struct tu_cs *cs = &cmd->draw_cs;22972298/* sysmem path behaves like a draw, note we don't have a way of using different2299* flushes for sysmem/gmem, so this needs to be outside of the cond_exec2300*/2301tu_emit_cache_flush_renderpass(cmd, cs);23022303for (uint32_t j = 0; j < attachmentCount; j++) {2304if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)2305continue;2306cmd->state.lrz.valid = false;2307cmd->state.dirty |= TU_CMD_DIRTY_LRZ;2308}23092310/* vkCmdClearAttachments is supposed to respect the predicate if active.2311* The easiest way to do this is to always use the 3d path, which always2312* works even with GMEM because it's just a simple draw using the existing2313* attachment state. 
However it seems that IGNORE_VISIBILITY draws must be2314* skipped in the binning pass, since otherwise they produce binning data2315* which isn't consumed and leads to the wrong binning data being read, so2316* condition on GMEM | SYSMEM.2317*/2318if (cmd->state.predication_active) {2319tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |2320CP_COND_EXEC_0_RENDER_MODE_SYSMEM);2321tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);2322tu_cond_exec_end(cs);2323return;2324}23252326tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);2327tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);2328tu_cond_exec_end(cs);23292330tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);2331tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);2332tu_cond_exec_end(cs);2333}23342335static void2336clear_sysmem_attachment(struct tu_cmd_buffer *cmd,2337struct tu_cs *cs,2338VkFormat format,2339VkImageAspectFlags clear_mask,2340const VkRenderPassBeginInfo *info,2341uint32_t a,2342bool separate_stencil)2343{2344const struct tu_framebuffer *fb = cmd->state.framebuffer;2345const struct tu_image_view *iview = fb->attachments[a].attachment;2346const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;2347const struct blit_ops *ops = &r2d_ops;2348if (cmd->state.pass->attachments[a].samples > 1)2349ops = &r3d_ops;23502351ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled);2352ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);2353ops->clear_value(cs, format, &info->pClearValues[a]);23542355for_each_layer(i, clear_views, fb->layers) {2356if (separate_stencil) {2357if (ops == &r3d_ops)2358r3d_dst_stencil(cs, iview, i);2359else2360r2d_dst_stencil(cs, iview, i);2361} else {2362ops->dst(cs, iview, i);2363}2364ops->run(cmd, cs);2365}23662367ops->teardown(cmd, cs);2368}23692370void2371tu_clear_sysmem_attachment(struct tu_cmd_buffer 
*cmd,2372struct tu_cs *cs,2373uint32_t a,2374const VkRenderPassBeginInfo *info)2375{2376const struct tu_render_pass_attachment *attachment =2377&cmd->state.pass->attachments[a];23782379if (!attachment->clear_mask)2380return;23812382/* Wait for any flushes at the beginning of the renderpass to complete */2383tu_cs_emit_wfi(cs);23842385if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {2386if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {2387clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,2388info, a, false);2389}2390if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {2391clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,2392info, a, true);2393}2394} else {2395clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,2396info, a, false);2397}23982399/* The spec doesn't explicitly say, but presumably the initial renderpass2400* clear is considered part of the renderpass, and therefore barriers2401* aren't required inside the subpass/renderpass. Therefore we need to2402* flush CCU color into CCU depth here, just like with2403* vkCmdClearAttachments(). 
Note that because this only happens at the2404* beginning of a renderpass, and renderpass writes are considered2405* "incoherent", we shouldn't have to worry about syncing depth into color2406* beforehand as depth should already be flushed.2407*/2408if (vk_format_is_depth_or_stencil(attachment->format)) {2409tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);2410tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);2411} else {2412tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);2413tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);2414}2415}24162417void2418tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,2419struct tu_cs *cs,2420uint32_t a,2421const VkRenderPassBeginInfo *info)2422{2423const struct tu_render_pass_attachment *attachment =2424&cmd->state.pass->attachments[a];24252426if (!attachment->clear_mask)2427return;24282429tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));24302431tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,2432&info->pClearValues[a]);2433}24342435static void2436tu_emit_blit(struct tu_cmd_buffer *cmd,2437struct tu_cs *cs,2438const struct tu_image_view *iview,2439const struct tu_render_pass_attachment *attachment,2440bool resolve,2441bool separate_stencil)2442{2443tu_cs_emit_regs(cs,2444A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));24452446tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(2447.unk0 = !resolve,2448.gmem = !resolve,2449.sample_0 = vk_format_is_int(attachment->format) |2450vk_format_is_depth_or_stencil(attachment->format)));24512452tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);2453if (separate_stencil) {2454tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);2455tu_cs_emit_qw(cs, iview->stencil_base_addr);2456tu_cs_emit(cs, iview->stencil_PITCH);24572458tu_cs_emit_regs(cs,2459A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));2460} else {2461tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);2462tu_cs_image_ref_2d(cs, iview, 0, 
false);24632464tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);2465tu_cs_image_flag_ref(cs, iview, 0);24662467tu_cs_emit_regs(cs,2468A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));2469}24702471tu6_emit_event_write(cmd, cs, BLIT);2472}24732474static bool2475blit_can_resolve(VkFormat format)2476{2477const struct util_format_description *desc = vk_format_description(format);24782479/* blit event can only do resolve for simple cases:2480* averaging samples as unsigned integers or choosing only one sample2481*/2482if (vk_format_is_snorm(format) || vk_format_is_srgb(format))2483return false;24842485/* can't do formats with larger channel sizes2486* note: this includes all float formats2487* note2: single channel integer formats seem OK2488*/2489if (desc->channel[0].size > 10)2490return false;24912492switch (format) {2493/* for unknown reasons blit event can't msaa resolve these formats when tiled2494* likely related to these formats having different layout from other cpp=2 formats2495*/2496case VK_FORMAT_R8G8_UNORM:2497case VK_FORMAT_R8G8_UINT:2498case VK_FORMAT_R8G8_SINT:2499/* TODO: this one should be able to work? 
*/2500case VK_FORMAT_D24_UNORM_S8_UINT:2501return false;2502default:2503break;2504}25052506return true;2507}25082509void2510tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,2511struct tu_cs *cs,2512uint32_t a,2513bool force_load)2514{2515const struct tu_image_view *iview =2516cmd->state.framebuffer->attachments[a].attachment;2517const struct tu_render_pass_attachment *attachment =2518&cmd->state.pass->attachments[a];25192520if (attachment->load || force_load)2521tu_emit_blit(cmd, cs, iview, attachment, false, false);25222523if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))2524tu_emit_blit(cmd, cs, iview, attachment, false, true);2525}25262527static void2528store_cp_blit(struct tu_cmd_buffer *cmd,2529struct tu_cs *cs,2530struct tu_image_view *iview,2531uint32_t samples,2532bool separate_stencil,2533VkFormat format,2534uint32_t gmem_offset,2535uint32_t cpp)2536{2537r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,2538iview->ubwc_enabled, true);2539if (separate_stencil)2540r2d_dst_stencil(cs, iview, 0);2541else2542r2d_dst(cs, iview, 0);25432544tu_cs_emit_regs(cs,2545A6XX_SP_PS_2D_SRC_INFO(2546.color_format = tu6_format_texture(format, TILE6_2).fmt,2547.tile_mode = TILE6_2,2548.srgb = vk_format_is_srgb(format),2549.samples = tu_msaa_samples(samples),2550.samples_average = !vk_format_is_int(format) &&2551!vk_format_is_depth_or_stencil(format),2552.unk20 = 1,2553.unk22 = 1),2554/* note: src size does not matter when not scaling */2555A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),2556A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),2557A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));25582559/* sync GMEM writes with CACHE. 
*/2560tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);25612562/* Wait for CACHE_INVALIDATE to land */2563tu_cs_emit_wfi(cs);25642565tu_cs_emit_pkt7(cs, CP_BLIT, 1);2566tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));25672568/* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to2569* sysmem, and we generally assume that GMEM renderpasses leave their2570* results in sysmem, so we need to flush manually here.2571*/2572tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);2573}25742575void2576tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,2577struct tu_cs *cs,2578uint32_t a,2579uint32_t gmem_a)2580{2581struct tu_physical_device *phys_dev = cmd->device->physical_device;2582const VkRect2D *render_area = &cmd->state.render_area;2583struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];2584struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;2585struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];25862587if (!dst->store && !dst->store_stencil)2588return;25892590uint32_t x1 = render_area->offset.x;2591uint32_t y1 = render_area->offset.y;2592uint32_t x2 = x1 + render_area->extent.width;2593uint32_t y2 = y1 + render_area->extent.height;2594/* x2/y2 can be unaligned if equal to the size of the image,2595* since it will write into padding space2596* the one exception is linear levels which don't have the2597* required y padding in the layout (except for the last level)2598*/2599bool need_y2_align =2600y2 != iview->extent.height || iview->need_y2_align;26012602bool unaligned =2603x1 % phys_dev->info->gmem_align_w ||2604(x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||2605y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);26062607/* D32_SFLOAT_S8_UINT is quite special format: it has two planes,2608* one for depth and other for stencil. 
When resolving a MSAA2609* D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.2610*/2611bool resolve_d32s8_s8 =2612src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&2613dst->format == VK_FORMAT_S8_UINT;26142615/* use fast path when render area is aligned, except for unsupported resolve cases */2616if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {2617if (dst->store)2618tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);2619if (dst->store_stencil)2620tu_emit_blit(cmd, cs, iview, src, true, true);2621return;2622}26232624if (dst->samples > 1) {2625/* I guess we need to use shader path in this case?2626* need a testcase which fails because of this2627*/2628tu_finishme("unaligned store of msaa attachment\n");2629return;2630}26312632r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);26332634VkFormat format = src->format;2635if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)2636format = VK_FORMAT_D32_SFLOAT;26372638if (dst->store) {2639store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,2640src->gmem_offset, src->cpp);2641}2642if (dst->store_stencil) {2643store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,2644src->gmem_offset_stencil, src->samples);2645}2646}264726482649