Path: blob/master/Common/GPU/Vulkan/VulkanQueueRunner.cpp
5654 views
#include <unordered_map>12#include "Common/GPU/DataFormat.h"3#include "Common/GPU/Vulkan/VulkanQueueRunner.h"4#include "Common/GPU/Vulkan/VulkanRenderManager.h"5#include "Common/Log.h"6#include "Common/TimeUtil.h"78using namespace PPSSPP_VK;910// Debug help: adb logcat -s DEBUG AndroidRuntime PPSSPPNativeActivity PPSSPP NativeGLView NativeRenderer NativeSurfaceView PowerSaveModeReceiver InputDeviceState PpssppActivity CameraHelper1112static void MergeRenderAreaRectInto(VkRect2D *dest, const VkRect2D &src) {13if (dest->offset.x > src.offset.x) {14dest->extent.width += (dest->offset.x - src.offset.x);15dest->offset.x = src.offset.x;16}17if (dest->offset.y > src.offset.y) {18dest->extent.height += (dest->offset.y - src.offset.y);19dest->offset.y = src.offset.y;20}21if (dest->offset.x + dest->extent.width < src.offset.x + src.extent.width) {22dest->extent.width = src.offset.x + src.extent.width - dest->offset.x;23}24if (dest->offset.y + dest->extent.height < src.offset.y + src.extent.height) {25dest->extent.height = src.offset.y + src.extent.height - dest->offset.y;26}27}2829// We need to take the "max" of the features used in the two render passes.30RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {31// Either both are backbuffer type, or neither are.32// These can't merge with other renderpasses33if (a == RenderPassType::BACKBUFFER || b == RenderPassType::BACKBUFFER) {34_dbg_assert_(a == b);35return a;36}3738_dbg_assert_((a & RenderPassType::MULTIVIEW) == (b & RenderPassType::MULTIVIEW));3940// The rest we can just OR together to get the maximum feature set.41return (RenderPassType)((u32)a | (u32)b);42}4344void VulkanQueueRunner::CreateDeviceObjects() {45INFO_LOG(Log::G3D, "VulkanQueueRunner::CreateDeviceObjects");4647RPKey key{48VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,49VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, 
VKRRenderPassStoreAction::DONT_CARE,50};51compatibleRenderPass_ = GetRenderPass(key);5253#if 054// Just to check whether it makes sense to split some of these. drawidx is way bigger than the others...55// We should probably just move to variable-size data in a raw buffer anyway...56VkRenderData rd;57INFO_LOG(Log::G3D, "sizeof(pipeline): %d", (int)sizeof(rd.pipeline));58INFO_LOG(Log::G3D, "sizeof(draw): %d", (int)sizeof(rd.draw));59INFO_LOG(Log::G3D, "sizeof(drawidx): %d", (int)sizeof(rd.drawIndexed));60INFO_LOG(Log::G3D, "sizeof(clear): %d", (int)sizeof(rd.clear));61INFO_LOG(Log::G3D, "sizeof(viewport): %d", (int)sizeof(rd.viewport));62INFO_LOG(Log::G3D, "sizeof(scissor): %d", (int)sizeof(rd.scissor));63INFO_LOG(Log::G3D, "sizeof(blendColor): %d", (int)sizeof(rd.blendColor));64INFO_LOG(Log::G3D, "sizeof(push): %d", (int)sizeof(rd.push));65#endif66}6768void VulkanQueueRunner::DestroyDeviceObjects() {69INFO_LOG(Log::G3D, "VulkanQueueRunner::DestroyDeviceObjects");7071syncReadback_.Destroy(vulkan_);7273renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {74_dbg_assert_(rp);75rp->Destroy(vulkan_);76delete rp;77});78renderPasses_.Clear();79}8081bool VulkanQueueRunner::InitBackbufferFramebuffers(int width, int height, FrameDataShared &frameDataShared) {82VkResult res;83// We share the same depth buffer but have multiple color buffers, see the loop below.84VkImageView attachments[2] = { VK_NULL_HANDLE, depth_.view };8586VkFramebufferCreateInfo fb_info = { VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };87fb_info.renderPass = GetCompatibleRenderPass()->Get(vulkan_, RenderPassType::BACKBUFFER, VK_SAMPLE_COUNT_1_BIT);88fb_info.attachmentCount = 2;89fb_info.pAttachments = attachments;90fb_info.width = width;91fb_info.height = height;92fb_info.layers = 1;9394framebuffers_.resize(frameDataShared.swapchainImageCount_);9596for (uint32_t i = 0; i < frameDataShared.swapchainImageCount_; i++) {97attachments[0] = frameDataShared.swapchainImages_[i].view;98res = 
vkCreateFramebuffer(vulkan_->GetDevice(), &fb_info, nullptr, &framebuffers_[i]);99_dbg_assert_(res == VK_SUCCESS);100if (res != VK_SUCCESS) {101framebuffers_.clear();102return false;103}104}105106return true;107}108109bool VulkanQueueRunner::InitDepthStencilBuffer(VkCommandBuffer cmd, VulkanBarrierBatch *barriers) {110const VkFormat depth_format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat;111int aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;112VkImageCreateInfo image_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };113image_info.imageType = VK_IMAGE_TYPE_2D;114image_info.format = depth_format;115image_info.extent.width = vulkan_->GetBackbufferWidth();116image_info.extent.height = vulkan_->GetBackbufferHeight();117image_info.extent.depth = 1;118image_info.mipLevels = 1;119image_info.arrayLayers = 1;120image_info.samples = VK_SAMPLE_COUNT_1_BIT;121image_info.queueFamilyIndexCount = 0;122image_info.pQueueFamilyIndices = nullptr;123image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;124image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;125image_info.flags = 0;126127depth_.format = depth_format;128129VmaAllocationCreateInfo allocCreateInfo{};130VmaAllocationInfo allocInfo{};131132allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;133134VkResult res = vmaCreateImage(vulkan_->Allocator(), &image_info, &allocCreateInfo, &depth_.image, &depth_.alloc, &allocInfo);135_dbg_assert_(res == VK_SUCCESS);136if (res != VK_SUCCESS)137return false;138139vulkan_->SetDebugName(depth_.image, VK_OBJECT_TYPE_IMAGE, "BackbufferDepth");140141VkImageMemoryBarrier *barrier = barriers->Add(depth_.image,142VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,143VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, 0);144barrier->subresourceRange.aspectMask = aspectMask;145barrier->oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;146barrier->newLayout = 
VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;147barrier->srcAccessMask = 0;148barrier->dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;149150VkImageViewCreateInfo depth_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };151depth_view_info.image = depth_.image;152depth_view_info.format = depth_format;153depth_view_info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;154depth_view_info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;155depth_view_info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;156depth_view_info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;157depth_view_info.subresourceRange.aspectMask = aspectMask;158depth_view_info.subresourceRange.baseMipLevel = 0;159depth_view_info.subresourceRange.levelCount = 1;160depth_view_info.subresourceRange.baseArrayLayer = 0;161depth_view_info.subresourceRange.layerCount = 1;162depth_view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;163depth_view_info.flags = 0;164165VkDevice device = vulkan_->GetDevice();166167res = vkCreateImageView(device, &depth_view_info, NULL, &depth_.view);168vulkan_->SetDebugName(depth_.view, VK_OBJECT_TYPE_IMAGE_VIEW, "depth_stencil_backbuffer");169_dbg_assert_(res == VK_SUCCESS);170if (res != VK_SUCCESS)171return false;172173return true;174}175176void VulkanQueueRunner::DestroyBackBuffers() {177if (depth_.view) {178vulkan_->Delete().QueueDeleteImageView(depth_.view);179}180if (depth_.image) {181_dbg_assert_(depth_.alloc);182vulkan_->Delete().QueueDeleteImageAllocation(depth_.image, depth_.alloc);183}184depth_ = {};185for (uint32_t i = 0; i < framebuffers_.size(); i++) {186_dbg_assert_(framebuffers_[i] != VK_NULL_HANDLE);187vulkan_->Delete().QueueDeleteFramebuffer(framebuffers_[i]);188}189framebuffers_.clear();190191INFO_LOG(Log::G3D, "Backbuffers destroyed");192}193194// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827195// Also see 
https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies196VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {197VKRRenderPass *foundPass;198if (renderPasses_.Get(key, &foundPass)) {199return foundPass;200}201202VKRRenderPass *pass = new VKRRenderPass(key);203renderPasses_.Insert(key, pass);204return pass;205}206207void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {208// Optimizes renderpasses, then sequences them.209// Planned optimizations:210// * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes211// as early as possible in the frame (Wipeout billboards). This will require taking over more of descriptor management so we can212// substitute descriptors, alternatively using texture array layers creatively.213214for (int j = 0; j < (int)steps.size(); j++) {215if (steps[j]->stepType == VKRStepType::RENDER &&216steps[j]->render.framebuffer) {217if (steps[j]->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {218steps[j]->render.finalColorLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;219}220if (steps[j]->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {221steps[j]->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;222}223}224}225226for (int j = 0; j < (int)steps.size() - 1; j++) {227// Push down empty "Clear/Store" renderpasses, and merge them with the first "Load/Store" to the same framebuffer.228if (steps.size() > 1 && steps[j]->stepType == VKRStepType::RENDER &&229steps[j]->render.numDraws == 0 &&230steps[j]->render.numReads == 0 &&231steps[j]->render.colorLoad == VKRRenderPassLoadAction::CLEAR &&232steps[j]->render.stencilLoad == VKRRenderPassLoadAction::CLEAR &&233steps[j]->render.depthLoad == VKRRenderPassLoadAction::CLEAR) {234235// Drop the clear step, and merge it into the next step that touches the same 
framebuffer.236for (int i = j + 1; i < (int)steps.size(); i++) {237if (steps[i]->stepType == VKRStepType::RENDER &&238steps[i]->render.framebuffer == steps[j]->render.framebuffer) {239if (steps[i]->render.colorLoad != VKRRenderPassLoadAction::CLEAR) {240steps[i]->render.colorLoad = VKRRenderPassLoadAction::CLEAR;241steps[i]->render.clearColor = steps[j]->render.clearColor;242}243if (steps[i]->render.depthLoad != VKRRenderPassLoadAction::CLEAR) {244steps[i]->render.depthLoad = VKRRenderPassLoadAction::CLEAR;245steps[i]->render.clearDepth = steps[j]->render.clearDepth;246}247if (steps[i]->render.stencilLoad != VKRRenderPassLoadAction::CLEAR) {248steps[i]->render.stencilLoad = VKRRenderPassLoadAction::CLEAR;249steps[i]->render.clearStencil = steps[j]->render.clearStencil;250}251MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);252steps[i]->render.renderPassType = MergeRPTypes(steps[i]->render.renderPassType, steps[j]->render.renderPassType);253steps[i]->render.numDraws += steps[j]->render.numDraws;254steps[i]->render.numReads += steps[j]->render.numReads;255// Cheaply skip the first step.256steps[j]->stepType = VKRStepType::RENDER_SKIP;257break;258} else if (steps[i]->stepType == VKRStepType::COPY &&259steps[i]->copy.src == steps[j]->render.framebuffer) {260// Can't eliminate the clear if a game copies from it before it's261// rendered to. 
However this should be rare.262// TODO: This should never happen when we check numReads now.263break;264}265}266}267}268269// Queue hacks.270if (hacksEnabled_) {271if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) {272// Massive speedup due to re-ordering.273ApplyMGSHack(steps);274}275if (hacksEnabled_ & QUEUE_HACK_SONIC) {276ApplySonicHack(steps);277}278if (hacksEnabled_ & QUEUE_HACK_RENDERPASS_MERGE) {279ApplyRenderPassMerge(steps);280}281}282}283284void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps) {285QueueProfileContext *profile = frameData.profile.enabled ? &frameData.profile : nullptr;286287if (profile)288profile->cpuStartTime = time_now_d();289290bool emitLabels = vulkan_->Extensions().EXT_debug_utils;291292VkCommandBuffer cmd = frameData.hasPresentCommands ? frameData.presentCmd : frameData.mainCmd;293294for (size_t i = 0; i < steps.size(); i++) {295const VKRStep &step = *steps[i];296if (emitLabels) {297VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };298char temp[128];299if (step.stepType == VKRStepType::RENDER && step.render.framebuffer) {300snprintf(temp, sizeof(temp), "%s: %s", step.tag, step.render.framebuffer->Tag());301labelInfo.pLabelName = temp;302} else {303labelInfo.pLabelName = step.tag;304}305vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);306}307308switch (step.stepType) {309case VKRStepType::RENDER:310{311bool perform = true;312if (!step.render.framebuffer) {313if (emitLabels) {314vkCmdEndDebugUtilsLabelEXT(cmd);315}316frameData.Submit(vulkan_, FrameSubmitType::Pending, frameDataShared);317318// If the window is minimized and we don't have a swap chain, don't bother.319if (frameDataShared.swapchainImageCount_ > 0) {320// When stepping in the GE debugger, we can end up here multiple times in a "frame".321// So only acquire once.322if (!frameData.hasAcquired) {323frameData.AcquireNextImage(vulkan_);324if (frameData.hasAcquired && 
frameData.curSwapchainImage != (uint32_t)-1) {325SetBackbuffer(framebuffers_[frameData.curSwapchainImage], frameDataShared.swapchainImages_[frameData.curSwapchainImage].image);326}327}328329if (!frameData.hasPresentCommands) {330// A RENDER step rendering to the backbuffer is normally the last step that happens in a frame,331// unless taking a screenshot, in which case there might be a READBACK_IMAGE after it.332// This is why we have to switch cmd to presentCmd, in this case.333VkCommandBufferBeginInfo begin{VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};334begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;335vkBeginCommandBuffer(frameData.presentCmd, &begin);336frameData.hasPresentCommands = true;337}338cmd = frameData.presentCmd;339if (emitLabels) {340VkDebugUtilsLabelEXT labelInfo{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT};341labelInfo.pLabelName = "present";342vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);343}344} else {345perform = false;346}347}348if (perform) {349PerformRenderPass(step, cmd, curFrame, frameData.profile);350} else {351frameData.skipSwap = true;352}353break;354}355case VKRStepType::COPY:356PerformCopy(step, cmd);357break;358case VKRStepType::BLIT:359PerformBlit(step, cmd);360break;361case VKRStepType::READBACK:362PerformReadback(step, cmd, frameData);363break;364case VKRStepType::READBACK_IMAGE:365PerformReadbackImage(step, cmd);366break;367case VKRStepType::RENDER_SKIP:368break;369default:370UNREACHABLE();371break;372}373374if (profile && profile->timestampsEnabled && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {375vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());376profile->timestampDescriptions.push_back(StepToString(vulkan_, step));377}378379if (emitLabels) {380vkCmdEndDebugUtilsLabelEXT(cmd);381}382}383384// Deleting all in one go should be easier on the instruction cache than deleting385// them as we go - and easier to debug 
because we can look backwards in the frame.386if (!keepSteps) {387for (auto step : steps) {388delete step;389}390steps.clear();391}392393if (profile)394profile->cpuEndTime = time_now_d();395}396397void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {398// Really need a sane way to express transforms of steps.399400// We want to turn a sequence of copy,render(1),copy,render(1),copy,render(1) to copy,copy,copy,render(n).401402// TODO: Where does this first part trigger? The below depal part triggers reliably in Acid2.403404for (int i = 0; i < (int)steps.size() - 3; i++) {405int last = -1;406if (!(steps[i]->stepType == VKRStepType::COPY &&407steps[i + 1]->stepType == VKRStepType::RENDER &&408steps[i + 2]->stepType == VKRStepType::COPY &&409steps[i + 1]->render.numDraws == 1 &&410steps[i]->copy.dst == steps[i + 2]->copy.dst))411continue;412// Looks promising! Let's start by finding the last one.413for (int j = i; j < (int)steps.size(); j++) {414switch (steps[j]->stepType) {415case VKRStepType::RENDER:416if (steps[j]->render.numDraws > 1)417last = j - 1;418// should really also check descriptor sets...419if (steps[j]->commands.size()) {420const VkRenderData &cmd = steps[j]->commands.back();421if (cmd.cmd == VKRRenderCommand::DRAW_INDEXED && cmd.draw.count != 6)422last = j - 1;423}424break;425case VKRStepType::COPY:426if (steps[j]->copy.dst != steps[i]->copy.dst)427last = j - 1;428break;429default:430break;431}432if (last != -1)433break;434}435436if (last != -1) {437// We've got a sequence from i to last that needs reordering.438// First, let's sort it, keeping the same length.439std::vector<VKRStep *> copies;440std::vector<VKRStep *> renders;441copies.reserve((last - i) / 2);442renders.reserve((last - i) / 2);443for (int n = i; n <= last; n++) {444if (steps[n]->stepType == VKRStepType::COPY)445copies.push_back(steps[n]);446else if (steps[n]->stepType == VKRStepType::RENDER)447renders.push_back(steps[n]);448}449// Write the copies back. 
TODO: Combine them too.450for (int j = 0; j < (int)copies.size(); j++) {451steps[i + j] = copies[j];452}453454const int firstRender = i + (int)copies.size();455456// Write the renders back (so they will be deleted properly).457for (int j = 0; j < (int)renders.size(); j++) {458steps[firstRender + j] = renders[j];459}460_assert_(steps[firstRender]->stepType == VKRStepType::RENDER);461// Combine the renders.462for (int j = 1; j < (int)renders.size(); j++) {463steps[firstRender]->commands.reserve(renders[j]->commands.size());464for (int k = 0; k < (int)renders[j]->commands.size(); k++) {465steps[firstRender]->commands.push_back(renders[j]->commands[k]);466}467MergeRenderAreaRectInto(&steps[firstRender]->render.renderArea, renders[j]->render.renderArea);468// Easier than removing them from the list, though that might be the better option.469steps[firstRender + j]->stepType = VKRStepType::RENDER_SKIP;470steps[firstRender + j]->commands.clear();471}472// We're done.473// INFO_LOG(Log::G3D, "MGS HACK part 1: copies: %d renders: %d", (int)copies.size(), (int)renders.size());474break;475}476}477478// There's also a post processing effect using depals that's just brutal in some parts479// of the game.480for (int i = 0; i < (int)steps.size() - 3; i++) {481int last = -1;482if (!(steps[i]->stepType == VKRStepType::RENDER &&483steps[i + 1]->stepType == VKRStepType::RENDER &&484steps[i + 2]->stepType == VKRStepType::RENDER &&485steps[i]->render.numDraws == 1 &&486steps[i + 1]->render.numDraws == 1 &&487steps[i + 2]->render.numDraws == 1 &&488steps[i]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE &&489steps[i + 1]->render.colorLoad == VKRRenderPassLoadAction::KEEP &&490steps[i + 2]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE)) {491continue;492}493VKRFramebuffer *depalFramebuffer = steps[i]->render.framebuffer;494VKRFramebuffer *targetFramebuffer = steps[i + 1]->render.framebuffer;495// OK, found the start of a post-process sequence. 
Let's scan until we find the end.496for (int j = i; j < (int)steps.size() - 3; j++) {497if (((j - i) & 1) == 0) {498// This should be a depal draw.499if (steps[j]->render.numDraws != 1)500break;501if (steps[j]->commands.size() > 5) // TODO: Not the greatest heuristic! This may change if we merge commands.502break;503if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::DONT_CARE)504break;505if (steps[j]->render.framebuffer != depalFramebuffer)506break;507last = j;508} else {509// This should be a target draw.510if (steps[j]->render.numDraws != 1)511break;512if (steps[j]->commands.size() > 5) // TODO: Not the greatest heuristic! This may change if we merge commands.513break;514if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::KEEP)515break;516if (steps[j]->render.framebuffer != targetFramebuffer)517break;518last = j;519}520}521522if (last == -1)523continue;524525if (last > 479) {526// Avoid some problems with the hack (oil slick crash). Some additional commands get added there that527// confuses this merging. NOTE: This is not really a solution! See #20306.528last = 479;529}530531int minScissorX = 10000;532int minScissorY = 10000;533int maxScissorX = 0;534int maxScissorY = 0;535536// Combine the depal renders. 
Also record scissor bounds.537for (int j = i + 2; j <= last + 1; j += 2) {538for (int k = 0; k < (int)steps[j]->commands.size(); k++) {539switch (steps[j]->commands[k].cmd) {540case VKRRenderCommand::DRAW:541case VKRRenderCommand::DRAW_INDEXED:542steps[i]->commands.push_back(steps[j]->commands[k]);543break;544case VKRRenderCommand::SCISSOR:545{546// TODO: Merge scissor rectangles.547const auto &rc = steps[j]->commands[k].scissor.scissor;548if (rc.offset.x < minScissorX) {549minScissorX = rc.offset.x;550}551if (rc.offset.y < minScissorY) {552minScissorY = rc.offset.y;553}554if (rc.offset.x + (int)rc.extent.width > maxScissorX) {555maxScissorX = rc.offset.x + rc.extent.width;556}557if (rc.offset.y + (int)rc.extent.height > maxScissorY) {558maxScissorY = rc.offset.y + rc.extent.height;559}560break;561}562default:563break;564}565}566MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);567steps[j]->stepType = VKRStepType::RENDER_SKIP;568}569570// Update the scissor in the first draw.571minScissorX = std::max(0, minScissorX);572minScissorY = std::max(0, minScissorY);573if (maxScissorX > minScissorX && maxScissorY > minScissorY) {574for (int k = 0; k < steps[i]->commands.size(); k++) {575if (steps[i]->commands[k].cmd == VKRRenderCommand::SCISSOR) {576auto &rc = steps[i]->commands[k].scissor.scissor;577rc.offset.x = minScissorX;578rc.offset.y = minScissorY;579rc.extent.width = maxScissorX - minScissorX;580rc.extent.height = maxScissorY - minScissorY;581break;582}583}584}585586// Combine the target renders.587for (int j = i + 3; j <= last; j += 2) {588for (int k = 0; k < (int)steps[j]->commands.size(); k++) {589switch (steps[j]->commands[k].cmd) {590case VKRRenderCommand::DRAW:591case VKRRenderCommand::DRAW_INDEXED:592steps[i + 1]->commands.push_back(steps[j]->commands[k]);593break;594default:595break;596}597}598MergeRenderAreaRectInto(&steps[i + 1]->render.renderArea, steps[j]->render.renderArea);599steps[j]->stepType = 
VKRStepType::RENDER_SKIP;600}601602// INFO_LOG(Log::G3D, "MGS HACK part 2: %d-%d : %d (total steps: %d)", i, last, (last - i), (int)steps.size());603604// We're done - we only expect one of these sequences per frame.605break;606}607}608609void VulkanQueueRunner::ApplySonicHack(std::vector<VKRStep *> &steps) {610// We want to turn a sequence of render(3),render(1),render(6),render(1),render(6),render(1),render(3) to611// render(1), render(1), render(1), render(6), render(6), render(6)612613for (int i = 0; i < (int)steps.size() - 4; i++) {614int last = -1;615if (!(steps[i]->stepType == VKRStepType::RENDER &&616steps[i + 1]->stepType == VKRStepType::RENDER &&617steps[i + 2]->stepType == VKRStepType::RENDER &&618steps[i + 3]->stepType == VKRStepType::RENDER &&619steps[i]->render.numDraws == 3 &&620steps[i + 1]->render.numDraws == 1 &&621steps[i + 2]->render.numDraws == 6 &&622steps[i + 3]->render.numDraws == 1 &&623steps[i]->render.framebuffer == steps[i + 2]->render.framebuffer &&624steps[i + 1]->render.framebuffer == steps[i + 3]->render.framebuffer))625continue;626// Looks promising! 
Let's start by finding the last one.627for (int j = i; j < (int)steps.size(); j++) {628switch (steps[j]->stepType) {629case VKRStepType::RENDER:630if ((j - i) & 1) {631if (steps[j]->render.framebuffer != steps[i + 1]->render.framebuffer)632last = j - 1;633if (steps[j]->render.numDraws != 1)634last = j - 1;635} else {636if (steps[j]->render.framebuffer != steps[i]->render.framebuffer)637last = j - 1;638if (steps[j]->render.numDraws != 3 && steps[j]->render.numDraws != 6)639last = j - 1;640}641break;642default:643break;644}645if (last != -1)646break;647}648649if (last != -1) {650// We've got a sequence from i to last that needs reordering.651// First, let's sort it, keeping the same length.652std::vector<VKRStep *> type1;653std::vector<VKRStep *> type2;654type1.reserve((last - i) / 2);655type2.reserve((last - i) / 2);656for (int n = i; n <= last; n++) {657if (steps[n]->render.framebuffer == steps[i]->render.framebuffer)658type1.push_back(steps[n]);659else660type2.push_back(steps[n]);661}662663// Write the renders back in order. 
Same amount, so deletion will work fine.664for (int j = 0; j < (int)type1.size(); j++) {665steps[i + j] = type1[j];666}667for (int j = 0; j < (int)type2.size(); j++) {668steps[i + j + type1.size()] = type2[j];669}670671// Combine the renders.672for (int j = 1; j < (int)type1.size(); j++) {673for (int k = 0; k < (int)type1[j]->commands.size(); k++) {674steps[i]->commands.push_back(type1[j]->commands[k]);675}676steps[i + j]->stepType = VKRStepType::RENDER_SKIP;677}678for (int j = 1; j < (int)type2.size(); j++) {679for (int k = 0; k < (int)type2[j]->commands.size(); k++) {680steps[i + type1.size()]->commands.push_back(type2[j]->commands[k]);681}682// Technically, should merge render area here, but they're all the same so not needed.683steps[i + type1.size() + j]->stepType = VKRStepType::RENDER_SKIP;684}685// We're done.686break;687}688}689}690691const char *AspectToString(VkImageAspectFlags aspect) {692switch (aspect) {693case VK_IMAGE_ASPECT_COLOR_BIT: return "COLOR";694case VK_IMAGE_ASPECT_DEPTH_BIT: return "DEPTH";695case VK_IMAGE_ASPECT_STENCIL_BIT: return "STENCIL";696case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: return "DEPTHSTENCIL";697default: return "UNUSUAL";698}699}700701std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {702char buffer[256];703switch (step.stepType) {704case VKRStepType::RENDER:705{706int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan->GetBackbufferWidth();707int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();708int actual_w = step.render.renderArea.extent.width;709int actual_h = step.render.renderArea.extent.height;710const char *renderCmd = GetRPTypeName(step.render.renderPassType);711snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? 
step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);712break;713}714case VKRStepType::COPY:715snprintf(buffer, sizeof(buffer), "COPY '%s' %s -> %s (%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.copy.srcRect.extent.width, step.copy.srcRect.extent.height, AspectToString(step.copy.aspectMask));716break;717case VKRStepType::BLIT:718snprintf(buffer, sizeof(buffer), "BLIT '%s' %s -> %s (%dx%d->%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.blit.srcRect.extent.width, step.blit.srcRect.extent.height, step.blit.dstRect.extent.width, step.blit.dstRect.extent.height, AspectToString(step.blit.aspectMask));719break;720case VKRStepType::READBACK:721snprintf(buffer, sizeof(buffer), "READBACK '%s' %s (%dx%d, %s)", step.tag, step.readback.src ? step.readback.src->Tag() : "(backbuffer)", step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, AspectToString(step.readback.aspectMask));722break;723case VKRStepType::READBACK_IMAGE:724snprintf(buffer, sizeof(buffer), "READBACK_IMAGE '%s' (%dx%d)", step.tag, step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height);725break;726case VKRStepType::RENDER_SKIP:727snprintf(buffer, sizeof(buffer), "(RENDER_SKIP) %s", step.tag);728break;729default:730buffer[0] = 0;731break;732}733return std::string(buffer);734}735736// Ideally, this should be cheap enough to be applied to all games. At least on mobile, it's pretty737// much a guaranteed neutral or win in terms of GPU power. However, dependency calculation really738// must be perfect!739void VulkanQueueRunner::ApplyRenderPassMerge(std::vector<VKRStep *> &steps) {740// First let's count how many times each framebuffer is rendered to.741// If it's more than one, let's do our best to merge them. 
This can help God of War quite a bit.742std::unordered_map<VKRFramebuffer *, int> counts;743for (int i = 0; i < (int)steps.size(); i++) {744if (steps[i]->stepType == VKRStepType::RENDER) {745counts[steps[i]->render.framebuffer]++;746}747}748749auto mergeRenderSteps = [](VKRStep *dst, VKRStep *src) {750// OK. Now, if it's a render, slurp up all the commands and kill the step.751// Also slurp up any pretransitions.752dst->preTransitions.append(src->preTransitions);753dst->commands.insert(dst->commands.end(), src->commands.begin(), src->commands.end());754MergeRenderAreaRectInto(&dst->render.renderArea, src->render.renderArea);755// So we don't consider it for other things, maybe doesn't matter.756src->dependencies.clear();757src->stepType = VKRStepType::RENDER_SKIP;758dst->render.numDraws += src->render.numDraws;759dst->render.numReads += src->render.numReads;760dst->render.pipelineFlags |= src->render.pipelineFlags;761dst->render.renderPassType = MergeRPTypes(dst->render.renderPassType, src->render.renderPassType);762};763auto renderHasClear = [](const VKRStep *step) {764const auto &r = step->render;765return r.colorLoad == VKRRenderPassLoadAction::CLEAR || r.depthLoad == VKRRenderPassLoadAction::CLEAR || r.stencilLoad == VKRRenderPassLoadAction::CLEAR;766};767768// Now, let's go through the steps. 
If we find one that is rendered to more than once,769// we'll scan forward and slurp up any rendering that can be merged across.770for (int i = 0; i < (int)steps.size(); i++) {771if (steps[i]->stepType == VKRStepType::RENDER && counts[steps[i]->render.framebuffer] > 1) {772auto fb = steps[i]->render.framebuffer;773TinySet<VKRFramebuffer *, 8> touchedFramebuffers; // must be the same fast-size as the dependencies TinySet for annoying reasons.774for (int j = i + 1; j < (int)steps.size(); j++) {775// If any other passes are reading from this framebuffer as-is, we cancel the scan.776if (steps[j]->dependencies.contains(fb)) {777// Reading from itself means a KEEP, which is okay.778if (steps[j]->stepType != VKRStepType::RENDER || steps[j]->render.framebuffer != fb)779break;780}781switch (steps[j]->stepType) {782case VKRStepType::RENDER:783if (steps[j]->render.framebuffer == fb) {784// Prevent Unknown's example case from https://github.com/hrydgard/ppsspp/pull/12242785if (renderHasClear(steps[j]) || steps[j]->dependencies.contains(touchedFramebuffers)) {786goto done_fb;787} else {788// Safe to merge, great.789mergeRenderSteps(steps[i], steps[j]);790}791} else {792// Remember the framebuffer this wrote to. 
We can't merge with later passes that depend on these.793touchedFramebuffers.insert(steps[j]->render.framebuffer);794}795break;796case VKRStepType::COPY:797if (steps[j]->copy.dst == fb) {798// Without framebuffer "renaming", we can't merge past a clobbered fb.799goto done_fb;800}801touchedFramebuffers.insert(steps[j]->copy.dst);802break;803case VKRStepType::BLIT:804if (steps[j]->blit.dst == fb) {805// Without framebuffer "renaming", we can't merge past a clobbered fb.806goto done_fb;807}808touchedFramebuffers.insert(steps[j]->blit.dst);809break;810case VKRStepType::READBACK:811// Not sure this has much effect, when executed READBACK is always the last step812// since we stall the GPU and wait immediately after.813break;814case VKRStepType::RENDER_SKIP:815case VKRStepType::READBACK_IMAGE:816break;817default:818// We added a new step? Might be unsafe.819_dbg_assert_(false);820goto done_fb;821}822}823done_fb:824;825}826}827}828829void VulkanQueueRunner::LogSteps(const std::vector<VKRStep *> &steps, bool verbose) {830INFO_LOG(Log::G3D, "=================== FRAME ====================");831for (size_t i = 0; i < steps.size(); i++) {832const VKRStep &step = *steps[i];833switch (step.stepType) {834case VKRStepType::RENDER:835LogRenderPass(step, verbose);836break;837case VKRStepType::COPY:838LogCopy(step);839break;840case VKRStepType::BLIT:841LogBlit(step);842break;843case VKRStepType::READBACK:844LogReadback(step);845break;846case VKRStepType::READBACK_IMAGE:847LogReadbackImage(step);848break;849case VKRStepType::RENDER_SKIP:850INFO_LOG(Log::G3D, "(skipped render pass)");851break;852}853}854INFO_LOG(Log::G3D, "------------------- SUBMIT ------------------");855}856857const char *RenderPassActionName(VKRRenderPassLoadAction a) {858switch (a) {859case VKRRenderPassLoadAction::CLEAR:860return "CLEAR";861case VKRRenderPassLoadAction::DONT_CARE:862return "DONT_CARE";863case VKRRenderPassLoadAction::KEEP:864return "KEEP";865}866return "?";867}868869const char 
*ImageLayoutToString(VkImageLayout layout) {
	switch (layout) {
	case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: return "COLOR_ATTACHMENT";
	case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: return "DEPTH_STENCIL_ATTACHMENT";
	case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: return "SHADER_READ_ONLY";
	case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: return "TRANSFER_SRC";
	case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return "TRANSFER_DST";
	case VK_IMAGE_LAYOUT_GENERAL: return "GENERAL";
	case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: return "PRESENT_SRC_KHR";
	case VK_IMAGE_LAYOUT_UNDEFINED: return "UNDEFINED";
	default: return "(unknown)";
	}
}

// Logs one RENDER step: target, load actions, pre-transitions, and (if verbose)
// every recorded command in the pass.
void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
	const auto &r = pass.render;
	const char *framebuf = r.framebuffer ? r.framebuffer->Tag() : "backbuffer";
	int w = r.framebuffer ? r.framebuffer->width : vulkan_->GetBackbufferWidth();
	int h = r.framebuffer ? r.framebuffer->height : vulkan_->GetBackbufferHeight();

	INFO_LOG(Log::G3D, "RENDER %s Begin(%s, draws: %d, %dx%d, %s, %s, %s)", pass.tag, framebuf, r.numDraws, w, h, RenderPassActionName(r.colorLoad), RenderPassActionName(r.depthLoad), RenderPassActionName(r.stencilLoad));
	// TODO: Log these in detail.
	for (int i = 0; i < (int)pass.preTransitions.size(); i++) {
		INFO_LOG(Log::G3D, " PRETRANSITION: %s %s -> %s", pass.preTransitions[i].fb->Tag(), AspectToString(pass.preTransitions[i].aspect), ImageLayoutToString(pass.preTransitions[i].targetLayout));
	}

	if (verbose) {
		for (auto &cmd : pass.commands) {
			switch (cmd.cmd) {
			case VKRRenderCommand::REMOVED:
				INFO_LOG(Log::G3D, " (Removed)");
				break;
			case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
				INFO_LOG(Log::G3D, " BindGraphicsPipeline(%x)", (int)(intptr_t)cmd.graphics_pipeline.pipeline);
				break;
			case VKRRenderCommand::BLEND:
				INFO_LOG(Log::G3D, " BlendColor(%08x)", cmd.blendColor.color);
				break;
			case VKRRenderCommand::CLEAR:
				INFO_LOG(Log::G3D, " Clear");
				break;
			case VKRRenderCommand::DRAW:
				INFO_LOG(Log::G3D, " Draw(%d)", cmd.draw.count);
				break;
			case VKRRenderCommand::DRAW_INDEXED:
				INFO_LOG(Log::G3D, " DrawIndexed(%d)", cmd.drawIndexed.count);
				break;
			case VKRRenderCommand::SCISSOR:
				INFO_LOG(Log::G3D, " Scissor(%d, %d, %d, %d)", (int)cmd.scissor.scissor.offset.x, (int)cmd.scissor.scissor.offset.y, (int)cmd.scissor.scissor.extent.width, (int)cmd.scissor.scissor.extent.height);
				break;
			case VKRRenderCommand::STENCIL:
				INFO_LOG(Log::G3D, " Stencil(ref=%d, compare=%d, write=%d)", cmd.stencil.stencilRef, cmd.stencil.stencilCompareMask, cmd.stencil.stencilWriteMask);
				break;
			case VKRRenderCommand::VIEWPORT:
				INFO_LOG(Log::G3D, " Viewport(%f, %f, %f, %f, %f, %f)", cmd.viewport.vp.x, cmd.viewport.vp.y, cmd.viewport.vp.width, cmd.viewport.vp.height, cmd.viewport.vp.minDepth, cmd.viewport.vp.maxDepth);
				break;
			case VKRRenderCommand::PUSH_CONSTANTS:
				INFO_LOG(Log::G3D, " PushConstants(%d)", cmd.push.size);
				break;
			case VKRRenderCommand::DEBUG_ANNOTATION:
				INFO_LOG(Log::G3D, " DebugAnnotation(%s)", cmd.debugAnnotation.annotation);
				break;

			case VKRRenderCommand::NUM_RENDER_COMMANDS:
				break;
			}
		}
	}

	INFO_LOG(Log::G3D, " Final: %s %s", ImageLayoutToString(pass.render.finalColorLayout), ImageLayoutToString(pass.render.finalDepthStencilLayout));
	INFO_LOG(Log::G3D, "RENDER End(%s) - %d commands executed", framebuf, (int)pass.commands.size());
}

void VulkanQueueRunner::LogCopy(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogBlit(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogReadback(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void
VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd, int curFrame, QueueProfileContext &profile) {
	// Executes one RENDER step: applies requested pre-pass layout transitions, begins the
	// appropriate render pass, replays the recorded commands, ends the pass, and queues
	// the transitions to the step's desired final layouts.
	for (size_t i = 0; i < step.preTransitions.size(); i++) {
		const TransitionRequest &iter = step.preTransitions[i];
		if (iter.aspect == VK_IMAGE_ASPECT_COLOR_BIT && iter.fb->color.layout != iter.targetLayout) {
			recordBarrier_.TransitionColorImageAuto(
				&iter.fb->color,
				iter.targetLayout
			);
		} else if (iter.fb->depth.image != VK_NULL_HANDLE && (iter.aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) && iter.fb->depth.layout != iter.targetLayout) {
			recordBarrier_.TransitionDepthStencilImageAuto(
				&iter.fb->depth,
				iter.targetLayout
			);
		}
	}

	// Don't execute empty renderpasses that keep the contents.
	if (step.commands.empty() && step.render.colorLoad == VKRRenderPassLoadAction::KEEP && step.render.depthLoad == VKRRenderPassLoadAction::KEEP && step.render.stencilLoad == VKRRenderPassLoadAction::KEEP) {
		// Flush the pending barrier
		recordBarrier_.Flush(cmd);
		// Nothing to do.
		// TODO: Though - a later step might have used this step's finalColorLayout etc to get things in a layout it expects.
		// Should we just do a barrier? Or just let the later step deal with not having things in its preferred layout, like now?
		return;
	}

	// Write-after-write hazards. Fixed flicker in God of War on ARM (before we added another fix that removed these).
	// NOTE: These are commented out because the normal barriers no longer check for equality, effectively generating these
	// barriers automatically. This is safe, but sometimes I think can be improved on.
	/*
	if (step.render.framebuffer) {
		int n = 0;
		int stage = 0;

		if (step.render.framebuffer->color.layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
			recordBarrier_.TransitionImage(
				step.render.framebuffer->color.image,
				0,
				1,
				step.render.framebuffer->numLayers,
				VK_IMAGE_ASPECT_COLOR_BIT,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
			);
		}
		if (step.render.framebuffer->depth.image != VK_NULL_HANDLE && step.render.framebuffer->depth.layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {
			recordBarrier_.TransitionImage(
				step.render.framebuffer->depth.image,
				0,
				1,
				step.render.framebuffer->numLayers,
				VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
				VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT
			);
		}
	}
	*/

	// This chooses a render pass according to the load/store attachment state. We no longer transition
	// image layouts as part of the passes.
	//
	// NOTE: Unconditionally flushes recordBarrier_.
	VKRRenderPass *renderPass = PerformBindFramebufferAsRenderTarget(step, cmd);

	int curWidth = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth();
	int curHeight = step.render.framebuffer ? step.render.framebuffer->height : vulkan_->GetBackbufferHeight();

	VKRFramebuffer *fb = step.render.framebuffer;

	VKRGraphicsPipeline *lastGraphicsPipeline = nullptr;
	VKRComputePipeline *lastComputePipeline = nullptr;

	const auto &commands = step.commands;

	// We can do a little bit of state tracking here to eliminate some calls into the driver.
	// The stencil ones are very commonly mostly redundant so let's eliminate them where possible.
	// Might also want to consider scissor and viewport.
	VkPipeline lastPipeline = VK_NULL_HANDLE;
	FastVec<PendingDescSet> *descSets = nullptr;
	VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;

	bool pipelineOK = false;

	int lastStencilWriteMask = -1;
	int lastStencilCompareMask = -1;
	int lastStencilReference = -1;

	const RenderPassType rpType = step.render.renderPassType;

	for (size_t i = 0; i < commands.size(); i++) {
		const VkRenderData &c = commands[i];
#ifdef _DEBUG
		if (profile.enabled) {
			// BUGFIX: bounds-check the index that's actually used below (c.cmd).
			// The old check compared step.stepType against the array size, which
			// did not guard the commandCounts access at all.
			if ((size_t)c.cmd < ARRAY_SIZE(profile.commandCounts)) {
				profile.commandCounts[(size_t)c.cmd]++;
			}
		}
#endif
		switch (c.cmd) {
		case VKRRenderCommand::REMOVED:
			break;

		case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
		{
			VKRGraphicsPipeline *graphicsPipeline = c.graphics_pipeline.pipeline;
			if (graphicsPipeline != lastGraphicsPipeline) {
				VkSampleCountFlagBits fbSampleCount = fb ? fb->sampleCount : VK_SAMPLE_COUNT_1_BIT;

				if (RenderPassTypeHasMultisample(rpType) && fbSampleCount != graphicsPipeline->SampleCount()) {
					// should have been invalidated.
					_assert_msg_(graphicsPipeline->SampleCount() == VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM,
						"expected %d sample count, got %d", fbSampleCount, graphicsPipeline->SampleCount());
				}

				VkPipeline pipeline;

				{
					std::lock_guard<std::mutex> lock(graphicsPipeline->mutex_);
					if (!graphicsPipeline->pipeline[(size_t)rpType]) {
						// NOTE: If render steps got merged, it can happen that, as they ended during recording,
						// they didn't know their final render pass type so they created the wrong pipelines in EndCurRenderStep().
						// Unfortunately I don't know if we can fix it in any more sensible place than here.
						// Maybe a middle pass. But let's try to just block and compile here for now, this doesn't
						// happen all that much.
						graphicsPipeline->pipeline[(size_t)rpType] = Promise<VkPipeline>::CreateEmpty();
						graphicsPipeline->Create(vulkan_, renderPass->Get(vulkan_, rpType, fbSampleCount), rpType, fbSampleCount, time_now_d(), -1);
					}
					pipeline = graphicsPipeline->pipeline[(size_t)rpType]->BlockUntilReady();
				}

				if (pipeline != VK_NULL_HANDLE) {
					vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
					descSets = &c.graphics_pipeline.pipelineLayout->frameData[curFrame].descSets_;
					pipelineLayout = c.graphics_pipeline.pipelineLayout->pipelineLayout;
					_dbg_assert_(pipelineLayout != VK_NULL_HANDLE);
					lastGraphicsPipeline = graphicsPipeline;
					pipelineOK = true;
				} else {
					pipelineOK = false;
				}

				// Reset dynamic state so it gets refreshed with the new pipeline.
				lastStencilWriteMask = -1;
				lastStencilCompareMask = -1;
				lastStencilReference = -1;
			}
			break;
		}

		case VKRRenderCommand::VIEWPORT:
			if (fb != nullptr) {
				vkCmdSetViewport(cmd, 0, 1, &c.viewport.vp);
			} else {
				// Rendering to the backbuffer: the viewport must be rotated to match display orientation.
				const VkViewport &vp = c.viewport.vp;
				DisplayRect<float> rc{ vp.x, vp.y, vp.width, vp.height };
				RotateRectToDisplay(rc, (float)vulkan_->GetBackbufferWidth(), (float)vulkan_->GetBackbufferHeight());
				VkViewport final_vp;
				final_vp.x = rc.x;
				final_vp.y = rc.y;
				final_vp.width = rc.w;
				final_vp.height = rc.h;
				final_vp.maxDepth = vp.maxDepth;
				final_vp.minDepth = vp.minDepth;
				vkCmdSetViewport(cmd, 0, 1, &final_vp);
			}
			break;

		case VKRRenderCommand::SCISSOR:
		{
			if (fb != nullptr) {
				vkCmdSetScissor(cmd, 0, 1, &c.scissor.scissor);
			} else {
				// Rendering to backbuffer. Might need to rotate.
				const VkRect2D &rc = c.scissor.scissor;
				DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };
				RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());
				_dbg_assert_(rotated_rc.x >= 0);
				_dbg_assert_(rotated_rc.y >= 0);
				VkRect2D finalRect = VkRect2D{ { rotated_rc.x, rotated_rc.y }, { (uint32_t)rotated_rc.w, (uint32_t)rotated_rc.h} };
				vkCmdSetScissor(cmd, 0, 1, &finalRect);
			}
			break;
		}

		case VKRRenderCommand::BLEND:
		{
			float bc[4];
			Uint8x4ToFloat4(bc, c.blendColor.color);
			vkCmdSetBlendConstants(cmd, bc);
			break;
		}

		case VKRRenderCommand::PUSH_CONSTANTS:
			if (pipelineOK) {
				vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data);
			}
			break;

		case VKRRenderCommand::STENCIL:
			// Redundant stencil state changes are common, so only forward actual changes to the driver.
			if (lastStencilWriteMask != c.stencil.stencilWriteMask) {
				lastStencilWriteMask = (int)c.stencil.stencilWriteMask;
				vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilWriteMask);
			}
			if (lastStencilCompareMask != c.stencil.stencilCompareMask) {
				lastStencilCompareMask = c.stencil.stencilCompareMask;
				vkCmdSetStencilCompareMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilCompareMask);
			}
			if (lastStencilReference != c.stencil.stencilRef) {
				lastStencilReference = c.stencil.stencilRef;
				vkCmdSetStencilReference(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilRef);
			}
			break;

		case VKRRenderCommand::DRAW_INDEXED:
			if (pipelineOK) {
				VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;
				_dbg_assert_(set != VK_NULL_HANDLE);
				vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets);
				vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, VK_INDEX_TYPE_UINT16);
				VkDeviceSize voffset = c.drawIndexed.voffset;
				vkCmdBindVertexBuffers(cmd, 0, 1, &c.drawIndexed.vbuffer, &voffset);
				vkCmdDrawIndexed(cmd, c.drawIndexed.count, c.drawIndexed.instances, 0, 0, 0);
			}
			break;

		case VKRRenderCommand::DRAW:
			if (pipelineOK) {
				// BUGFIX: read the DRAW payload's descriptor set index - this previously
				// read c.drawIndexed.descSetIndex, i.e. the wrong union member for this command.
				VkDescriptorSet set = (*descSets)[c.draw.descSetIndex].set;
				_dbg_assert_(set != VK_NULL_HANDLE);
				vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.draw.numUboOffsets, c.draw.uboOffsets);
				if (c.draw.vbuffer) {
					vkCmdBindVertexBuffers(cmd, 0, 1, &c.draw.vbuffer, &c.draw.voffset);
				}
				vkCmdDraw(cmd, c.draw.count, 1, c.draw.offset, 0);
			}
			break;

		case VKRRenderCommand::CLEAR:
		{
			// If we get here, we failed to merge a clear into a render pass load op. This is bad for perf.
			int numAttachments = 0;
			VkClearRect rc{};
			rc.baseArrayLayer = 0;
			rc.layerCount = 1;  // In multiview mode, 1 means to replicate to all the active layers.
			rc.rect.extent.width = (uint32_t)curWidth;
			rc.rect.extent.height = (uint32_t)curHeight;
			VkClearAttachment attachments[2]{};
			if (c.clear.clearMask & VK_IMAGE_ASPECT_COLOR_BIT) {
				VkClearAttachment &attachment = attachments[numAttachments++];
				attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
				attachment.colorAttachment = 0;
				Uint8x4ToFloat4(attachment.clearValue.color.float32, c.clear.clearColor);
			}
			if (c.clear.clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
				VkClearAttachment &attachment = attachments[numAttachments++];
				attachment.aspectMask = 0;
				if (c.clear.clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
					attachment.clearValue.depthStencil.depth = c.clear.clearZ;
					attachment.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
				}
				if (c.clear.clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
					attachment.clearValue.depthStencil.stencil = (uint32_t)c.clear.clearStencil;
					attachment.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
				}
			}
			if (numAttachments) {
				vkCmdClearAttachments(cmd, numAttachments, attachments, 1, &rc);
			}
			break;
		}

		case VKRRenderCommand::DEBUG_ANNOTATION:
			if (vulkan_->Extensions().EXT_debug_utils) {
				VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };
				labelInfo.pLabelName = c.debugAnnotation.annotation;
				vkCmdInsertDebugUtilsLabelEXT(cmd, &labelInfo);
			}
			break;

		default:
			UNREACHABLE();
			break;
		}
	}
	vkCmdEndRenderPass(cmd);

	_dbg_assert_(recordBarrier_.empty());

	if (fb) {
		// If the desired final layout aren't the optimal layout needed next, early-transition the image.
		if (step.render.finalColorLayout != fb->color.layout) {
			recordBarrier_.TransitionColorImageAuto(&fb->color, step.render.finalColorLayout);
		}
		if
(fb->depth.image && step.render.finalDepthStencilLayout != fb->depth.layout) {
			recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, step.render.finalDepthStencilLayout);
		}
	}
}

// Selects the render pass matching the step's load/store actions, queues the layout
// transitions the attachments need, fills in clear values, and begins the render pass.
// Returns the chosen render pass. NOTE: unconditionally flushes recordBarrier_.
VKRRenderPass *VulkanQueueRunner::PerformBindFramebufferAsRenderTarget(const VKRStep &step, VkCommandBuffer cmd) {
	VKRRenderPass *renderPass;
	int numClearVals = 0;
	VkClearValue clearVal[4]{};
	VkFramebuffer framebuf;
	int w;
	int h;

	bool hasDepth = RenderPassTypeHasDepth(step.render.renderPassType);

	VkSampleCountFlagBits sampleCount;

	// Can be used to separate the final*Layout barrier from the rest for debugging in renderdoc.
	// recordBarrier_.Flush(cmd);

	if (step.render.framebuffer) {
		_dbg_assert_(step.render.finalColorLayout != VK_IMAGE_LAYOUT_UNDEFINED);
		_dbg_assert_(step.render.finalDepthStencilLayout != VK_IMAGE_LAYOUT_UNDEFINED);

		RPKey key{
			step.render.colorLoad, step.render.depthLoad, step.render.stencilLoad,
			step.render.colorStore, step.render.depthStore, step.render.stencilStore,
		};
		renderPass = GetRenderPass(key);

		VKRFramebuffer *fb = step.render.framebuffer;
		framebuf = fb->Get(renderPass, step.render.renderPassType);
		sampleCount = fb->sampleCount;
		_dbg_assert_(framebuf != VK_NULL_HANDLE);
		w = fb->width;
		h = fb->height;

		// Mali driver on S8 (Android O) and S9 mishandles renderpasses that do just a clear
		// and then no draw calls. Memory transaction elimination gets mis-flagged or something.
		// To avoid this, we transition to GENERAL and back in this case (ARM-approved workaround).
		// See pull request #10723.
		bool maliBugWorkaround = step.render.numDraws == 0 &&
			step.render.colorLoad == VKRRenderPassLoadAction::CLEAR &&
			vulkan_->GetPhysicalDeviceProperties().properties.driverVersion == 0xaa9c4b29;
		if (maliBugWorkaround) {
			// A little suboptimal but let's go for maximum safety here.
			recordBarrier_.TransitionImage(fb->color.image, 0, 1, fb->numLayers, VK_IMAGE_ASPECT_COLOR_BIT,
				fb->color.layout, VK_IMAGE_LAYOUT_GENERAL,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
			fb->color.layout = VK_IMAGE_LAYOUT_GENERAL;
		}

		recordBarrier_.TransitionColorImageAuto(&fb->color, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);

		// If the render pass doesn't touch depth, we can avoid a layout transition of the depth buffer.
		if (fb->depth.image && RenderPassTypeHasDepth(step.render.renderPassType)) {
			recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL);
		}

		// The transition from the optimal format happens after EndRenderPass, now that we don't
		// do it as part of the renderpass itself anymore.

		if (sampleCount != VK_SAMPLE_COUNT_1_BIT) {
			// We don't initialize values for these.
			numClearVals = hasDepth ? 2 : 1; // Skip the resolve buffers, don't need to clear those.
		}
		if (step.render.colorLoad == VKRRenderPassLoadAction::CLEAR) {
			Uint8x4ToFloat4(clearVal[numClearVals].color.float32, step.render.clearColor);
		}
		numClearVals++;
		if (hasDepth) {
			if (step.render.depthLoad == VKRRenderPassLoadAction::CLEAR || step.render.stencilLoad == VKRRenderPassLoadAction::CLEAR) {
				clearVal[numClearVals].depthStencil.depth = step.render.clearDepth;
				clearVal[numClearVals].depthStencil.stencil = step.render.clearStencil;
			}
			numClearVals++;
		}
		_dbg_assert_(numClearVals != 3);
	} else {
		// Backbuffer target: always full clear with store of color, don't-care depth/stencil.
		RPKey key{
			VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,
			VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, VKRRenderPassStoreAction::DONT_CARE,
		};
		renderPass = GetRenderPass(key);
		framebuf = backbuffer_;

		// Raw, rotated backbuffer size.
		w = vulkan_->GetBackbufferWidth();
		h = vulkan_->GetBackbufferHeight();

		Uint8x4ToFloat4(clearVal[0].color.float32, step.render.clearColor);
		numClearVals = hasDepth ?
2 : 1; // We might do depth-less backbuffer in the future, though doubtful of the value.
		clearVal[1].depthStencil.depth = 0.0f;
		clearVal[1].depthStencil.stencil = 0;
		sampleCount = VK_SAMPLE_COUNT_1_BIT;
	}

	VkRenderPassBeginInfo rp_begin = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO };
	rp_begin.renderPass = renderPass->Get(vulkan_, step.render.renderPassType, sampleCount);
	rp_begin.framebuffer = framebuf;

	VkRect2D rc = step.render.renderArea;
	if (!step.render.framebuffer) {
		// Rendering to backbuffer, must rotate, just like scissors.
		DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };
		RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());

		rc.offset.x = rotated_rc.x;
		rc.offset.y = rotated_rc.y;
		rc.extent.width = rotated_rc.w;
		rc.extent.height = rotated_rc.h;
	}

	recordBarrier_.Flush(cmd);

	rp_begin.renderArea = rc;
	rp_begin.clearValueCount = numClearVals;
	rp_begin.pClearValues = numClearVals ? clearVal : nullptr;
	vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);

	return renderPass;
}

// Executes a COPY step: transitions source/dest (and MSAA buddies, if both targets are
// multisampled) to transfer layouts, performs vkCmdCopyImage for the requested aspects,
// then transitions the MSAA images back to their permanent attachment layouts.
void VulkanQueueRunner::PerformCopy(const VKRStep &step, VkCommandBuffer cmd) {
	VKRFramebuffer *src = step.copy.src;
	VKRFramebuffer *dst = step.copy.dst;

	int layerCount = std::min(step.copy.src->numLayers, step.copy.dst->numLayers);
	_dbg_assert_(step.copy.src->numLayers >= step.copy.dst->numLayers);

	// TODO: If dst covers exactly the whole destination, we can set up a UNDEFINED->TRANSFER_DST_OPTIMAL transition,
	// which can potentially be more efficient.

	// Intra-image copies require GENERAL layout; otherwise use the optimal transfer-src layout.
	const VkImageLayout srcTransferLayout = src != dst ? VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL : VK_IMAGE_LAYOUT_GENERAL;

	if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		recordBarrier_.TransitionColorImageAuto(&src->color, srcTransferLayout);
		if (src != dst) {
			recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		}
	}

	// We can't copy only depth or only stencil unfortunately - or can we?.
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_dbg_assert_(src->depth.image != VK_NULL_HANDLE);
		recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, srcTransferLayout);
		if (src != dst) {
			if (dst->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
				recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
			} else {
				// Kingdom Hearts: Subsequent copies twice to the same depth buffer without any other use.
				// Not super sure how that happens, but we need a barrier to pass sync validation.
				SetupTransferDstWriteAfterWrite(dst->depth, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, &recordBarrier_);
			}
		}
	}

	bool multisampled = src->sampleCount != VK_SAMPLE_COUNT_1_BIT && dst->sampleCount != VK_SAMPLE_COUNT_1_BIT;
	if (multisampled) {
		// If both the targets are multisampled, copy the msaa targets too.
		// For that, we need to transition them from their normally permanent VK_*_ATTACHMENT_OPTIMAL layouts, and then back.
		if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			recordBarrier_.TransitionColorImageAuto(&src->msaaColor, srcTransferLayout);
			if (src != dst) {
				recordBarrier_.TransitionColorImageAuto(&dst->msaaColor, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
			}
		}
		if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			recordBarrier_.TransitionDepthStencilImageAuto(&src->msaaDepth, srcTransferLayout);
			if (src != dst) {
				// Kingdom Hearts: Subsequent copies to the same depth buffer without any other use.
				// Not super sure how that happens, but we need a barrier to pass sync validation.
				recordBarrier_.TransitionDepthStencilImageAuto(&dst->msaaDepth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
			}
		}
	}

	recordBarrier_.Flush(cmd);

	VkImageCopy copy{};
	copy.srcOffset.x = step.copy.srcRect.offset.x;
	copy.srcOffset.y = step.copy.srcRect.offset.y;
	copy.srcOffset.z = 0;
	copy.srcSubresource.mipLevel = 0;
	copy.srcSubresource.layerCount = layerCount;
	copy.dstOffset.x = step.copy.dstPos.x;
	copy.dstOffset.y = step.copy.dstPos.y;
	copy.dstOffset.z = 0;
	copy.dstSubresource.mipLevel = 0;
	copy.dstSubresource.layerCount = layerCount;
	copy.extent.width = step.copy.srcRect.extent.width;
	copy.extent.height = step.copy.srcRect.extent.height;
	copy.extent.depth = 1;

	if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		vkCmdCopyImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &copy);

		if (multisampled) {
			vkCmdCopyImage(cmd, src->msaaColor.image, src->msaaColor.layout, dst->msaaColor.image, dst->msaaColor.layout, 1, &copy);
		}
	}
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_dbg_assert_(src->depth.image != VK_NULL_HANDLE);
		_dbg_assert_(dst->depth.image != VK_NULL_HANDLE);
		copy.srcSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		copy.dstSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		vkCmdCopyImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &copy);

		if (multisampled) {
			vkCmdCopyImage(cmd, src->msaaDepth.image, src->msaaDepth.layout, dst->msaaDepth.image, dst->msaaDepth.layout, 1, &copy);
		}
	}

	if (multisampled) {
		// Transition the MSAA surfaces back to optimal.
		if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			recordBarrier_.TransitionImage(
				src->msaaColor.image,
				0,
				1,
				src->msaaColor.numLayers,
				VK_IMAGE_ASPECT_COLOR_BIT,
				srcTransferLayout,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_READ_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
			);
			src->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
			if (src != dst) {
				recordBarrier_.TransitionImage(
					dst->msaaColor.image,
					0,
					1,
					dst->msaaColor.numLayers,
					VK_IMAGE_ASPECT_COLOR_BIT,
					VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
					VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
					VK_ACCESS_TRANSFER_WRITE_BIT,
					VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
					VK_PIPELINE_STAGE_TRANSFER_BIT,
					VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
				);
				dst->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
			}
		}
		if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			recordBarrier_.TransitionImage(
				src->msaaDepth.image,
				0,
				1,
				src->msaaDepth.numLayers,
				VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
				srcTransferLayout,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_READ_BIT,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
			);
			src->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
			if (src != dst) {
				recordBarrier_.TransitionImage(
					dst->msaaDepth.image,
					0,
					1,
					dst->msaaDepth.numLayers,
					VK_IMAGE_ASPECT_DEPTH_BIT |
			// (continued from above — trailing arguments of a depth/stencil transition
			// for dst->msaaDepth: transfer-dst -> depth/stencil attachment.)
			VK_IMAGE_ASPECT_STENCIL_BIT,
			VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
			VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
			VK_ACCESS_TRANSFER_WRITE_BIT,
			VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
			VK_PIPELINE_STAGE_TRANSFER_BIT,
			VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
		);
		dst->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
		}
	}
	// Probably not necessary.
	recordBarrier_.Flush(cmd);
	}
}

// Records a vkCmdBlitImage (color and/or depth/stencil) from step.blit.src to
// step.blit.dst, issuing the required layout-transition barriers first.
// Scaling/filtering is allowed for the color aspect (step.blit.filter).
void VulkanQueueRunner::PerformBlit(const VKRStep &step, VkCommandBuffer cmd) {
	// The barrier code doesn't handle this case. We'd need to transition to GENERAL to do an intra-image copy.
	_dbg_assert_(step.blit.src != step.blit.dst);

	// Blit at most as many layers as the destination has; src must have at least that many.
	int layerCount = std::min(step.blit.src->numLayers, step.blit.dst->numLayers);
	_dbg_assert_(step.blit.src->numLayers >= step.blit.dst->numLayers);

	// Blitting is not allowed for multisample images. You're supposed to use vkCmdResolveImage but it only goes in one direction (multi to single).
	_dbg_assert_(step.blit.src->sampleCount == VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT);
	_dbg_assert_(step.blit.dst->sampleCount == VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT);

	VKRFramebuffer *src = step.blit.src;
	VKRFramebuffer *dst = step.blit.dst;

	// First source barriers.
	if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		recordBarrier_.TransitionColorImageAuto(&src->color, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	}

	// We can't copy only depth or only stencil unfortunately.
	if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_assert_(src->depth.image != VK_NULL_HANDLE);
		_assert_(dst->depth.image != VK_NULL_HANDLE);
		recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	}

	recordBarrier_.Flush(cmd);

	// If any validation needs to be performed here, it should probably have been done
	// already when the blit was queued. So don't validate here.
	VkImageBlit blit{};
	blit.srcOffsets[0].x = step.blit.srcRect.offset.x;
	blit.srcOffsets[0].y = step.blit.srcRect.offset.y;
	blit.srcOffsets[0].z = 0;
	blit.srcOffsets[1].x = step.blit.srcRect.offset.x + step.blit.srcRect.extent.width;
	blit.srcOffsets[1].y = step.blit.srcRect.offset.y + step.blit.srcRect.extent.height;
	blit.srcOffsets[1].z = 1;
	blit.srcSubresource.mipLevel = 0;
	blit.srcSubresource.layerCount = layerCount;
	blit.dstOffsets[0].x = step.blit.dstRect.offset.x;
	blit.dstOffsets[0].y = step.blit.dstRect.offset.y;
	blit.dstOffsets[0].z = 0;
	blit.dstOffsets[1].x = step.blit.dstRect.offset.x + step.blit.dstRect.extent.width;
	blit.dstOffsets[1].y = step.blit.dstRect.offset.y + step.blit.dstRect.extent.height;
	blit.dstOffsets[1].z = 1;
	blit.dstSubresource.mipLevel = 0;
	blit.dstSubresource.layerCount = layerCount;

	if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		vkCmdBlitImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &blit, step.blit.filter);
	}

	// TODO: Need to check if the depth format is blittable.
	// Actually, we should probably almost always use copies rather than blits for depth buffers.
	if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		// Build the aspect mask incrementally — combined depth/stencil images may
		// blit one or both aspects depending on what was requested.
		blit.srcSubresource.aspectMask = 0;
		blit.dstSubresource.aspectMask = 0;
		if (step.blit.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
			blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
			blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
		}
		if (step.blit.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
			blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
			blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
		}
		vkCmdBlitImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &blit, step.blit.filter);
	}
}

// Queues a write-after-write barrier for an image that is already in
// TRANSFER_DST_OPTIMAL layout (asserted below) so a second transfer write is
// ordered after the first. Layout is unchanged (TRANSFER_DST -> TRANSFER_DST).
void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrierBatch *recordBarrier) {
	// NOTE(review): imageAspect, srcAccessMask and srcStageMask are computed here
	// but never used — TransitionImage below receives the raw `aspect` parameter
	// and hardcoded access/stage masks. Confirm whether `imageAspect` (which adds
	// the stencil aspect for combined depth/stencil formats) was intended there.
	VkImageAspectFlags imageAspect = aspect;
	VkAccessFlags srcAccessMask = 0;
	VkPipelineStageFlags srcStageMask = 0;
	if (img.format == VK_FORMAT_D16_UNORM_S8_UINT || img.format == VK_FORMAT_D24_UNORM_S8_UINT || img.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
		// Barrier must specify both for combined depth/stencil buffers.
		imageAspect = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
	} else {
		imageAspect = aspect;
	}
	_dbg_assert_(img.layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
	recordBarrier->TransitionImage(
		img.image,
		0,      // baseMip
		1,      // mip count
		img.numLayers,
		aspect,
		VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
		VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
		VK_ACCESS_TRANSFER_WRITE_BIT,
		VK_ACCESS_TRANSFER_WRITE_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT
	);
}

// Ensures `readback` owns a host-visible buffer of at least requiredSize bytes.
// Grows (never shrinks) the buffer; the old one is queued for deferred deletion
// since the GPU may still be using it. Also records whether the chosen memory
// type is host-coherent, so readers know whether to invalidate before mapping.
void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {
	// Fast path: existing buffer is already big enough.
	if (readback->buffer && requiredSize <= readback->bufferSize) {
		return;
	}

	if (readback->buffer) {
		// Can't destroy immediately — in-flight frames may still reference it.
		vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);
	}

	readback->bufferSize = requiredSize;

	VkDevice device = vulkan_->GetDevice();

	VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	buf.size = readback->bufferSize;
	buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

	VmaAllocationCreateInfo allocCreateInfo{};
	allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
	VmaAllocationInfo allocInfo{};

	VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, &allocInfo);
	_assert_(res == VK_SUCCESS);

	// If the memory type isn't HOST_COHERENT, CopyReadbackBuffer must
	// vmaInvalidateAllocation before reading the mapped pointer.
	const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
	readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

// Records a copy of a framebuffer region (or the backbuffer, when
// step.readback.src is null) into a host-visible readback buffer.
// Delayed readbacks use a per-frame cache keyed by (framebuffer, w, h);
// synchronous ones share syncReadback_. The CPU must NOT read the buffer
// until the command buffer has been submitted and synced.
void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {
	VkImage image;
	VkImageLayout copyLayout;
	// Special case for backbuffer readbacks.
	if (step.readback.src == nullptr) {
		// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,
		// and then back into it.
		// Regarding layers, backbuffer currently only has one layer.
		recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,
			VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
			0, VK_ACCESS_TRANSFER_READ_BIT,
			VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
		copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
		image = backbufferImage_;
	} else {
		// Pick color or depth attachment based on the requested aspect.
		VKRImage *srcImage;
		if (step.readback.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			srcImage = &step.readback.src->color;
			recordBarrier_.TransitionColorImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		} else if (step.readback.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			srcImage = &step.readback.src->depth;
			recordBarrier_.TransitionDepthStencilImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
			_dbg_assert_(srcImage->image != VK_NULL_HANDLE);
		} else {
			_dbg_assert_msg_(false, "No image aspect to readback?");
			return;
		}
		image = srcImage->image;
		copyLayout = srcImage->layout;
	}

	recordBarrier_.Flush(cmd);

	// TODO: Handle different readback formats!
	// Assumes 4 bytes per pixel regardless of actual format (see TODO above).
	u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;

	CachedReadback *cached = nullptr;

	if (step.readback.delayed) {
		ReadbackKey key;
		key.framebuf = step.readback.src;
		key.width = step.readback.srcRect.extent.width;
		key.height = step.readback.srcRect.extent.height;

		// See if there's already a buffer we can reuse
		if (!frameData.readbacks_.Get(key, &cached)) {
			cached = new CachedReadback();
			cached->bufferSize = 0;
			frameData.readbacks_.Insert(key, cached);
		}
	} else {
		cached = &syncReadback_;
	}

	ResizeReadbackBuffer(cached, readbackSizeInBytes);

	VkBufferImageCopy region{};
	region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
	region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
	region.imageSubresource.aspectMask = step.readback.aspectMask;
	region.imageSubresource.layerCount = 1;
	region.bufferOffset = 0;
	region.bufferRowLength = step.readback.srcRect.extent.width;
	region.bufferImageHeight = step.readback.srcRect.extent.height;

	vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, &region);

	// NOTE: Can't read the buffer using the CPU here - need to sync first.

	// If we copied from the backbuffer, transition it back.
	if (step.readback.src == nullptr) {
		// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,
		// and then back into it.
		// Regarding layers, backbuffer currently only has one layer.
		recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,
			VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
			VK_ACCESS_TRANSFER_READ_BIT, 0,
			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
		recordBarrier_.Flush(cmd); // probably not needed
		// NOTE(review): dead store — copyLayout is never read after this point.
		copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
	}
}

// Records a synchronous readback of a raw VkImage (a texture, not a
// framebuffer) into syncReadback_, then transitions the image back to
// SHADER_READ_ONLY_OPTIMAL so it can keep being sampled.
void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffer cmd) {
	// TODO: Clean this up - just reusing `SetupTransitionToTransferSrc`.
	VkImageLayout layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
	recordBarrier_.TransitionColorImageAuto(step.readback_image.image, &layout, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 1, 1);
	recordBarrier_.Flush(cmd);

	// Assumes 4 bytes per pixel (same assumption as PerformReadback).
	ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);

	VkBufferImageCopy region{};
	region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
	region.imageExtent = { step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height, 1 };
	region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
	region.imageSubresource.layerCount = 1;
	region.imageSubresource.mipLevel = step.readback_image.mipLevel;
	region.bufferOffset = 0;
	region.bufferRowLength = step.readback_image.srcRect.extent.width;
	region.bufferImageHeight = step.readback_image.srcRect.extent.height;
	vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, &region);

	// Now transfer it back to a texture.
	recordBarrier_.TransitionImage(step.readback_image.image, 0, 1, 1,  // I don't think we have any multilayer cases for regular textures. Above in PerformReadback, though..
		VK_IMAGE_ASPECT_COLOR_BIT,
		VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
		VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
	recordBarrier_.Flush(cmd); // probably not needed

	// NOTE: Can't read the buffer using the CPU here - need to sync first.
	// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}

// Maps a previously recorded readback buffer (cached per framebuffer when
// `src` is non-null, else the sync one) and converts/copies its contents into
// `pixels` with the requested destination format and row stride (in pixels).
// Returns false if no matching readback buffer exists yet. Must only be
// called after the GPU work producing the readback has been synced.
bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
	CachedReadback *readback = &syncReadback_;

	// Look up in readback cache.
	if (src) {
		ReadbackKey key;
		key.framebuf = src;
		key.width = width;
		key.height = height;
		CachedReadback *cached;
		if (frameData.readbacks_.Get(key, &cached)) {
			readback = cached;
		} else {
			// Didn't have a cached image ready yet
			return false;
		}
	}

	if (!readback->buffer)
		return false;  // Didn't find anything in cache, or something has gone really wrong.

	// Read back to the requested address in ram from buffer.
	void *mappedData;
	const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);
	VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);

	if (res != VK_SUCCESS) {
		// (Message says vkMapMemory but the call above is vmaMapMemory.)
		ERROR_LOG(Log::G3D, "CopyReadbackBuffer: vkMapMemory failed! result=%d", (int)res);
		return false;
	}

	// Non-coherent memory: make device writes visible to the host before reading.
	if (!readback->isCoherent) {
		vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);
	}

	// TODO: Perform these conversions in a compute shader on the GPU.
	if (srcFormat == Draw::DataFormat::R8G8B8A8_UNORM) {
		ConvertFromRGBA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);
	} else if (srcFormat == Draw::DataFormat::B8G8R8A8_UNORM) {
		ConvertFromBGRA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);
	} else if (srcFormat == destFormat) {
		// Can just memcpy when it matches no matter the format!
		uint8_t *dst = pixels;
		const uint8_t *src = (const uint8_t *)mappedData;
		for (int y = 0; y < height; ++y) {
			memcpy(dst, src, width * srcPixelSize);
			src += width * srcPixelSize;
			dst += pixelStride * srcPixelSize;
		}
	} else if (destFormat == Draw::DataFormat::D32F) {
		ConvertToD32F(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);
	} else if (destFormat == Draw::DataFormat::D16) {
		ConvertToD16(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);
	} else {
		// TODO: Maybe a depth conversion or something?
		ERROR_LOG(Log::G3D, "CopyReadbackBuffer: Unknown format");
		_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
	}

	vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);
	return true;
}

// Debug helper: maps a VKRRenderCommand enum value to a printable name.
// Returns "N/A" for out-of-range values (table must stay in sync with the enum).
const char *VKRRenderCommandToString(VKRRenderCommand cmd) {
	const char * const str[] = {
		"REMOVED",
		"BIND_GRAPHICS_PIPELINE",  // async
		"STENCIL",
		"BLEND",
		"VIEWPORT",
		"SCISSOR",
		"CLEAR",
		"DRAW",
		"DRAW_INDEXED",
		"PUSH_CONSTANTS",
		"DEBUG_ANNOTATION",
	};
	// NOTE(review): no lower-bound check — assumes cmd is never negative.
	if ((int)cmd < ARRAY_SIZE(str)) {
		return str[(int)cmd];
	} else {
		return "N/A";
	}
}
