CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Common/GPU/Vulkan/VulkanQueueRunner.cpp
Views: 1401
#include <unordered_map>12#include "Common/GPU/DataFormat.h"3#include "Common/GPU/Vulkan/VulkanQueueRunner.h"4#include "Common/GPU/Vulkan/VulkanRenderManager.h"5#include "Common/VR/PPSSPPVR.h"6#include "Common/Log.h"7#include "Common/TimeUtil.h"89using namespace PPSSPP_VK;1011// Debug help: adb logcat -s DEBUG AndroidRuntime PPSSPPNativeActivity PPSSPP NativeGLView NativeRenderer NativeSurfaceView PowerSaveModeReceiver InputDeviceState PpssppActivity CameraHelper1213static void MergeRenderAreaRectInto(VkRect2D *dest, const VkRect2D &src) {14if (dest->offset.x > src.offset.x) {15dest->extent.width += (dest->offset.x - src.offset.x);16dest->offset.x = src.offset.x;17}18if (dest->offset.y > src.offset.y) {19dest->extent.height += (dest->offset.y - src.offset.y);20dest->offset.y = src.offset.y;21}22if (dest->offset.x + dest->extent.width < src.offset.x + src.extent.width) {23dest->extent.width = src.offset.x + src.extent.width - dest->offset.x;24}25if (dest->offset.y + dest->extent.height < src.offset.y + src.extent.height) {26dest->extent.height = src.offset.y + src.extent.height - dest->offset.y;27}28}2930// We need to take the "max" of the features used in the two render passes.31RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {32// Either both are backbuffer type, or neither are.33// These can't merge with other renderpasses34if (a == RenderPassType::BACKBUFFER || b == RenderPassType::BACKBUFFER) {35_dbg_assert_(a == b);36return a;37}3839_dbg_assert_((a & RenderPassType::MULTIVIEW) == (b & RenderPassType::MULTIVIEW));4041// The rest we can just OR together to get the maximum feature set.42return (RenderPassType)((u32)a | (u32)b);43}4445void VulkanQueueRunner::CreateDeviceObjects() {46INFO_LOG(Log::G3D, "VulkanQueueRunner::CreateDeviceObjects");4748RPKey key{49VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,50VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, 
VKRRenderPassStoreAction::DONT_CARE,51};52compatibleRenderPass_ = GetRenderPass(key);5354#if 055// Just to check whether it makes sense to split some of these. drawidx is way bigger than the others...56// We should probably just move to variable-size data in a raw buffer anyway...57VkRenderData rd;58INFO_LOG(Log::G3D, "sizeof(pipeline): %d", (int)sizeof(rd.pipeline));59INFO_LOG(Log::G3D, "sizeof(draw): %d", (int)sizeof(rd.draw));60INFO_LOG(Log::G3D, "sizeof(drawidx): %d", (int)sizeof(rd.drawIndexed));61INFO_LOG(Log::G3D, "sizeof(clear): %d", (int)sizeof(rd.clear));62INFO_LOG(Log::G3D, "sizeof(viewport): %d", (int)sizeof(rd.viewport));63INFO_LOG(Log::G3D, "sizeof(scissor): %d", (int)sizeof(rd.scissor));64INFO_LOG(Log::G3D, "sizeof(blendColor): %d", (int)sizeof(rd.blendColor));65INFO_LOG(Log::G3D, "sizeof(push): %d", (int)sizeof(rd.push));66#endif67}6869void VulkanQueueRunner::DestroyDeviceObjects() {70INFO_LOG(Log::G3D, "VulkanQueueRunner::DestroyDeviceObjects");7172syncReadback_.Destroy(vulkan_);7374renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {75_assert_(rp);76rp->Destroy(vulkan_);77delete rp;78});79renderPasses_.Clear();80}8182bool VulkanQueueRunner::CreateSwapchain(VkCommandBuffer cmdInit, VulkanBarrierBatch *barriers) {83VkResult res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, nullptr);84_dbg_assert_(res == VK_SUCCESS);8586VkImage *swapchainImages = new VkImage[swapchainImageCount_];87res = vkGetSwapchainImagesKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &swapchainImageCount_, swapchainImages);88if (res != VK_SUCCESS) {89ERROR_LOG(Log::G3D, "vkGetSwapchainImagesKHR failed");90delete[] swapchainImages;91return false;92}9394for (uint32_t i = 0; i < swapchainImageCount_; i++) {95SwapchainImageData sc_buffer{};96sc_buffer.image = swapchainImages[i];9798VkImageViewCreateInfo color_image_view = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };99color_image_view.format = 
vulkan_->GetSwapchainFormat();100color_image_view.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;101color_image_view.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;102color_image_view.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;103color_image_view.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;104color_image_view.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;105color_image_view.subresourceRange.baseMipLevel = 0;106color_image_view.subresourceRange.levelCount = 1;107color_image_view.subresourceRange.baseArrayLayer = 0;108color_image_view.subresourceRange.layerCount = 1; // TODO: Investigate hw-assisted stereo.109color_image_view.viewType = VK_IMAGE_VIEW_TYPE_2D;110color_image_view.flags = 0;111color_image_view.image = sc_buffer.image;112113// We leave the images as UNDEFINED, there's no need to pre-transition them as114// the backbuffer renderpass starts out with them being auto-transitioned from UNDEFINED anyway.115// Also, turns out it's illegal to transition un-acquired images, thanks Hans-Kristian. 
See #11417.116117res = vkCreateImageView(vulkan_->GetDevice(), &color_image_view, nullptr, &sc_buffer.view);118vulkan_->SetDebugName(sc_buffer.view, VK_OBJECT_TYPE_IMAGE_VIEW, "swapchain_view");119swapchainImages_.push_back(sc_buffer);120_dbg_assert_(res == VK_SUCCESS);121}122delete[] swapchainImages;123124// Must be before InitBackbufferRenderPass.125if (InitDepthStencilBuffer(cmdInit, barriers)) {126InitBackbufferFramebuffers(vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());127}128return true;129}130131bool VulkanQueueRunner::InitBackbufferFramebuffers(int width, int height) {132VkResult res;133// We share the same depth buffer but have multiple color buffers, see the loop below.134VkImageView attachments[2] = { VK_NULL_HANDLE, depth_.view };135136VkFramebufferCreateInfo fb_info = { VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };137fb_info.renderPass = GetCompatibleRenderPass()->Get(vulkan_, RenderPassType::BACKBUFFER, VK_SAMPLE_COUNT_1_BIT);138fb_info.attachmentCount = 2;139fb_info.pAttachments = attachments;140fb_info.width = width;141fb_info.height = height;142fb_info.layers = 1;143144framebuffers_.resize(swapchainImageCount_);145146for (uint32_t i = 0; i < swapchainImageCount_; i++) {147attachments[0] = swapchainImages_[i].view;148res = vkCreateFramebuffer(vulkan_->GetDevice(), &fb_info, nullptr, &framebuffers_[i]);149_dbg_assert_(res == VK_SUCCESS);150if (res != VK_SUCCESS) {151framebuffers_.clear();152return false;153}154}155156return true;157}158159bool VulkanQueueRunner::InitDepthStencilBuffer(VkCommandBuffer cmd, VulkanBarrierBatch *barriers) {160const VkFormat depth_format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat;161int aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;162VkImageCreateInfo image_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };163image_info.imageType = VK_IMAGE_TYPE_2D;164image_info.format = depth_format;165image_info.extent.width = vulkan_->GetBackbufferWidth();166image_info.extent.height = 
vulkan_->GetBackbufferHeight();167image_info.extent.depth = 1;168image_info.mipLevels = 1;169image_info.arrayLayers = 1;170image_info.samples = VK_SAMPLE_COUNT_1_BIT;171image_info.queueFamilyIndexCount = 0;172image_info.pQueueFamilyIndices = nullptr;173image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;174image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;175image_info.flags = 0;176177depth_.format = depth_format;178179VmaAllocationCreateInfo allocCreateInfo{};180VmaAllocationInfo allocInfo{};181182allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;183184VkResult res = vmaCreateImage(vulkan_->Allocator(), &image_info, &allocCreateInfo, &depth_.image, &depth_.alloc, &allocInfo);185_dbg_assert_(res == VK_SUCCESS);186if (res != VK_SUCCESS)187return false;188189vulkan_->SetDebugName(depth_.image, VK_OBJECT_TYPE_IMAGE, "BackbufferDepth");190191VkImageMemoryBarrier *barrier = barriers->Add(depth_.image,192VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,193VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, 0);194barrier->subresourceRange.aspectMask = aspectMask;195barrier->oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;196barrier->newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;197barrier->srcAccessMask = 0;198barrier->dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;199200VkImageViewCreateInfo depth_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };201depth_view_info.image = depth_.image;202depth_view_info.format = depth_format;203depth_view_info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;204depth_view_info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;205depth_view_info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;206depth_view_info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;207depth_view_info.subresourceRange.aspectMask = 
aspectMask;208depth_view_info.subresourceRange.baseMipLevel = 0;209depth_view_info.subresourceRange.levelCount = 1;210depth_view_info.subresourceRange.baseArrayLayer = 0;211depth_view_info.subresourceRange.layerCount = 1;212depth_view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;213depth_view_info.flags = 0;214215VkDevice device = vulkan_->GetDevice();216217res = vkCreateImageView(device, &depth_view_info, NULL, &depth_.view);218vulkan_->SetDebugName(depth_.view, VK_OBJECT_TYPE_IMAGE_VIEW, "depth_stencil_backbuffer");219_dbg_assert_(res == VK_SUCCESS);220if (res != VK_SUCCESS)221return false;222223return true;224}225226227void VulkanQueueRunner::DestroyBackBuffers() {228for (auto &image : swapchainImages_) {229vulkan_->Delete().QueueDeleteImageView(image.view);230}231swapchainImages_.clear();232233if (depth_.view) {234vulkan_->Delete().QueueDeleteImageView(depth_.view);235}236if (depth_.image) {237_dbg_assert_(depth_.alloc);238vulkan_->Delete().QueueDeleteImageAllocation(depth_.image, depth_.alloc);239}240depth_ = {};241for (uint32_t i = 0; i < framebuffers_.size(); i++) {242_dbg_assert_(framebuffers_[i] != VK_NULL_HANDLE);243vulkan_->Delete().QueueDeleteFramebuffer(framebuffers_[i]);244}245framebuffers_.clear();246247INFO_LOG(Log::G3D, "Backbuffers destroyed");248}249250// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827251// Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies252VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {253VKRRenderPass *foundPass;254if (renderPasses_.Get(key, &foundPass)) {255return foundPass;256}257258VKRRenderPass *pass = new VKRRenderPass(key);259renderPasses_.Insert(key, pass);260return pass;261}262263void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {264// Optimizes renderpasses, then sequences them.265// Planned optimizations:266// * Create copies of render target that are 
rendered to multiple times and textured from in sequence, and push those render passes267// as early as possible in the frame (Wipeout billboards). This will require taking over more of descriptor management so we can268// substitute descriptors, alternatively using texture array layers creatively.269270for (int j = 0; j < (int)steps.size(); j++) {271if (steps[j]->stepType == VKRStepType::RENDER &&272steps[j]->render.framebuffer) {273if (steps[j]->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {274steps[j]->render.finalColorLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;275}276if (steps[j]->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {277steps[j]->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;278}279}280}281282for (int j = 0; j < (int)steps.size() - 1; j++) {283// Push down empty "Clear/Store" renderpasses, and merge them with the first "Load/Store" to the same framebuffer.284if (steps.size() > 1 && steps[j]->stepType == VKRStepType::RENDER &&285steps[j]->render.numDraws == 0 &&286steps[j]->render.numReads == 0 &&287steps[j]->render.colorLoad == VKRRenderPassLoadAction::CLEAR &&288steps[j]->render.stencilLoad == VKRRenderPassLoadAction::CLEAR &&289steps[j]->render.depthLoad == VKRRenderPassLoadAction::CLEAR) {290291// Drop the clear step, and merge it into the next step that touches the same framebuffer.292for (int i = j + 1; i < (int)steps.size(); i++) {293if (steps[i]->stepType == VKRStepType::RENDER &&294steps[i]->render.framebuffer == steps[j]->render.framebuffer) {295if (steps[i]->render.colorLoad != VKRRenderPassLoadAction::CLEAR) {296steps[i]->render.colorLoad = VKRRenderPassLoadAction::CLEAR;297steps[i]->render.clearColor = steps[j]->render.clearColor;298}299if (steps[i]->render.depthLoad != VKRRenderPassLoadAction::CLEAR) {300steps[i]->render.depthLoad = VKRRenderPassLoadAction::CLEAR;301steps[i]->render.clearDepth = steps[j]->render.clearDepth;302}303if (steps[i]->render.stencilLoad != 
VKRRenderPassLoadAction::CLEAR) {304steps[i]->render.stencilLoad = VKRRenderPassLoadAction::CLEAR;305steps[i]->render.clearStencil = steps[j]->render.clearStencil;306}307MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);308steps[i]->render.renderPassType = MergeRPTypes(steps[i]->render.renderPassType, steps[j]->render.renderPassType);309steps[i]->render.numDraws += steps[j]->render.numDraws;310steps[i]->render.numReads += steps[j]->render.numReads;311// Cheaply skip the first step.312steps[j]->stepType = VKRStepType::RENDER_SKIP;313break;314} else if (steps[i]->stepType == VKRStepType::COPY &&315steps[i]->copy.src == steps[j]->render.framebuffer) {316// Can't eliminate the clear if a game copies from it before it's317// rendered to. However this should be rare.318// TODO: This should never happen when we check numReads now.319break;320}321}322}323}324325// Queue hacks.326if (hacksEnabled_) {327if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) {328// Massive speedup.329ApplyMGSHack(steps);330}331if (hacksEnabled_ & QUEUE_HACK_SONIC) {332ApplySonicHack(steps);333}334if (hacksEnabled_ & QUEUE_HACK_RENDERPASS_MERGE) {335ApplyRenderPassMerge(steps);336}337}338}339340void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps) {341QueueProfileContext *profile = frameData.profile.enabled ? &frameData.profile : nullptr;342343if (profile)344profile->cpuStartTime = time_now_d();345346bool emitLabels = vulkan_->Extensions().EXT_debug_utils;347348VkCommandBuffer cmd = frameData.hasPresentCommands ? 
frameData.presentCmd : frameData.mainCmd;349350for (size_t i = 0; i < steps.size(); i++) {351const VKRStep &step = *steps[i];352if (emitLabels) {353VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };354char temp[128];355if (step.stepType == VKRStepType::RENDER && step.render.framebuffer) {356snprintf(temp, sizeof(temp), "%s: %s", step.tag, step.render.framebuffer->Tag());357labelInfo.pLabelName = temp;358} else {359labelInfo.pLabelName = step.tag;360}361vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);362}363364switch (step.stepType) {365case VKRStepType::RENDER:366if (!step.render.framebuffer) {367if (emitLabels) {368vkCmdEndDebugUtilsLabelEXT(cmd);369}370frameData.Submit(vulkan_, FrameSubmitType::Pending, frameDataShared);371372// When stepping in the GE debugger, we can end up here multiple times in a "frame".373// So only acquire once.374if (!frameData.hasAcquired) {375frameData.AcquireNextImage(vulkan_);376SetBackbuffer(framebuffers_[frameData.curSwapchainImage], swapchainImages_[frameData.curSwapchainImage].image);377}378379if (!frameData.hasPresentCommands) {380// A RENDER step rendering to the backbuffer is normally the last step that happens in a frame,381// unless taking a screenshot, in which case there might be a READBACK_IMAGE after it.382// This is why we have to switch cmd to presentCmd, in this case.383VkCommandBufferBeginInfo begin{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };384begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;385vkBeginCommandBuffer(frameData.presentCmd, &begin);386frameData.hasPresentCommands = true;387}388cmd = frameData.presentCmd;389if (emitLabels) {390VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };391labelInfo.pLabelName = "present";392vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);393}394}395PerformRenderPass(step, cmd, curFrame, frameData.profile);396break;397case VKRStepType::COPY:398PerformCopy(step, cmd);399break;400case VKRStepType::BLIT:401PerformBlit(step, 
cmd);402break;403case VKRStepType::READBACK:404PerformReadback(step, cmd, frameData);405break;406case VKRStepType::READBACK_IMAGE:407PerformReadbackImage(step, cmd);408break;409case VKRStepType::RENDER_SKIP:410break;411}412413if (profile && profile->timestampsEnabled && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {414vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());415profile->timestampDescriptions.push_back(StepToString(vulkan_, step));416}417418if (emitLabels) {419vkCmdEndDebugUtilsLabelEXT(cmd);420}421}422423// Deleting all in one go should be easier on the instruction cache than deleting424// them as we go - and easier to debug because we can look backwards in the frame.425if (!keepSteps) {426for (auto step : steps) {427delete step;428}429steps.clear();430}431432if (profile)433profile->cpuEndTime = time_now_d();434}435436void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {437// Really need a sane way to express transforms of steps.438439// We want to turn a sequence of copy,render(1),copy,render(1),copy,render(1) to copy,copy,copy,render(n).440441for (int i = 0; i < (int)steps.size() - 3; i++) {442int last = -1;443if (!(steps[i]->stepType == VKRStepType::COPY &&444steps[i + 1]->stepType == VKRStepType::RENDER &&445steps[i + 2]->stepType == VKRStepType::COPY &&446steps[i + 1]->render.numDraws == 1 &&447steps[i]->copy.dst == steps[i + 2]->copy.dst))448continue;449// Looks promising! 
Let's start by finding the last one.450for (int j = i; j < (int)steps.size(); j++) {451switch (steps[j]->stepType) {452case VKRStepType::RENDER:453if (steps[j]->render.numDraws > 1)454last = j - 1;455// should really also check descriptor sets...456if (steps[j]->commands.size()) {457const VkRenderData &cmd = steps[j]->commands.back();458if (cmd.cmd == VKRRenderCommand::DRAW_INDEXED && cmd.draw.count != 6)459last = j - 1;460}461break;462case VKRStepType::COPY:463if (steps[j]->copy.dst != steps[i]->copy.dst)464last = j - 1;465break;466default:467break;468}469if (last != -1)470break;471}472473if (last != -1) {474// We've got a sequence from i to last that needs reordering.475// First, let's sort it, keeping the same length.476std::vector<VKRStep *> copies;477std::vector<VKRStep *> renders;478copies.reserve((last - i) / 2);479renders.reserve((last - i) / 2);480for (int n = i; n <= last; n++) {481if (steps[n]->stepType == VKRStepType::COPY)482copies.push_back(steps[n]);483else if (steps[n]->stepType == VKRStepType::RENDER)484renders.push_back(steps[n]);485}486// Write the copies back. 
TODO: Combine them too.487for (int j = 0; j < (int)copies.size(); j++) {488steps[i + j] = copies[j];489}490491const int firstRender = i + (int)copies.size();492493// Write the renders back (so they will be deleted properly).494for (int j = 0; j < (int)renders.size(); j++) {495steps[firstRender + j] = renders[j];496}497_assert_(steps[firstRender]->stepType == VKRStepType::RENDER);498// Combine the renders.499for (int j = 1; j < (int)renders.size(); j++) {500steps[firstRender]->commands.reserve(renders[j]->commands.size());501for (int k = 0; k < (int)renders[j]->commands.size(); k++) {502steps[firstRender]->commands.push_back(renders[j]->commands[k]);503}504MergeRenderAreaRectInto(&steps[firstRender]->render.renderArea, renders[j]->render.renderArea);505// Easier than removing them from the list, though that might be the better option.506steps[firstRender + j]->stepType = VKRStepType::RENDER_SKIP;507steps[firstRender + j]->commands.clear();508}509// We're done.510break;511}512}513514// There's also a post processing effect using depals that's just brutal in some parts515// of the game.516for (int i = 0; i < (int)steps.size() - 3; i++) {517int last = -1;518if (!(steps[i]->stepType == VKRStepType::RENDER &&519steps[i + 1]->stepType == VKRStepType::RENDER &&520steps[i + 2]->stepType == VKRStepType::RENDER &&521steps[i]->render.numDraws == 1 &&522steps[i + 1]->render.numDraws == 1 &&523steps[i + 2]->render.numDraws == 1 &&524steps[i]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE &&525steps[i + 1]->render.colorLoad == VKRRenderPassLoadAction::KEEP &&526steps[i + 2]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE))527continue;528VKRFramebuffer *depalFramebuffer = steps[i]->render.framebuffer;529VKRFramebuffer *targetFramebuffer = steps[i + 1]->render.framebuffer;530// OK, found the start of a post-process sequence. 
Let's scan until we find the end.531for (int j = i; j < (int)steps.size() - 3; j++) {532if (((j - i) & 1) == 0) {533// This should be a depal draw.534if (steps[j]->render.numDraws != 1)535break;536if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::DONT_CARE)537break;538if (steps[j]->render.framebuffer != depalFramebuffer)539break;540last = j;541} else {542// This should be a target draw.543if (steps[j]->render.numDraws != 1)544break;545if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::KEEP)546break;547if (steps[j]->render.framebuffer != targetFramebuffer)548break;549last = j;550}551}552553if (last == -1)554continue;555556// Combine the depal renders.557for (int j = i + 2; j <= last + 1; j += 2) {558for (int k = 0; k < (int)steps[j]->commands.size(); k++) {559switch (steps[j]->commands[k].cmd) {560case VKRRenderCommand::DRAW:561case VKRRenderCommand::DRAW_INDEXED:562steps[i]->commands.push_back(steps[j]->commands[k]);563break;564default:565break;566}567}568MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);569steps[j]->stepType = VKRStepType::RENDER_SKIP;570}571572// Combine the target renders.573for (int j = i + 3; j <= last; j += 2) {574for (int k = 0; k < (int)steps[j]->commands.size(); k++) {575switch (steps[j]->commands[k].cmd) {576case VKRRenderCommand::DRAW:577case VKRRenderCommand::DRAW_INDEXED:578steps[i + 1]->commands.push_back(steps[j]->commands[k]);579break;580default:581break;582}583}584MergeRenderAreaRectInto(&steps[i + 1]->render.renderArea, steps[j]->render.renderArea);585steps[j]->stepType = VKRStepType::RENDER_SKIP;586}587588// We're done - we only expect one of these sequences per frame.589break;590}591}592593void VulkanQueueRunner::ApplySonicHack(std::vector<VKRStep *> &steps) {594// We want to turn a sequence of render(3),render(1),render(6),render(1),render(6),render(1),render(3) to595// render(1), render(1), render(1), render(6), render(6), render(6)596597for (int i = 0; i < (int)steps.size() - 
4; i++) {598int last = -1;599if (!(steps[i]->stepType == VKRStepType::RENDER &&600steps[i + 1]->stepType == VKRStepType::RENDER &&601steps[i + 2]->stepType == VKRStepType::RENDER &&602steps[i + 3]->stepType == VKRStepType::RENDER &&603steps[i]->render.numDraws == 3 &&604steps[i + 1]->render.numDraws == 1 &&605steps[i + 2]->render.numDraws == 6 &&606steps[i + 3]->render.numDraws == 1 &&607steps[i]->render.framebuffer == steps[i + 2]->render.framebuffer &&608steps[i + 1]->render.framebuffer == steps[i + 3]->render.framebuffer))609continue;610// Looks promising! Let's start by finding the last one.611for (int j = i; j < (int)steps.size(); j++) {612switch (steps[j]->stepType) {613case VKRStepType::RENDER:614if ((j - i) & 1) {615if (steps[j]->render.framebuffer != steps[i + 1]->render.framebuffer)616last = j - 1;617if (steps[j]->render.numDraws != 1)618last = j - 1;619} else {620if (steps[j]->render.framebuffer != steps[i]->render.framebuffer)621last = j - 1;622if (steps[j]->render.numDraws != 3 && steps[j]->render.numDraws != 6)623last = j - 1;624}625break;626default:627break;628}629if (last != -1)630break;631}632633if (last != -1) {634// We've got a sequence from i to last that needs reordering.635// First, let's sort it, keeping the same length.636std::vector<VKRStep *> type1;637std::vector<VKRStep *> type2;638type1.reserve((last - i) / 2);639type2.reserve((last - i) / 2);640for (int n = i; n <= last; n++) {641if (steps[n]->render.framebuffer == steps[i]->render.framebuffer)642type1.push_back(steps[n]);643else644type2.push_back(steps[n]);645}646647// Write the renders back in order. 
Same amount, so deletion will work fine.648for (int j = 0; j < (int)type1.size(); j++) {649steps[i + j] = type1[j];650}651for (int j = 0; j < (int)type2.size(); j++) {652steps[i + j + type1.size()] = type2[j];653}654655// Combine the renders.656for (int j = 1; j < (int)type1.size(); j++) {657for (int k = 0; k < (int)type1[j]->commands.size(); k++) {658steps[i]->commands.push_back(type1[j]->commands[k]);659}660steps[i + j]->stepType = VKRStepType::RENDER_SKIP;661}662for (int j = 1; j < (int)type2.size(); j++) {663for (int k = 0; k < (int)type2[j]->commands.size(); k++) {664steps[i + type1.size()]->commands.push_back(type2[j]->commands[k]);665}666// Technically, should merge render area here, but they're all the same so not needed.667steps[i + type1.size() + j]->stepType = VKRStepType::RENDER_SKIP;668}669// We're done.670break;671}672}673}674675const char *AspectToString(VkImageAspectFlags aspect) {676switch (aspect) {677case VK_IMAGE_ASPECT_COLOR_BIT: return "COLOR";678case VK_IMAGE_ASPECT_DEPTH_BIT: return "DEPTH";679case VK_IMAGE_ASPECT_STENCIL_BIT: return "STENCIL";680case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: return "DEPTHSTENCIL";681default: return "UNUSUAL";682}683}684685std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {686char buffer[256];687switch (step.stepType) {688case VKRStepType::RENDER:689{690int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan->GetBackbufferWidth();691int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();692int actual_w = step.render.renderArea.extent.width;693int actual_h = step.render.renderArea.extent.height;694const char *renderCmd = GetRPTypeName(step.render.renderPassType);695snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? 
step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);696break;697}698case VKRStepType::COPY:699snprintf(buffer, sizeof(buffer), "COPY '%s' %s -> %s (%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.copy.srcRect.extent.width, step.copy.srcRect.extent.height, AspectToString(step.copy.aspectMask));700break;701case VKRStepType::BLIT:702snprintf(buffer, sizeof(buffer), "BLIT '%s' %s -> %s (%dx%d->%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.blit.srcRect.extent.width, step.blit.srcRect.extent.height, step.blit.dstRect.extent.width, step.blit.dstRect.extent.height, AspectToString(step.blit.aspectMask));703break;704case VKRStepType::READBACK:705snprintf(buffer, sizeof(buffer), "READBACK '%s' %s (%dx%d, %s)", step.tag, step.readback.src ? step.readback.src->Tag() : "(backbuffer)", step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, AspectToString(step.readback.aspectMask));706break;707case VKRStepType::READBACK_IMAGE:708snprintf(buffer, sizeof(buffer), "READBACK_IMAGE '%s' (%dx%d)", step.tag, step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height);709break;710case VKRStepType::RENDER_SKIP:711snprintf(buffer, sizeof(buffer), "(RENDER_SKIP) %s", step.tag);712break;713default:714buffer[0] = 0;715break;716}717return std::string(buffer);718}719720// Ideally, this should be cheap enough to be applied to all games. At least on mobile, it's pretty721// much a guaranteed neutral or win in terms of GPU power. However, dependency calculation really722// must be perfect!723void VulkanQueueRunner::ApplyRenderPassMerge(std::vector<VKRStep *> &steps) {724// First let's count how many times each framebuffer is rendered to.725// If it's more than one, let's do our best to merge them. 
This can help God of War quite a bit.726std::unordered_map<VKRFramebuffer *, int> counts;727for (int i = 0; i < (int)steps.size(); i++) {728if (steps[i]->stepType == VKRStepType::RENDER) {729counts[steps[i]->render.framebuffer]++;730}731}732733auto mergeRenderSteps = [](VKRStep *dst, VKRStep *src) {734// OK. Now, if it's a render, slurp up all the commands and kill the step.735// Also slurp up any pretransitions.736dst->preTransitions.append(src->preTransitions);737dst->commands.insert(dst->commands.end(), src->commands.begin(), src->commands.end());738MergeRenderAreaRectInto(&dst->render.renderArea, src->render.renderArea);739// So we don't consider it for other things, maybe doesn't matter.740src->dependencies.clear();741src->stepType = VKRStepType::RENDER_SKIP;742dst->render.numDraws += src->render.numDraws;743dst->render.numReads += src->render.numReads;744dst->render.pipelineFlags |= src->render.pipelineFlags;745dst->render.renderPassType = MergeRPTypes(dst->render.renderPassType, src->render.renderPassType);746};747auto renderHasClear = [](const VKRStep *step) {748const auto &r = step->render;749return r.colorLoad == VKRRenderPassLoadAction::CLEAR || r.depthLoad == VKRRenderPassLoadAction::CLEAR || r.stencilLoad == VKRRenderPassLoadAction::CLEAR;750};751752// Now, let's go through the steps. 
If we find one that is rendered to more than once,753// we'll scan forward and slurp up any rendering that can be merged across.754for (int i = 0; i < (int)steps.size(); i++) {755if (steps[i]->stepType == VKRStepType::RENDER && counts[steps[i]->render.framebuffer] > 1) {756auto fb = steps[i]->render.framebuffer;757TinySet<VKRFramebuffer *, 8> touchedFramebuffers; // must be the same fast-size as the dependencies TinySet for annoying reasons.758for (int j = i + 1; j < (int)steps.size(); j++) {759// If any other passes are reading from this framebuffer as-is, we cancel the scan.760if (steps[j]->dependencies.contains(fb)) {761// Reading from itself means a KEEP, which is okay.762if (steps[j]->stepType != VKRStepType::RENDER || steps[j]->render.framebuffer != fb)763break;764}765switch (steps[j]->stepType) {766case VKRStepType::RENDER:767if (steps[j]->render.framebuffer == fb) {768// Prevent Unknown's example case from https://github.com/hrydgard/ppsspp/pull/12242769if (renderHasClear(steps[j]) || steps[j]->dependencies.contains(touchedFramebuffers)) {770goto done_fb;771} else {772// Safe to merge, great.773mergeRenderSteps(steps[i], steps[j]);774}775} else {776// Remember the framebuffer this wrote to. 
We can't merge with later passes that depend on these.
					touchedFramebuffers.insert(steps[j]->render.framebuffer);
				}
				break;
			case VKRStepType::COPY:
				if (steps[j]->copy.dst == fb) {
					// Without framebuffer "renaming", we can't merge past a clobbered fb.
					goto done_fb;
				}
				touchedFramebuffers.insert(steps[j]->copy.dst);
				break;
			case VKRStepType::BLIT:
				if (steps[j]->blit.dst == fb) {
					// Without framebuffer "renaming", we can't merge past a clobbered fb.
					goto done_fb;
				}
				touchedFramebuffers.insert(steps[j]->blit.dst);
				break;
			case VKRStepType::READBACK:
				// Not sure this has much effect, when executed READBACK is always the last step
				// since we stall the GPU and wait immediately after.
				break;
			case VKRStepType::RENDER_SKIP:
			case VKRStepType::READBACK_IMAGE:
				break;
			default:
				// We added a new step? Might be unsafe.
				goto done_fb;
			}
		}
	done_fb:
		;
	}
}

// Logs a one-line summary for every queued step in the frame.
// With verbose == true, LogRenderPass additionally dumps each recorded render command.
void VulkanQueueRunner::LogSteps(const std::vector<VKRStep *> &steps, bool verbose) {
	INFO_LOG(Log::G3D, "=================== FRAME ====================");
	for (size_t i = 0; i < steps.size(); i++) {
		const VKRStep &step = *steps[i];
		switch (step.stepType) {
		case VKRStepType::RENDER:
			LogRenderPass(step, verbose);
			break;
		case VKRStepType::COPY:
			LogCopy(step);
			break;
		case VKRStepType::BLIT:
			LogBlit(step);
			break;
		case VKRStepType::READBACK:
			LogReadback(step);
			break;
		case VKRStepType::READBACK_IMAGE:
			LogReadbackImage(step);
			break;
		case VKRStepType::RENDER_SKIP:
			INFO_LOG(Log::G3D, "(skipped render pass)");
			break;
		}
	}
	INFO_LOG(Log::G3D, "------------------- SUBMIT ------------------");
}

// Human-readable name of a load action, for log output only.
const char *RenderPassActionName(VKRRenderPassLoadAction a) {
	switch (a) {
	case VKRRenderPassLoadAction::CLEAR:
		return "CLEAR";
	case VKRRenderPassLoadAction::DONT_CARE:
		return "DONT_CARE";
	case VKRRenderPassLoadAction::KEEP:
		return "KEEP";
	}
	return "?";
}

// Human-readable name of the subset of VkImageLayout values this runner uses, for log output only.
const char *ImageLayoutToString(VkImageLayout layout) {
	switch (layout) {
	case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: return "COLOR_ATTACHMENT";
	case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: return "DEPTH_STENCIL_ATTACHMENT";
	case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: return "SHADER_READ_ONLY";
	case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: return "TRANSFER_SRC";
	case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return "TRANSFER_DST";
	case VK_IMAGE_LAYOUT_GENERAL: return "GENERAL";
	case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: return "PRESENT_SRC_KHR";
	case VK_IMAGE_LAYOUT_UNDEFINED: return "UNDEFINED";
	default: return "(unknown)";
	}
}

// Logs a render step: header line (target, draw count, size, load actions), pre-transitions,
// optionally every recorded command, then the final layouts.
void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
	const auto &r = pass.render;
	const char *framebuf = r.framebuffer ? r.framebuffer->Tag() : "backbuffer";
	int w = r.framebuffer ? r.framebuffer->width : vulkan_->GetBackbufferWidth();
	int h = r.framebuffer ? r.framebuffer->height : vulkan_->GetBackbufferHeight();

	INFO_LOG(Log::G3D, "RENDER %s Begin(%s, draws: %d, %dx%d, %s, %s, %s)", pass.tag, framebuf, r.numDraws, w, h, RenderPassActionName(r.colorLoad), RenderPassActionName(r.depthLoad), RenderPassActionName(r.stencilLoad));
	// TODO: Log these in detail.
	for (int i = 0; i < (int)pass.preTransitions.size(); i++) {
		INFO_LOG(Log::G3D, " PRETRANSITION: %s %s -> %s", pass.preTransitions[i].fb->Tag(), AspectToString(pass.preTransitions[i].aspect), ImageLayoutToString(pass.preTransitions[i].targetLayout));
	}

	if (verbose) {
		for (auto &cmd : pass.commands) {
			switch (cmd.cmd) {
			case VKRRenderCommand::REMOVED:
				INFO_LOG(Log::G3D, " (Removed)");
				break;
			case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
				INFO_LOG(Log::G3D, " BindGraphicsPipeline(%x)", (int)(intptr_t)cmd.graphics_pipeline.pipeline);
				break;
			case VKRRenderCommand::BLEND:
				INFO_LOG(Log::G3D, " BlendColor(%08x)", cmd.blendColor.color);
				break;
			case VKRRenderCommand::CLEAR:
				INFO_LOG(Log::G3D, " Clear");
				break;
			case VKRRenderCommand::DRAW:
				INFO_LOG(Log::G3D, " Draw(%d)", cmd.draw.count);
				break;
			case VKRRenderCommand::DRAW_INDEXED:
				INFO_LOG(Log::G3D, " DrawIndexed(%d)", cmd.drawIndexed.count);
				break;
			case VKRRenderCommand::SCISSOR:
				INFO_LOG(Log::G3D, " Scissor(%d, %d, %d, %d)", (int)cmd.scissor.scissor.offset.x, (int)cmd.scissor.scissor.offset.y, (int)cmd.scissor.scissor.extent.width, (int)cmd.scissor.scissor.extent.height);
				break;
			case VKRRenderCommand::STENCIL:
				INFO_LOG(Log::G3D, " Stencil(ref=%d, compare=%d, write=%d)", cmd.stencil.stencilRef, cmd.stencil.stencilCompareMask, cmd.stencil.stencilWriteMask);
				break;
			case VKRRenderCommand::VIEWPORT:
				INFO_LOG(Log::G3D, " Viewport(%f, %f, %f, %f, %f, %f)", cmd.viewport.vp.x, cmd.viewport.vp.y, cmd.viewport.vp.width, cmd.viewport.vp.height, cmd.viewport.vp.minDepth, cmd.viewport.vp.maxDepth);
				break;
			case VKRRenderCommand::PUSH_CONSTANTS:
				INFO_LOG(Log::G3D, " PushConstants(%d)", cmd.push.size);
				break;
			case VKRRenderCommand::DEBUG_ANNOTATION:
				INFO_LOG(Log::G3D, " DebugAnnotation(%s)", cmd.debugAnnotation.annotation);
				break;

			case VKRRenderCommand::NUM_RENDER_COMMANDS:
				break;
			}
		}
	}

	INFO_LOG(Log::G3D, " Final: %s %s", ImageLayoutToString(pass.render.finalColorLayout), ImageLayoutToString(pass.render.finalDepthStencilLayout));
	INFO_LOG(Log::G3D, "RENDER End(%s) - %d commands executed", framebuf, (int)pass.commands.size());
}

void VulkanQueueRunner::LogCopy(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogBlit(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogReadback(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) {
	INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());
}

// Executes a RENDER step: records any requested pre-transitions into recordBarrier_,
// begins the appropriate render pass (via PerformBindFramebufferAsRenderTarget, which
// flushes the barrier batch), replays the step's recorded command list into cmd with
// some redundant-state elimination, ends the render pass, and finally queues layout
// transitions toward the step's finalColorLayout/finalDepthStencilLayout.
void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd, int curFrame, QueueProfileContext &profile) {
	// Record the pre-transitions requested for framebuffers this pass will sample from.
	for (size_t i = 0; i < step.preTransitions.size(); i++) {
		const TransitionRequest &iter = step.preTransitions[i];
		if (iter.aspect == VK_IMAGE_ASPECT_COLOR_BIT && iter.fb->color.layout != iter.targetLayout) {
			recordBarrier_.TransitionColorImageAuto(
				&iter.fb->color,
				iter.targetLayout
			);
		} else if (iter.fb->depth.image != VK_NULL_HANDLE && (iter.aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) && iter.fb->depth.layout != iter.targetLayout) {
			recordBarrier_.TransitionDepthStencilImageAuto(
				&iter.fb->depth,
				iter.targetLayout
			);
		}
	}

	// Don't execute empty renderpasses that keep the contents.
	if (step.commands.empty() && step.render.colorLoad == VKRRenderPassLoadAction::KEEP && step.render.depthLoad == VKRRenderPassLoadAction::KEEP && step.render.stencilLoad == VKRRenderPassLoadAction::KEEP) {
		// Flush the pending barrier
		recordBarrier_.Flush(cmd);
		// Nothing to do.
		// TODO: Though - a later step might have used this step's finalColorLayout etc to get things in a layout it expects.
		// Should we just do a barrier? Or just let the later step deal with not having things in its preferred layout, like now?
		return;
	}

	// Write-after-write hazards. Fixed flicker in God of War on ARM (before we added another fix that removed these).
	// NOTE: These are commented out because the normal barriers no longer check for equality, effectively generating these
	// barriers automatically. This is safe, but sometimes I think can be improved on.
	/*
	if (step.render.framebuffer) {
		int n = 0;
		int stage = 0;

		if (step.render.framebuffer->color.layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
			recordBarrier_.TransitionImage(
				step.render.framebuffer->color.image,
				0,
				1,
				step.render.framebuffer->numLayers,
				VK_IMAGE_ASPECT_COLOR_BIT,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
			);
		}
		if (step.render.framebuffer->depth.image != VK_NULL_HANDLE && step.render.framebuffer->depth.layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {
			recordBarrier_.TransitionImage(
				step.render.framebuffer->depth.image,
				0,
				1,
				step.render.framebuffer->numLayers,
				VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
				VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
			);
		}
	}*/

	// This chooses a render pass according to the load/store attachment state. We no longer transition
	// image layouts as part of the passes.
	//
	// NOTE: Unconditionally flushes recordBarrier_.
	VKRRenderPass *renderPass = PerformBindFramebufferAsRenderTarget(step, cmd);

	int curWidth = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth();
	int curHeight = step.render.framebuffer ? step.render.framebuffer->height : vulkan_->GetBackbufferHeight();

	VKRFramebuffer *fb = step.render.framebuffer;

	VKRGraphicsPipeline *lastGraphicsPipeline = nullptr;
	VKRComputePipeline *lastComputePipeline = nullptr;

	const auto &commands = step.commands;

	// We can do a little bit of state tracking here to eliminate some calls into the driver.
	// The stencil ones are very commonly mostly redundant so let's eliminate them where possible.
	// Might also want to consider scissor and viewport.
	VkPipeline lastPipeline = VK_NULL_HANDLE;
	FastVec<PendingDescSet> *descSets = nullptr;
	VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;

	// Set when a pipeline is successfully bound; draw/push-constant commands are skipped while false.
	bool pipelineOK = false;

	// -1 means "unknown, must re-send" (reset after each pipeline bind).
	int lastStencilWriteMask = -1;
	int lastStencilCompareMask = -1;
	int lastStencilReference = -1;

	const RenderPassType rpType = step.render.renderPassType;

	for (size_t i = 0; i < commands.size(); i++) {
		const VkRenderData &c = commands[i];
#ifdef _DEBUG
		if (profile.enabled) {
			// NOTE(review): the bounds check uses step.stepType but the increment indexes with
			// c.cmd - looks like a copy-paste slip; confirm whether the check should be on c.cmd.
			if ((size_t)step.stepType < ARRAY_SIZE(profile.commandCounts)) {
				profile.commandCounts[(size_t)c.cmd]++;
			}
		}
#endif
		switch (c.cmd) {
		case VKRRenderCommand::REMOVED:
			break;

		case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
		{
			VKRGraphicsPipeline *graphicsPipeline = c.graphics_pipeline.pipeline;
			if (graphicsPipeline != lastGraphicsPipeline) {
				VkSampleCountFlagBits fbSampleCount = fb ? fb->sampleCount : VK_SAMPLE_COUNT_1_BIT;

				if (RenderPassTypeHasMultisample(rpType) && fbSampleCount != graphicsPipeline->SampleCount()) {
					// should have been invalidated.
					_assert_msg_(graphicsPipeline->SampleCount() == VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM,
						"expected %d sample count, got %d", fbSampleCount, graphicsPipeline->SampleCount());
				}

				if (!graphicsPipeline->pipeline[(size_t)rpType]) {
					// NOTE: If render steps got merged, it can happen that, as they ended during recording,
					// they didn't know their final render pass type so they created the wrong pipelines in EndCurRenderStep().
					// Unfortunately I don't know if we can fix it in any more sensible place than here.
					// Maybe a middle pass. But let's try to just block and compile here for now, this doesn't
					// happen all that much.
					graphicsPipeline->pipeline[(size_t)rpType] = Promise<VkPipeline>::CreateEmpty();
					graphicsPipeline->Create(vulkan_, renderPass->Get(vulkan_, rpType, fbSampleCount), rpType, fbSampleCount, time_now_d(), -1);
				}

				// May stall if the pipeline is still compiling on another thread.
				VkPipeline pipeline = graphicsPipeline->pipeline[(size_t)rpType]->BlockUntilReady();

				if (pipeline != VK_NULL_HANDLE) {
					vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
					descSets = &c.graphics_pipeline.pipelineLayout->frameData[curFrame].descSets_;
					pipelineLayout = c.graphics_pipeline.pipelineLayout->pipelineLayout;
					_dbg_assert_(pipelineLayout != VK_NULL_HANDLE);
					lastGraphicsPipeline = graphicsPipeline;
					pipelineOK = true;
				} else {
					pipelineOK = false;
				}

				// Reset dynamic state so it gets refreshed with the new pipeline.
				lastStencilWriteMask = -1;
				lastStencilCompareMask = -1;
				lastStencilReference = -1;
			}
			break;
		}

		case VKRRenderCommand::VIEWPORT:
			if (fb != nullptr) {
				vkCmdSetViewport(cmd, 0, 1, &c.viewport.vp);
			} else {
				// Rendering to the backbuffer: the viewport must be rotated to match display orientation.
				const VkViewport &vp = c.viewport.vp;
				DisplayRect<float> rc{ vp.x, vp.y, vp.width, vp.height };
				RotateRectToDisplay(rc, (float)vulkan_->GetBackbufferWidth(), (float)vulkan_->GetBackbufferHeight());
				VkViewport final_vp;
				final_vp.x = rc.x;
				final_vp.y = rc.y;
				final_vp.width = rc.w;
				final_vp.height = rc.h;
				final_vp.maxDepth = vp.maxDepth;
				final_vp.minDepth = vp.minDepth;
				vkCmdSetViewport(cmd, 0, 1, &final_vp);
			}
			break;

		case VKRRenderCommand::SCISSOR:
		{
			if (fb != nullptr) {
				vkCmdSetScissor(cmd, 0, 1, &c.scissor.scissor);
			} else {
				// Rendering to backbuffer. Might need to rotate.
				const VkRect2D &rc = c.scissor.scissor;
				DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };
				RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());
				_dbg_assert_(rotated_rc.x >= 0);
				_dbg_assert_(rotated_rc.y >= 0);
				VkRect2D finalRect = VkRect2D{ { rotated_rc.x, rotated_rc.y }, { (uint32_t)rotated_rc.w, (uint32_t)rotated_rc.h} };
				vkCmdSetScissor(cmd, 0, 1, &finalRect);
			}
			break;
		}

		case VKRRenderCommand::BLEND:
		{
			float bc[4];
			Uint8x4ToFloat4(bc, c.blendColor.color);
			vkCmdSetBlendConstants(cmd, bc);
			break;
		}

		case VKRRenderCommand::PUSH_CONSTANTS:
			if (pipelineOK) {
				vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data);
			}
			break;

		case VKRRenderCommand::STENCIL:
			// Only forward stencil state that actually changed since the last command.
			if (lastStencilWriteMask != c.stencil.stencilWriteMask) {
				lastStencilWriteMask = (int)c.stencil.stencilWriteMask;
				vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilWriteMask);
			}
			if (lastStencilCompareMask != c.stencil.stencilCompareMask) {
				lastStencilCompareMask = c.stencil.stencilCompareMask;
				vkCmdSetStencilCompareMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilCompareMask);
			}
			if (lastStencilReference != c.stencil.stencilRef) {
				lastStencilReference = c.stencil.stencilRef;
				vkCmdSetStencilReference(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilRef);
			}
			break;

		case VKRRenderCommand::DRAW_INDEXED:
			if (pipelineOK) {
				VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;
				_dbg_assert_(set != VK_NULL_HANDLE);
				vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets);
				vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, VK_INDEX_TYPE_UINT16);
				VkDeviceSize voffset = c.drawIndexed.voffset;
				vkCmdBindVertexBuffers(cmd, 0, 1, &c.drawIndexed.vbuffer, &voffset);
				vkCmdDrawIndexed(cmd, c.drawIndexed.count, c.drawIndexed.instances, 0, 0, 0);
			}
			break;

		case VKRRenderCommand::DRAW:
			if (pipelineOK) {
				// NOTE(review): reads c.drawIndexed.descSetIndex in the DRAW case - presumably the
				// union members overlap so this is equivalent to c.draw.descSetIndex; confirm.
				VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;
				_dbg_assert_(set != VK_NULL_HANDLE);
				vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.draw.numUboOffsets, c.draw.uboOffsets);
				if (c.draw.vbuffer) {
					vkCmdBindVertexBuffers(cmd, 0, 1, &c.draw.vbuffer, &c.draw.voffset);
				}
				vkCmdDraw(cmd, c.draw.count, 1, c.draw.offset, 0);
			}
			break;

		case VKRRenderCommand::CLEAR:
		{
			// If we get here, we failed to merge a clear into a render pass load op. This is bad for perf.
			int numAttachments = 0;
			VkClearRect rc{};
			rc.baseArrayLayer = 0;
			rc.layerCount = 1;  // In multiview mode, 1 means to replicate to all the active layers.
			rc.rect.extent.width = (uint32_t)curWidth;
			rc.rect.extent.height = (uint32_t)curHeight;
			VkClearAttachment attachments[2]{};
			if (c.clear.clearMask & VK_IMAGE_ASPECT_COLOR_BIT) {
				VkClearAttachment &attachment = attachments[numAttachments++];
				attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
				attachment.colorAttachment = 0;
				Uint8x4ToFloat4(attachment.clearValue.color.float32, c.clear.clearColor);
			}
			if (c.clear.clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
				VkClearAttachment &attachment = attachments[numAttachments++];
				attachment.aspectMask = 0;
				if (c.clear.clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
					attachment.clearValue.depthStencil.depth = c.clear.clearZ;
					attachment.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
				}
				if (c.clear.clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
					attachment.clearValue.depthStencil.stencil = (uint32_t)c.clear.clearStencil;
					attachment.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
				}
			}
			if (numAttachments) {
				vkCmdClearAttachments(cmd, numAttachments, attachments, 1, &rc);
			}
			break;
		}

		case VKRRenderCommand::DEBUG_ANNOTATION:
			if (vulkan_->Extensions().EXT_debug_utils) {
				VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };
				labelInfo.pLabelName = c.debugAnnotation.annotation;
				vkCmdInsertDebugUtilsLabelEXT(cmd, &labelInfo);
			}
			break;

		default:
			ERROR_LOG(Log::G3D, "Unimpl queue command");
			break;
		}
	}
	vkCmdEndRenderPass(cmd);

	_dbg_assert_(recordBarrier_.empty());

	if (fb) {
		// If the desired final layout aren't the optimal layout needed next, early-transition the image.
		if (step.render.finalColorLayout != fb->color.layout) {
			recordBarrier_.TransitionColorImageAuto(&fb->color, step.render.finalColorLayout);
		}
		if (fb->depth.image && step.render.finalDepthStencilLayout != fb->depth.layout) {
			recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, step.render.finalDepthStencilLayout);
		}
	}
}

// Picks (or creates) the render pass matching the step's load/store actions, transitions the
// target images into attachment layouts, flushes recordBarrier_, and begins the render pass
// on cmd. Handles both offscreen framebuffers and the (possibly rotated/VR) backbuffer.
// Returns the render pass so the caller can compile matching pipelines against it.
VKRRenderPass *VulkanQueueRunner::PerformBindFramebufferAsRenderTarget(const VKRStep &step, VkCommandBuffer cmd) {
	VKRRenderPass *renderPass;
	int numClearVals = 0;
	VkClearValue clearVal[4]{};
	VkFramebuffer framebuf;
	int w;
	int h;

	bool hasDepth = RenderPassTypeHasDepth(step.render.renderPassType);

	VkSampleCountFlagBits sampleCount;

	// Can be used to separate the final*Layout barrier from the rest for debugging in renderdoc.
	// recordBarrier_.Flush(cmd);

	if (step.render.framebuffer) {
		_dbg_assert_(step.render.finalColorLayout != VK_IMAGE_LAYOUT_UNDEFINED);
		_dbg_assert_(step.render.finalDepthStencilLayout != VK_IMAGE_LAYOUT_UNDEFINED);

		RPKey key{
			step.render.colorLoad, step.render.depthLoad, step.render.stencilLoad,
			step.render.colorStore, step.render.depthStore, step.render.stencilStore,
		};
		renderPass = GetRenderPass(key);

		VKRFramebuffer *fb = step.render.framebuffer;
		framebuf = fb->Get(renderPass, step.render.renderPassType);
		sampleCount = fb->sampleCount;
		_dbg_assert_(framebuf != VK_NULL_HANDLE);
		w = fb->width;
		h = fb->height;

		// Mali driver on S8 (Android O) and S9 mishandles renderpasses that do just a clear
		// and then no draw calls. Memory transaction elimination gets mis-flagged or something.
		// To avoid this, we transition to GENERAL and back in this case (ARM-approved workaround).
		// See pull request #10723.
		bool maliBugWorkaround = step.render.numDraws == 0 &&
			step.render.colorLoad == VKRRenderPassLoadAction::CLEAR &&
			vulkan_->GetPhysicalDeviceProperties().properties.driverVersion == 0xaa9c4b29;
		if (maliBugWorkaround) {
			// A little suboptimal but let's go for maximum safety here.
			recordBarrier_.TransitionImage(fb->color.image, 0, 1, fb->numLayers, VK_IMAGE_ASPECT_COLOR_BIT,
				fb->color.layout, VK_IMAGE_LAYOUT_GENERAL,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
			fb->color.layout = VK_IMAGE_LAYOUT_GENERAL;
		}

		recordBarrier_.TransitionColorImageAuto(&fb->color, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);

		// If the render pass doesn't touch depth, we can avoid a layout transition of the depth buffer.
		if (fb->depth.image && RenderPassTypeHasDepth(step.render.renderPassType)) {
			recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL);
		}

		// The transition from the optimal format happens after EndRenderPass, now that we don't
		// do it as part of the renderpass itself anymore.

		if (sampleCount != VK_SAMPLE_COUNT_1_BIT) {
			// We don't initialize values for these.
			numClearVals = hasDepth ? 2 : 1;  // Skip the resolve buffers, don't need to clear those.
		}
		if (step.render.colorLoad == VKRRenderPassLoadAction::CLEAR) {
			Uint8x4ToFloat4(clearVal[numClearVals].color.float32, step.render.clearColor);
		}
		numClearVals++;
		if (hasDepth) {
			if (step.render.depthLoad == VKRRenderPassLoadAction::CLEAR || step.render.stencilLoad == VKRRenderPassLoadAction::CLEAR) {
				clearVal[numClearVals].depthStencil.depth = step.render.clearDepth;
				clearVal[numClearVals].depthStencil.stencil = step.render.clearStencil;
			}
			numClearVals++;
		}
		_dbg_assert_(numClearVals != 3);
	} else {
		// Backbuffer: always the full clear/store key.
		RPKey key{
			VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,
			VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, VKRRenderPassStoreAction::DONT_CARE,
		};
		renderPass = GetRenderPass(key);

		if (IsVREnabled()) {
			framebuf = (VkFramebuffer)BindVRFramebuffer();
		} else {
			framebuf = backbuffer_;
		}

		// Raw, rotated backbuffer size.
		w = vulkan_->GetBackbufferWidth();
		h = vulkan_->GetBackbufferHeight();

		Uint8x4ToFloat4(clearVal[0].color.float32, step.render.clearColor);
		numClearVals = hasDepth ? 2 : 1;  // We might do depth-less backbuffer in the future, though doubtful of the value.
		clearVal[1].depthStencil.depth = 0.0f;
		clearVal[1].depthStencil.stencil = 0;
		sampleCount = VK_SAMPLE_COUNT_1_BIT;
	}

	VkRenderPassBeginInfo rp_begin = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO };
	rp_begin.renderPass = renderPass->Get(vulkan_, step.render.renderPassType, sampleCount);
	rp_begin.framebuffer = framebuf;

	VkRect2D rc = step.render.renderArea;
	if (!step.render.framebuffer) {
		// Rendering to backbuffer, must rotate, just like scissors.
		DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };
		RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());

		rc.offset.x = rotated_rc.x;
		rc.offset.y = rotated_rc.y;
		rc.extent.width = rotated_rc.w;
		rc.extent.height = rotated_rc.h;
	}

	recordBarrier_.Flush(cmd);

	rp_begin.renderArea = rc;
	rp_begin.clearValueCount = numClearVals;
	rp_begin.pClearValues = numClearVals ? clearVal : nullptr;
	vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);

	return renderPass;
}

// Executes a COPY step: transitions src/dst (and, when both are multisampled, their MSAA
// surfaces) into transfer layouts, performs vkCmdCopyImage for the requested aspects, then
// transitions the MSAA surfaces back to their permanent attachment layouts.
void VulkanQueueRunner::PerformCopy(const VKRStep &step, VkCommandBuffer cmd) {
	// The barrier code doesn't handle this case. We'd need to transition to GENERAL to do an intra-image copy.
	_dbg_assert_(step.copy.src != step.copy.dst);

	VKRFramebuffer *src = step.copy.src;
	VKRFramebuffer *dst = step.copy.dst;

	int layerCount = std::min(step.copy.src->numLayers, step.copy.dst->numLayers);
	_dbg_assert_(step.copy.src->numLayers >= step.copy.dst->numLayers);

	// TODO: If dst covers exactly the whole destination, we can set up a UNDEFINED->TRANSFER_DST_OPTIMAL transition,
	// which can potentially be more efficient.

	if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		recordBarrier_.TransitionColorImageAuto(&src->color, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	}

	// We can't copy only depth or only stencil unfortunately - or can we?.
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_dbg_assert_(src->depth.image != VK_NULL_HANDLE);

		recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		if (dst->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
			recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		} else {
			// Kingdom Hearts: Subsequent copies twice to the same depth buffer without any other use.
			// Not super sure how that happens, but we need a barrier to pass sync validation.
			SetupTransferDstWriteAfterWrite(dst->depth, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, &recordBarrier_);
		}
	}

	bool multisampled = src->sampleCount != VK_SAMPLE_COUNT_1_BIT && dst->sampleCount != VK_SAMPLE_COUNT_1_BIT;
	if (multisampled) {
		// If both the targets are multisampled, copy the msaa targets too.
		// For that, we need to transition them from their normally permanent VK_*_ATTACHMENT_OPTIMAL layouts, and then back.
		if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			recordBarrier_.TransitionColorImageAuto(&src->msaaColor, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
			recordBarrier_.TransitionColorImageAuto(&dst->msaaColor, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		}
		if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			// Kingdom Hearts: Subsequent copies to the same depth buffer without any other use.
			// Not super sure how that happens, but we need a barrier to pass sync validation.
			recordBarrier_.TransitionDepthStencilImageAuto(&src->msaaDepth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
			recordBarrier_.TransitionDepthStencilImageAuto(&dst->msaaDepth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		}
	}

	recordBarrier_.Flush(cmd);

	VkImageCopy copy{};
	copy.srcOffset.x = step.copy.srcRect.offset.x;
	copy.srcOffset.y = step.copy.srcRect.offset.y;
	copy.srcOffset.z = 0;
	copy.srcSubresource.mipLevel = 0;
	copy.srcSubresource.layerCount = layerCount;
	copy.dstOffset.x = step.copy.dstPos.x;
	copy.dstOffset.y = step.copy.dstPos.y;
	copy.dstOffset.z = 0;
	copy.dstSubresource.mipLevel = 0;
	copy.dstSubresource.layerCount = layerCount;
	copy.extent.width = step.copy.srcRect.extent.width;
	copy.extent.height = step.copy.srcRect.extent.height;
	copy.extent.depth = 1;

	if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		vkCmdCopyImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &copy);

		if (multisampled) {
			vkCmdCopyImage(cmd, src->msaaColor.image, src->msaaColor.layout, dst->msaaColor.image, dst->msaaColor.layout, 1, &copy);
		}
	}
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_dbg_assert_(src->depth.image != VK_NULL_HANDLE);
		_dbg_assert_(dst->depth.image != VK_NULL_HANDLE);
		copy.srcSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		copy.dstSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		vkCmdCopyImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &copy);

		if (multisampled) {
			vkCmdCopyImage(cmd, src->msaaDepth.image, src->msaaDepth.layout, dst->msaaDepth.image, dst->msaaDepth.layout, 1, &copy);
		}
	}

	if (multisampled) {
		// Transition the MSAA surfaces back to optimal.
		if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			recordBarrier_.TransitionImage(
				src->msaaColor.image,
				0,
				1,
				src->msaaColor.numLayers,
				VK_IMAGE_ASPECT_COLOR_BIT,
				VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_READ_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
			);
			src->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
			recordBarrier_.TransitionImage(
				dst->msaaColor.image,
				0,
				1,
				dst->msaaColor.numLayers,
				VK_IMAGE_ASPECT_COLOR_BIT,
				VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
				VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_WRITE_BIT,
				VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
			);
			dst->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
		}
		if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			recordBarrier_.TransitionImage(
				src->msaaDepth.image,
				0,
				1,
				src->msaaDepth.numLayers,
				VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
				VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_READ_BIT,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
			);
			src->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
			recordBarrier_.TransitionImage(
				dst->msaaDepth.image,
				0,
				1,
				dst->msaaDepth.numLayers,
				VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
				VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
				VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
				VK_ACCESS_TRANSFER_WRITE_BIT,
				VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
				VK_PIPELINE_STAGE_TRANSFER_BIT,
				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
			);
			dst->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
		}
		// Probably not necessary.
		recordBarrier_.Flush(cmd);
	}
}

// Executes a BLIT step: transitions src/dst into transfer layouts, then vkCmdBlitImage for
// the requested aspects (color, and depth/stencil via a second blit).
void VulkanQueueRunner::PerformBlit(const VKRStep &step, VkCommandBuffer cmd) {
	// The barrier code doesn't handle this case. We'd need to transition to GENERAL to do an intra-image copy.
	_dbg_assert_(step.blit.src != step.blit.dst);

	int layerCount = std::min(step.blit.src->numLayers, step.blit.dst->numLayers);
	_dbg_assert_(step.blit.src->numLayers >= step.blit.dst->numLayers);

	VKRFramebuffer *src = step.blit.src;
	VKRFramebuffer *dst = step.blit.dst;

	// First source barriers.
	if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		recordBarrier_.TransitionColorImageAuto(&src->color, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	}

	// We can't copy only depth or only stencil unfortunately.
	if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		_assert_(src->depth.image != VK_NULL_HANDLE);
		_assert_(dst->depth.image != VK_NULL_HANDLE);
		recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	}

	recordBarrier_.Flush(cmd);

	// If any validation needs to be performed here, it should probably have been done
	// already when the blit was queued. So don't validate here.
	VkImageBlit blit{};
	blit.srcOffsets[0].x = step.blit.srcRect.offset.x;
	blit.srcOffsets[0].y = step.blit.srcRect.offset.y;
	blit.srcOffsets[0].z = 0;
	blit.srcOffsets[1].x = step.blit.srcRect.offset.x + step.blit.srcRect.extent.width;
	blit.srcOffsets[1].y = step.blit.srcRect.offset.y + step.blit.srcRect.extent.height;
	blit.srcOffsets[1].z = 1;
	blit.srcSubresource.mipLevel = 0;
	blit.srcSubresource.layerCount = layerCount;
	blit.dstOffsets[0].x = step.blit.dstRect.offset.x;
	blit.dstOffsets[0].y = step.blit.dstRect.offset.y;
	blit.dstOffsets[0].z = 0;
	blit.dstOffsets[1].x = step.blit.dstRect.offset.x + step.blit.dstRect.extent.width;
	blit.dstOffsets[1].y = step.blit.dstRect.offset.y + step.blit.dstRect.extent.height;
	blit.dstOffsets[1].z = 1;
	blit.dstSubresource.mipLevel = 0;
	blit.dstSubresource.layerCount = layerCount;

	if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		vkCmdBlitImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &blit, step.blit.filter);
	}

	// TODO: Need to check if the depth format is blittable.
	// Actually, we should probably almost always use copies rather than blits for depth buffers.
	if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		blit.srcSubresource.aspectMask = 0;
		blit.dstSubresource.aspectMask = 0;
		if (step.blit.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
			blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
			blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
		}
		if (step.blit.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
			blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
			blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
		}
		vkCmdBlitImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &blit, step.blit.filter);
	}
}

// Records a TRANSFER_DST->TRANSFER_DST write-after-write barrier for an image that is
// copied to twice in a row (needed to satisfy sync validation; see PerformCopy).
void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrierBatch *recordBarrier) {
	// NOTE(review): imageAspect, srcAccessMask and srcStageMask are computed below but never
	// used - the TransitionImage call passes `aspect` and literal masks instead. Confirm
	// whether imageAspect (which widens to depth|stencil for combined formats) was intended.
	VkImageAspectFlags imageAspect = aspect;
	VkAccessFlags srcAccessMask = 0;
	VkPipelineStageFlags srcStageMask = 0;
	if (img.format == VK_FORMAT_D16_UNORM_S8_UINT || img.format == VK_FORMAT_D24_UNORM_S8_UINT || img.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
		// Barrier must specify both for combined depth/stencil buffers.
		imageAspect = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
	} else {
		imageAspect = aspect;
	}
	_dbg_assert_(img.layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
	srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
	recordBarrier->TransitionImage(
		img.image,
		0,
		1,
		img.numLayers,
		aspect,
		VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
		VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
		VK_ACCESS_TRANSFER_WRITE_BIT,
		VK_ACCESS_TRANSFER_WRITE_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT
	);
}

// Grows (never shrinks) a cached host-readable readback buffer to at least requiredSize,
// queuing deletion of the previous buffer and recording whether the new memory is coherent.
void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {
	// Existing buffer is large enough - keep it.
	if (readback->buffer && requiredSize <= readback->bufferSize) {
		return;
	}

	if (readback->buffer) {
		// Deferred delete - the old buffer may still be referenced by in-flight frames.
		vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);
	}

	readback->bufferSize = requiredSize;

	// NOTE(review): `device` appears unused here (allocation goes through vulkan_->Allocator()).
	VkDevice device = vulkan_->GetDevice();

	VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	buf.size = readback->bufferSize;
	buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

	VmaAllocationCreateInfo allocCreateInfo{};
	allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
	VmaAllocationInfo allocInfo{};

	VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, &allocInfo);
	_assert_(res == VK_SUCCESS);

	// Non-coherent memory will need an explicit invalidate before the CPU reads it.
	const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
	readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {
	VkImage image;
	VkImageLayout copyLayout;
	// Special case for backbuffer readbacks.
	if (step.readback.src == nullptr) {
		// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,
		// and then back into it.
		// Regarding layers, backbuffer currently only has one layer.
		recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,
			VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
			0, VK_ACCESS_TRANSFER_READ_BIT,
			VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
		copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
		image = backbufferImage_;
	} else {
		VKRImage *srcImage;
		if (step.readback.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			srcImage = &step.readback.src->color;
			recordBarrier_.TransitionColorImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
		} else if (step.readback.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			srcImage = &step.readback.src->depth;
			recordBarrier_.TransitionDepthStencilImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
			_dbg_assert_(srcImage->image != VK_NULL_HANDLE);
		} else {
			_dbg_assert_msg_(false, "No image aspect to readback?");
			return;
		}
		image = srcImage->image;
		copyLayout = srcImage->layout;
	}

	recordBarrier_.Flush(cmd);

	// TODO: Handle different readback formats!
	u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;

	CachedReadback
*cached = nullptr;16841685if (step.readback.delayed) {1686ReadbackKey key;1687key.framebuf = step.readback.src;1688key.width = step.readback.srcRect.extent.width;1689key.height = step.readback.srcRect.extent.height;16901691// See if there's already a buffer we can reuse1692if (!frameData.readbacks_.Get(key, &cached)) {1693cached = new CachedReadback();1694cached->bufferSize = 0;1695frameData.readbacks_.Insert(key, cached);1696}1697} else {1698cached = &syncReadback_;1699}17001701ResizeReadbackBuffer(cached, readbackSizeInBytes);17021703VkBufferImageCopy region{};1704region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };1705region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };1706region.imageSubresource.aspectMask = step.readback.aspectMask;1707region.imageSubresource.layerCount = 1;1708region.bufferOffset = 0;1709region.bufferRowLength = step.readback.srcRect.extent.width;1710region.bufferImageHeight = step.readback.srcRect.extent.height;17111712vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, ®ion);17131714// NOTE: Can't read the buffer using the CPU here - need to sync first.17151716// If we copied from the backbuffer, transition it back.1717if (step.readback.src == nullptr) {1718// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,1719// and then back into it.1720// Regarding layers, backbuffer currently only has one layer.1721recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,1722VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,1723VK_ACCESS_TRANSFER_READ_BIT, 0,1724VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);1725recordBarrier_.Flush(cmd); // probably not needed1726copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;1727}1728}17291730void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffer 
void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffer cmd) {
	// Records commands to copy one mip level of a texture image into the
	// synchronous readback buffer, then restores the image for shader sampling.
	// NOTE(review): assumes the image is currently in SHADER_READ_ONLY_OPTIMAL -
	// the hardcoded 'layout' below implies that, but confirm against callers.
	// TODO: Clean this up - just reusing `SetupTransitionToTransferSrc`.
	VkImageLayout layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
	recordBarrier_.TransitionColorImageAuto(step.readback_image.image, &layout, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 1, 1);
	recordBarrier_.Flush(cmd);

	// Sized for 4 bytes/pixel (see the readback format TODOs elsewhere in this file).
	ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);

	VkBufferImageCopy region{};
	region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
	region.imageExtent = { step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height, 1 };
	region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
	region.imageSubresource.layerCount = 1;
	region.imageSubresource.mipLevel = step.readback_image.mipLevel;
	region.bufferOffset = 0;
	region.bufferRowLength = step.readback_image.srcRect.extent.width;
	region.bufferImageHeight = step.readback_image.srcRect.extent.height;
	vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, &region);

	// Now transfer it back to a texture.
	recordBarrier_.TransitionImage(step.readback_image.image, 0, 1, 1,  // I don't think we have any multilayer cases for regular textures. Above in PerformReadback, though..
		VK_IMAGE_ASPECT_COLOR_BIT,
		VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
		VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT,
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
	recordBarrier_.Flush(cmd); // probably not needed

	// NOTE: Can't read the buffer using the CPU here - need to sync first.
	// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}
Above in PerformReadback, though..1751VK_IMAGE_ASPECT_COLOR_BIT,1752VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,1753VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT,1754VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);1755recordBarrier_.Flush(cmd); // probably not needed17561757// NOTE: Can't read the buffer using the CPU here - need to sync first.1758// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.1759}17601761bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {1762CachedReadback *readback = &syncReadback_;17631764// Look up in readback cache.1765if (src) {1766ReadbackKey key;1767key.framebuf = src;1768key.width = width;1769key.height = height;1770CachedReadback *cached;1771if (frameData.readbacks_.Get(key, &cached)) {1772readback = cached;1773} else {1774// Didn't have a cached image ready yet1775return false;1776}1777}17781779if (!readback->buffer)1780return false; // Didn't find anything in cache, or something has gone really wrong.17811782// Read back to the requested address in ram from buffer.1783void *mappedData;1784const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);1785VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);17861787if (res != VK_SUCCESS) {1788ERROR_LOG(Log::G3D, "CopyReadbackBuffer: vkMapMemory failed! 
result=%d", (int)res);1789return false;1790}17911792if (!readback->isCoherent) {1793vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);1794}17951796// TODO: Perform these conversions in a compute shader on the GPU.1797if (srcFormat == Draw::DataFormat::R8G8B8A8_UNORM) {1798ConvertFromRGBA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);1799} else if (srcFormat == Draw::DataFormat::B8G8R8A8_UNORM) {1800ConvertFromBGRA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);1801} else if (srcFormat == destFormat) {1802// Can just memcpy when it matches no matter the format!1803uint8_t *dst = pixels;1804const uint8_t *src = (const uint8_t *)mappedData;1805for (int y = 0; y < height; ++y) {1806memcpy(dst, src, width * srcPixelSize);1807src += width * srcPixelSize;1808dst += pixelStride * srcPixelSize;1809}1810} else if (destFormat == Draw::DataFormat::D32F) {1811ConvertToD32F(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);1812} else if (destFormat == Draw::DataFormat::D16) {1813ConvertToD16(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);1814} else {1815// TODO: Maybe a depth conversion or something?1816ERROR_LOG(Log::G3D, "CopyReadbackBuffer: Unknown format");1817_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);1818}18191820vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);1821return true;1822}18231824const char *VKRRenderCommandToString(VKRRenderCommand cmd) {1825const char * const str[] = {1826"REMOVED",1827"BIND_GRAPHICS_PIPELINE", // async1828"STENCIL",1829"BLEND",1830"VIEWPORT",1831"SCISSOR",1832"CLEAR",1833"DRAW",1834"DRAW_INDEXED",1835"PUSH_CONSTANTS",1836"DEBUG_ANNOTATION",1837};1838if ((int)cmd < ARRAY_SIZE(str)) {1839return str[(int)cmd];1840} else {1841return "N/A";1842}1843}184418451846