CoCalc -- VulkanRenderManager.cpp

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/GPU/Vulkan/VulkanRenderManager.cpp
Views: ¹⁴⁰¹
1
#include <algorithm>
2
#include <cstdint>
3

4
#include <map>
5
#include <sstream>
6

7
#include "Common/Log.h"
8
#include "Common/StringUtils.h"
9
#include "Common/TimeUtil.h"
10

11
#include "Common/GPU/Vulkan/VulkanAlloc.h"
12
#include "Common/GPU/Vulkan/VulkanContext.h"
13
#include "Common/GPU/Vulkan/VulkanRenderManager.h"
14

15
#include "Common/LogReporting.h"
16
#include "Common/Thread/ThreadUtil.h"
17
#include "Common/VR/PPSSPPVR.h"
18

19
#if 0 // def _DEBUG
20
#define VLOG(...) NOTICE_LOG(Log::G3D, __VA_ARGS__)
21
#else
22
#define VLOG(...)
23
#endif
24

25
#ifndef UINT64_MAX
26
#define UINT64_MAX 0xFFFFFFFFFFFFFFFFULL
27
#endif
28

29
using namespace PPSSPP_VK;
30

31
// renderPass is an example of the "compatibility class" or RenderPassType type.
32
bool VKRGraphicsPipeline::Create(VulkanContext *vulkan, VkRenderPass compatibleRenderPass, RenderPassType rpType, VkSampleCountFlagBits sampleCount, double scheduleTime, int countToCompile) {
33
	// Good torture test to test the shutdown-while-precompiling-shaders issue on PC where it's normally
34
	// hard to catch because shaders compile so fast.
35
	// sleep_ms(200);
36

37
	bool multisample = RenderPassTypeHasMultisample(rpType);
38
	if (multisample) {
39
		if (sampleCount_ != VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM) {
40
			_assert_(sampleCount == sampleCount_);
41
		} else {
42
			sampleCount_ = sampleCount;
43
		}
44
	}
45

46
	// Sanity check.
47
	// Seen in crash reports from PowerVR GE8320, presumably we failed creating some shader modules.
48
	if (!desc->vertexShader || !desc->fragmentShader) {
49
		ERROR_LOG(Log::G3D, "Failed creating graphics pipeline - missing vs/fs shader module pointers!");
50
		pipeline[(size_t)rpType]->Post(VK_NULL_HANDLE);
51
		return false;
52
	}
53

54
	// Fill in the last part of the desc since now it's time to block.
55
	VkShaderModule vs = desc->vertexShader->BlockUntilReady();
56
	VkShaderModule fs = desc->fragmentShader->BlockUntilReady();
57
	VkShaderModule gs = desc->geometryShader ? desc->geometryShader->BlockUntilReady() : VK_NULL_HANDLE;
58

59
	if (!vs || !fs || (!gs && desc->geometryShader)) {
60
		ERROR_LOG(Log::G3D, "Failed creating graphics pipeline - missing shader modules");
61
		pipeline[(size_t)rpType]->Post(VK_NULL_HANDLE);
62
		return false;
63
	}
64

65
	if (!compatibleRenderPass) {
66
		ERROR_LOG(Log::G3D, "Failed creating graphics pipeline - compatible render pass was nullptr");
67
		pipeline[(size_t)rpType]->Post(VK_NULL_HANDLE);
68
		return false;
69
	}
70

71
	uint32_t stageCount = 2;
72
	VkPipelineShaderStageCreateInfo ss[3]{};
73
	ss[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
74
	ss[0].stage = VK_SHADER_STAGE_VERTEX_BIT;
75
	ss[0].pSpecializationInfo = nullptr;
76
	ss[0].module = vs;
77
	ss[0].pName = "main";
78
	ss[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
79
	ss[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT;
80
	ss[1].pSpecializationInfo = nullptr;
81
	ss[1].module = fs;
82
	ss[1].pName = "main";
83
	if (gs) {
84
		stageCount++;
85
		ss[2].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
86
		ss[2].stage = VK_SHADER_STAGE_GEOMETRY_BIT;
87
		ss[2].pSpecializationInfo = nullptr;
88
		ss[2].module = gs;
89
		ss[2].pName = "main";
90
	}
91

92
	VkGraphicsPipelineCreateInfo pipe{ VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO };
93
	pipe.pStages = ss;
94
	pipe.stageCount = stageCount;
95
	pipe.renderPass = compatibleRenderPass;
96
	pipe.basePipelineIndex = 0;
97
	pipe.pColorBlendState = &desc->cbs;
98
	pipe.pDepthStencilState = &desc->dss;
99
	pipe.pRasterizationState = &desc->rs;
100

101
	VkPipelineMultisampleStateCreateInfo ms{ VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO };
102
	ms.rasterizationSamples = multisample ? sampleCount : VK_SAMPLE_COUNT_1_BIT;
103
	if (multisample && (flags_ & PipelineFlags::USES_DISCARD)) {
104
		// Extreme quality
105
		ms.sampleShadingEnable = true;
106
		ms.minSampleShading = 1.0f;
107
	}
108

109
	VkPipelineInputAssemblyStateCreateInfo inputAssembly{ VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO };
110
	inputAssembly.topology = desc->topology;
111

112
	// We will use dynamic viewport state.
113
	pipe.pVertexInputState = &desc->vis;
114
	pipe.pViewportState = &desc->views;
115
	pipe.pTessellationState = nullptr;
116
	pipe.pDynamicState = &desc->ds;
117
	pipe.pInputAssemblyState = &inputAssembly;
118
	pipe.pMultisampleState = &ms;
119
	pipe.layout = desc->pipelineLayout->pipelineLayout;
120
	pipe.basePipelineHandle = VK_NULL_HANDLE;
121
	pipe.basePipelineIndex = 0;
122
	pipe.subpass = 0;
123

124
	double start = time_now_d();
125
	VkPipeline vkpipeline;
126
	VkResult result = vkCreateGraphicsPipelines(vulkan->GetDevice(), desc->pipelineCache, 1, &pipe, nullptr, &vkpipeline);
127

128
	double now = time_now_d();
129
	double taken_ms_since_scheduling = (now - scheduleTime) * 1000.0;
130
	double taken_ms = (now - start) * 1000.0;
131

132
	if (taken_ms < 0.1) {
133
		DEBUG_LOG(Log::G3D, "Pipeline (x/%d) time on %s: %0.2f ms, %0.2f ms since scheduling (fast) rpType: %04x sampleBits: %d (%s)",
134
			countToCompile, GetCurrentThreadName(), taken_ms, taken_ms_since_scheduling, (u32)rpType, (u32)sampleCount, tag_.c_str());
135
	} else {
136
		INFO_LOG(Log::G3D, "Pipeline (x/%d) time on %s: %0.2f ms, %0.2f ms since scheduling  rpType: %04x sampleBits: %d (%s)",
137
			countToCompile, GetCurrentThreadName(), taken_ms, taken_ms_since_scheduling, (u32)rpType, (u32)sampleCount, tag_.c_str());
138
	}
139

140
	bool success = true;
141
	if (result == VK_INCOMPLETE) {
142
		// Bad (disallowed by spec) return value seen on Adreno in Burnout :(  Try to ignore?
143
		// Would really like to log more here, we could probably attach more info to desc.
144
		//
145
		// At least create a null placeholder to avoid creating over and over if something is broken.
146
		pipeline[(size_t)rpType]->Post(VK_NULL_HANDLE);
147
		ERROR_LOG(Log::G3D, "Failed creating graphics pipeline! VK_INCOMPLETE");
148
		LogCreationFailure();
149
		success = false;
150
	} else if (result != VK_SUCCESS) {
151
		pipeline[(size_t)rpType]->Post(VK_NULL_HANDLE);
152
		ERROR_LOG(Log::G3D, "Failed creating graphics pipeline! result='%s'", VulkanResultToString(result));
153
		LogCreationFailure();
154
		success = false;
155
	} else {
156
		// Success!
157
		if (!tag_.empty()) {
158
			vulkan->SetDebugName(vkpipeline, VK_OBJECT_TYPE_PIPELINE, tag_.c_str());
159
		}
160
		pipeline[(size_t)rpType]->Post(vkpipeline);
161
	}
162

163
	return success;
164
}
165

166
// Queues every compiled variant (or, with msaaOnly, only the multisampled ones) for deferred
// deletion through the context's delete list, clears the slots, and resets the recorded
// sample count. Blocks on variants that are still compiling.
void VKRGraphicsPipeline::DestroyVariants(VulkanContext *vulkan, bool msaaOnly) {
	for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
		const bool isMultisampleVariant = (i & (int)RenderPassType::MULTISAMPLE) != 0;
		if (!pipeline[i] || (msaaOnly && !isMultisampleVariant)) {
			continue;
		}

		// The handle can be null here, if the variant failed to compile before.
		const VkPipeline handle = pipeline[i]->BlockUntilReady();
		if (handle) {
			vulkan->Delete().QueueDeletePipeline(handle);
		}
		// NOTE(review): the promise object itself is not deleted here, unlike in
		// DestroyVariantsInstant - confirm whether that is intentional.
		pipeline[i] = nullptr;
	}
	sampleCount_ = VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM;
}
182

183
// Destroys all variants immediately (no deferred deletion) and frees their promises.
// Blocks on any variant that is still compiling.
void VKRGraphicsPipeline::DestroyVariantsInstant(VkDevice device) {
	for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
		if (!pipeline[i]) {
			continue;
		}
		const VkPipeline handle = pipeline[i]->BlockUntilReady();
		vkDestroyPipeline(device, handle, nullptr);
		delete pipeline[i];
		pipeline[i] = nullptr;
	}
}
192

193
// Runs from the callback queued in QueueForDeletion. All variants must already have been
// destroyed by then, which is asserted here. Drops our reference on the description.
VKRGraphicsPipeline::~VKRGraphicsPipeline() {
	for (size_t variant = 0; variant < (size_t)RenderPassType::TYPE_COUNT; ++variant) {
		_assert_(!pipeline[variant]);
	}
	if (desc) {
		desc->Release();
	}
}
202

203
void VKRGraphicsPipeline::BlockUntilCompiled() {
204
	for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
205
		if (pipeline[i]) {
206
			pipeline[i]->BlockUntilReady();
207
		}
208
	}
209
}
210

211
// Schedules this object for destruction via the context's delete queue. The variants can't
// be destroyed right here since the pipeline still lives for a while; the queued callback
// destroys them and then deletes the object itself.
void VKRGraphicsPipeline::QueueForDeletion(VulkanContext *vulkan) {
	vulkan->Delete().QueueCallback([](VulkanContext *context, void *userdata) {
		VKRGraphicsPipeline *self = (VKRGraphicsPipeline *)userdata;
		self->DestroyVariantsInstant(context->GetDevice());
		delete self;
	}, this);
}
219

220
// Returns a bitmask with bit N set for every RenderPassType index N that has a variant
// (compiled or still pending) in this pipeline.
u32 VKRGraphicsPipeline::GetVariantsBitmask() const {
	u32 variants = 0;
	for (size_t bit = 0; bit < (size_t)RenderPassType::TYPE_COUNT; ++bit) {
		if (!pipeline[bit]) {
			continue;
		}
		variants |= 1 << bit;
	}
	return variants;
}
229

230
void VKRGraphicsPipeline::LogCreationFailure() const {
231
	ERROR_LOG(Log::G3D, "vs: %s\n[END VS]", desc->vertexShaderSource.c_str());
232
	ERROR_LOG(Log::G3D, "fs: %s\n[END FS]", desc->fragmentShaderSource.c_str());
233
	if (desc->geometryShader) {
234
		ERROR_LOG(Log::G3D, "gs: %s\n[END GS]", desc->geometryShaderSource.c_str());
235
	}
236
	// TODO: Maybe log various other state?
237
	ERROR_LOG(Log::G3D, "======== END OF PIPELINE ==========");
238
}
239

240
struct SinglePipelineTask {
241
	VKRGraphicsPipeline *pipeline;
242
	VkRenderPass compatibleRenderPass;
243
	RenderPassType rpType;
244
	VkSampleCountFlagBits sampleCount;
245
	double scheduleTime;
246
	int countToCompile;
247
};
248

249
class CreateMultiPipelinesTask : public Task {
250
public:
251
	CreateMultiPipelinesTask(VulkanContext *vulkan, std::vector<SinglePipelineTask> tasks) : vulkan_(vulkan), tasks_(tasks) {
252
		tasksInFlight_.fetch_add(1);
253
	}
254
	~CreateMultiPipelinesTask() {}
255

256
	TaskType Type() const override {
257
		return TaskType::CPU_COMPUTE;
258
	}
259

260
	TaskPriority Priority() const override {
261
		return TaskPriority::HIGH;
262
	}
263

264
	void Run() override {
265
		for (auto &task : tasks_) {
266
			task.pipeline->Create(vulkan_, task.compatibleRenderPass, task.rpType, task.sampleCount, task.scheduleTime, task.countToCompile);
267
		}
268
		tasksInFlight_.fetch_sub(1);
269
	}
270

271
	VulkanContext *vulkan_;
272
	std::vector<SinglePipelineTask> tasks_;
273

274
	// Use during shutdown to make sure there aren't any leftover tasks sitting queued.
275
	// Could probably be done more elegantly. Like waiting for all tasks of a type, or saving pointers to them, or something...
276
	static void WaitForAll();
277
	static std::atomic<int> tasksInFlight_;
278
};
279

280
void CreateMultiPipelinesTask::WaitForAll() {
281
	while (tasksInFlight_.load() > 0) {
282
		sleep_ms(2);
283
	}
284
}
285

286
std::atomic<int> CreateMultiPipelinesTask::tasksInFlight_;
287

288
// Sets up the shared and per-frame data for the number of inflight frames the context reports
// at construction time, and creates the queue runner's device objects.
// No threads are started here; that happens in CreateBackbuffers.
VulkanRenderManager::VulkanRenderManager(VulkanContext *vulkan, bool useThread, HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory)
	: vulkan_(vulkan), queueRunner_(vulkan),
	initTimeMs_("initTimeMs"),
	totalGPUTimeMs_("totalGPUTimeMs"),
	renderCPUTimeMs_("renderCPUTimeMs"),
	descUpdateTimeMs_("descUpdateCPUTimeMs"),
	useRenderThread_(useThread),
	frameTimeHistory_(frameTimeHistory)
{
	// Remembered so the destructor tears down exactly the frames initialized here.
	inflightFramesAtStart_ = vulkan_->GetInflightFrames();

	// Present timing is experimental and disabled for now.
	measurePresentTime_ = false;

	frameDataShared_.Init(vulkan, useThread, measurePresentTime_);

	for (int frame = 0; frame < inflightFramesAtStart_; ++frame) {
		frameData_[frame].Init(vulkan, frame);
	}

	queueRunner_.CreateDeviceObjects();
}
310

311
bool VulkanRenderManager::CreateBackbuffers() {
312
	if (!vulkan_->GetSwapchain()) {
313
		ERROR_LOG(Log::G3D, "No swapchain - can't create backbuffers");
314
		return false;
315
	}
316

317
	VkCommandBuffer cmdInit = GetInitCmd();
318

319
	if (!queueRunner_.CreateSwapchain(cmdInit, &postInitBarrier_)) {
320
		return false;
321
	}
322

323
	curWidthRaw_ = -1;
324
	curHeightRaw_ = -1;
325

326
	if (HasBackbuffers()) {
327
		VLOG("Backbuffers Created");
328
	}
329

330
	if (newInflightFrames_ != -1) {
331
		INFO_LOG(Log::G3D, "Updating inflight frames to %d", newInflightFrames_);
332
		vulkan_->UpdateInflightFrames(newInflightFrames_);
333
		newInflightFrames_ = -1;
334
	}
335

336
	outOfDateFrames_ = 0;
337

338
	for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
339
		auto &frameData = frameData_[i];
340
		frameData.readyForFence = true;  // Just in case.
341
	}
342

343
	// Start the thread(s).
344
	if (HasBackbuffers()) {
345
		StartThreads();
346
	}
347
	return true;
348
}
349

350
void VulkanRenderManager::StartThreads() {
351
	{
352
		std::unique_lock<std::mutex> lock(compileMutex_);
353
		_assert_(compileQueue_.empty());
354
	}
355

356
	runCompileThread_ = true;  // For controlling the compiler thread's exit
357

358
	if (useRenderThread_) {
359
		INFO_LOG(Log::G3D, "Starting Vulkan submission thread");
360
		renderThread_ = std::thread(&VulkanRenderManager::RenderThreadFunc, this);
361
	}
362
	INFO_LOG(Log::G3D, "Starting Vulkan compiler thread");
363
	compileThread_ = std::thread(&VulkanRenderManager::CompileThreadFunc, this);
364

365
	if (measurePresentTime_ && vulkan_->Extensions().KHR_present_wait && vulkan_->GetPresentMode() == VK_PRESENT_MODE_FIFO_KHR) {
366
		INFO_LOG(Log::G3D, "Starting Vulkan present wait thread");
367
		presentWaitThread_ = std::thread(&VulkanRenderManager::PresentWaitThreadFunc, this);
368
	}
369
}
370

371
// Called from main thread.
372
void VulkanRenderManager::StopThreads() {
373
	// Not sure this is a sensible check - should be ok even if not.
374
	// _dbg_assert_(steps_.empty());
375

376
	if (useRenderThread_) {
377
		_dbg_assert_(renderThread_.joinable());
378
		// Tell the render thread to quit when it's done.
379
		VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::EXIT);
380
		task->frame = vulkan_->GetCurFrame();
381
		{
382
			std::unique_lock<std::mutex> lock(pushMutex_);
383
			renderThreadQueue_.push(task);
384
		}
385
		pushCondVar_.notify_one();
386
		// Once the render thread encounters the above exit task, it'll exit.
387
		renderThread_.join();
388
		INFO_LOG(Log::G3D, "Vulkan submission thread joined. Frame=%d", vulkan_->GetCurFrame());
389
	}
390

391
	for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
392
		auto &frameData = frameData_[i];
393
		// Zero the queries so we don't try to pull them later.
394
		frameData.profile.timestampDescriptions.clear();
395
	}
396

397
	{
398
		std::unique_lock<std::mutex> lock(compileMutex_);
399
		runCompileThread_ = false;  // Compiler and present thread both look at this bool.
400
		_assert_(compileThread_.joinable());
401
		compileCond_.notify_one();
402
	}
403
	compileThread_.join();
404

405
	if (presentWaitThread_.joinable()) {
406
		presentWaitThread_.join();
407
	}
408

409
	INFO_LOG(Log::G3D, "Vulkan compiler thread joined. Now wait for any straggling compile tasks.");
410
	CreateMultiPipelinesTask::WaitForAll();
411

412
	{
413
		std::unique_lock<std::mutex> lock(compileMutex_);
414
		_assert_(compileQueue_.empty());
415
	}
416
}
417

418
void VulkanRenderManager::DestroyBackbuffers() {
419
	StopThreads();
420
	vulkan_->WaitUntilQueueIdle();
421

422
	queueRunner_.DestroyBackBuffers();
423
}
424

425
void VulkanRenderManager::CheckNothingPending() {
426
	_assert_(pipelinesToCheck_.empty());
427
	{
428
		std::unique_lock<std::mutex> lock(compileMutex_);
429
		_assert_(compileQueue_.empty());
430
	}
431
}
432

433
// Expects the threads to have been stopped already (StopThreads via DestroyBackbuffers).
// Waits for the queue to go idle, then destroys the shared and per-frame data and the queue
// runner's device objects.
VulkanRenderManager::~VulkanRenderManager() {
	INFO_LOG(Log::G3D, "VulkanRenderManager destructor");

	{
		std::lock_guard<std::mutex> guard(compileMutex_);
		_assert_(compileQueue_.empty());
	}

	if (useRenderThread_) {
		_dbg_assert_(!renderThread_.joinable());
	}

	// StopThread should already have been called from DestroyBackbuffers.
	_dbg_assert_(!runCompileThread_);

	vulkan_->WaitUntilQueueIdle();

	_dbg_assert_(pipelineLayouts_.empty());

	VkDevice device = vulkan_->GetDevice();
	frameDataShared_.Destroy(vulkan_);
	// Destroy exactly the frames that the constructor initialized.
	for (int frame = 0; frame < inflightFramesAtStart_; ++frame) {
		frameData_[frame].Destroy(vulkan_);
	}
	queueRunner_.DestroyDeviceObjects();
}
458

459
void VulkanRenderManager::CompileThreadFunc() {
460
	SetCurrentThreadName("ShaderCompile");
461
	while (true) {
462
		bool exitAfterCompile = false;
463
		std::vector<CompileQueueEntry> toCompile;
464
		{
465
			std::unique_lock<std::mutex> lock(compileMutex_);
466
			while (compileQueue_.empty() && runCompileThread_) {
467
				compileCond_.wait(lock);
468
			}
469
			toCompile = std::move(compileQueue_);
470
			compileQueue_.clear();
471
			if (!runCompileThread_) {
472
				exitAfterCompile = true;
473
			}
474
		}
475

476
		int countToCompile = (int)toCompile.size();
477

478
		// Here we sort the pending pipelines by vertex and fragment shaders,
479
		std::map<std::pair<Promise<VkShaderModule> *, Promise<VkShaderModule> *>, std::vector<SinglePipelineTask>> map;
480

481
		double scheduleTime = time_now_d();
482

483
		// Here we sort pending graphics pipelines by vertex and fragment shaders, and split up further.
484
		// Those with the same pairs of shaders should be on the same thread, at least on NVIDIA.
485
		// I don't think PowerVR cares though, it doesn't seem to reuse information between the compiles,
486
		// so we might want a different splitting algorithm there.
487
		for (auto &entry : toCompile) {
488
			switch (entry.type) {
489
			case CompileQueueEntry::Type::GRAPHICS:
490
			{
491
				map[std::make_pair(entry.graphics->desc->vertexShader, entry.graphics->desc->fragmentShader)].push_back(
492
					SinglePipelineTask{
493
						entry.graphics,
494
						entry.compatibleRenderPass,
495
						entry.renderPassType,
496
						entry.sampleCount,
497
						scheduleTime,    // these two are for logging purposes.
498
						countToCompile,
499
					}
500
				);
501
				break;
502
			}
503
			}
504
		}
505

506
		for (auto iter : map) {
507
			auto &shaders = iter.first;
508
			auto &entries = iter.second;
509

510
			// NOTICE_LOG(Log::G3D, "For this shader pair, we have %d pipelines to create", (int)entries.size());
511

512
			Task *task = new CreateMultiPipelinesTask(vulkan_, entries);
513
			g_threadManager.EnqueueTask(task);
514
		}
515

516
		if (exitAfterCompile) {
517
			break;
518
		}
519

520
		// Hold off just a bit before we check again, to allow bunches of pipelines to collect.
521
		sleep_ms(1);
522
	}
523

524
	std::unique_lock<std::mutex> lock(compileMutex_);
525
	_assert_(compileQueue_.empty());
526
}
527

528
void VulkanRenderManager::RenderThreadFunc() {
529
	SetCurrentThreadName("VulkanRenderMan");
530
	while (true) {
531
		_dbg_assert_(useRenderThread_);
532

533
		// Pop a task of the queue and execute it.
534
		VKRRenderThreadTask *task = nullptr;
535
		{
536
			std::unique_lock<std::mutex> lock(pushMutex_);
537
			while (renderThreadQueue_.empty()) {
538
				pushCondVar_.wait(lock);
539
			}
540
			task = renderThreadQueue_.front();
541
			renderThreadQueue_.pop();
542
		}
543

544
		// Oh, we got a task! We can now have pushMutex_ unlocked, allowing the host to
545
		// push more work when it feels like it, and just start working.
546
		if (task->runType == VKRRunType::EXIT) {
547
			// Oh, host wanted out. Let's leave.
548
			delete task;
549
			// In this case, there should be no more tasks.
550
			break;
551
		}
552

553
		Run(*task);
554
		delete task;
555
	}
556

557
	// Wait for the device to be done with everything, before tearing stuff down.
558
	// TODO: Do we really need this? It's probably a good idea, though.
559
	vkDeviceWaitIdle(vulkan_->GetDevice());
560
	VLOG("PULL: Quitting");
561
}
562

563
void VulkanRenderManager::PresentWaitThreadFunc() {
564
	SetCurrentThreadName("PresentWait");
565

566
#if !PPSSPP_PLATFORM(IOS_APP_STORE)
567
	_dbg_assert_(vkWaitForPresentKHR != nullptr);
568

569
	uint64_t waitedId = frameIdGen_;
570
	while (runCompileThread_) {
571
		const uint64_t timeout = 1000000000ULL;  // 1 sec
572
		if (VK_SUCCESS == vkWaitForPresentKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), waitedId, timeout)) {
573
			frameTimeHistory_[waitedId].actualPresent = time_now_d();
574
			frameTimeHistory_[waitedId].waitCount++;
575
			waitedId++;
576
		} else {
577
			// We caught up somehow, which is a bad sign (we should have blocked, right?). Maybe we should break out of the loop?
578
			sleep_ms(1);
579
			frameTimeHistory_[waitedId].waitCount++;
580
		}
581
		_dbg_assert_(waitedId <= frameIdGen_);
582
	}
583
#endif
584

585
	INFO_LOG(Log::G3D, "Leaving PresentWaitThreadFunc()");
586
}
587

588
void VulkanRenderManager::PollPresentTiming() {
589
	// For VK_GOOGLE_display_timing, we need to poll.
590

591
	// Poll for information about completed frames.
592
	// NOTE: We seem to get the information pretty late! Like after 6 frames, which is quite weird.
593
	// Tested on POCO F4.
594
	// TODO: Getting validation errors that this should be called from the thread doing the presenting.
595
	// Probably a fair point. For now, we turn it off.
596
	if (measurePresentTime_ && vulkan_->Extensions().GOOGLE_display_timing) {
597
		uint32_t count = 0;
598
		vkGetPastPresentationTimingGOOGLE(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &count, nullptr);
599
		if (count > 0) {
600
			VkPastPresentationTimingGOOGLE *timings = new VkPastPresentationTimingGOOGLE[count];
601
			vkGetPastPresentationTimingGOOGLE(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &count, timings);
602
			for (uint32_t i = 0; i < count; i++) {
603
				uint64_t presentId = timings[i].presentID;
604
				frameTimeHistory_[presentId].actualPresent = from_time_raw(timings[i].actualPresentTime);
605
				frameTimeHistory_[presentId].desiredPresentTime = from_time_raw(timings[i].desiredPresentTime);
606
				frameTimeHistory_[presentId].earliestPresentTime = from_time_raw(timings[i].earliestPresentTime);
607
				double presentMargin = from_time_raw_relative(timings[i].presentMargin);
608
				frameTimeHistory_[presentId].presentMargin = presentMargin;
609
			}
610
			delete[] timings;
611
		}
612
	}
613
}
614

615
void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfiler) {
616
	double frameBeginTime = time_now_d()
617
	VLOG("BeginFrame");
618
	VkDevice device = vulkan_->GetDevice();
619

620
	int curFrame = vulkan_->GetCurFrame();
621
	FrameData &frameData = frameData_[curFrame];
622
	VLOG("PUSH: Fencing %d", curFrame);
623

624
	// Makes sure the submission from the previous time around has happened. Otherwise
625
	// we are not allowed to wait from another thread here..
626
	if (useRenderThread_) {
627
		std::unique_lock<std::mutex> lock(frameData.fenceMutex);
628
		while (!frameData.readyForFence) {
629
			frameData.fenceCondVar.wait(lock);
630
		}
631
		frameData.readyForFence = false;
632
	}
633

634
	// This must be the very first Vulkan call we do in a new frame.
635
	// Makes sure the very last command buffer from the frame before the previous has been fully executed.
636
	if (vkWaitForFences(device, 1, &frameData.fence, true, UINT64_MAX) == VK_ERROR_DEVICE_LOST) {
637
		_assert_msg_(false, "Device lost in vkWaitForFences");
638
	}
639
	vkResetFences(device, 1, &frameData.fence);
640

641
	uint64_t frameId = frameIdGen_++;
642

643
	PollPresentTiming();
644

645
	ResetDescriptorLists(curFrame);
646

647
	int validBits = vulkan_->GetQueueFamilyProperties(vulkan_->GetGraphicsQueueFamilyIndex()).timestampValidBits;
648

649
	FrameTimeData &frameTimeData = frameTimeHistory_.Add(frameId);
650
	frameTimeData.frameId = frameId;
651
	frameTimeData.frameBegin = frameBeginTime;
652
	frameTimeData.afterFenceWait = time_now_d();
653

654
	// Can't set this until after the fence.
655
	frameData.profile.enabled = enableProfiling;
656
	frameData.profile.timestampsEnabled = enableProfiling && validBits > 0;
657
	frameData.frameId = frameId;
658

659
	uint64_t queryResults[MAX_TIMESTAMP_QUERIES];
660

661
	if (enableProfiling) {
662
		// Pull the profiling results from last time and produce a summary!
663
		if (!frameData.profile.timestampDescriptions.empty() && frameData.profile.timestampsEnabled) {
664
			int numQueries = (int)frameData.profile.timestampDescriptions.size();
665
			VkResult res = vkGetQueryPoolResults(
666
				vulkan_->GetDevice(),
667
				frameData.profile.queryPool, 0, numQueries, sizeof(uint64_t) * numQueries, &queryResults[0], sizeof(uint64_t),
668
				VK_QUERY_RESULT_64_BIT);
669
			if (res == VK_SUCCESS) {
670
				double timestampConversionFactor = (double)vulkan_->GetPhysicalDeviceProperties().properties.limits.timestampPeriod * (1.0 / 1000000.0);
671
				uint64_t timestampDiffMask = validBits == 64 ? 0xFFFFFFFFFFFFFFFFULL : ((1ULL << validBits) - 1);
672
				std::stringstream str;
673

674
				char line[256];
675
				totalGPUTimeMs_.Update(((double)((queryResults[numQueries - 1] - queryResults[0]) & timestampDiffMask) * timestampConversionFactor));
676
				totalGPUTimeMs_.Format(line, sizeof(line));
677
				str << line;
678
				renderCPUTimeMs_.Update((frameData.profile.cpuEndTime - frameData.profile.cpuStartTime) * 1000.0);
679
				renderCPUTimeMs_.Format(line, sizeof(line));
680
				str << line;
681
				descUpdateTimeMs_.Update(frameData.profile.descWriteTime * 1000.0);
682
				descUpdateTimeMs_.Format(line, sizeof(line));
683
				str << line;
684
				snprintf(line, sizeof(line), "Descriptors written: %d (dedup: %d)\n", frameData.profile.descriptorsWritten, frameData.profile.descriptorsDeduped);
685
				str << line;
686
				snprintf(line, sizeof(line), "Resource deletions: %d\n", vulkan_->GetLastDeleteCount());
687
				str << line;
688
				for (int i = 0; i < numQueries - 1; i++) {
689
					uint64_t diff = (queryResults[i + 1] - queryResults[i]) & timestampDiffMask;
690
					double milliseconds = (double)diff * timestampConversionFactor;
691

692
					// Can't use SimpleStat for these very easily since these are dynamic per frame.
693
					// Only the first one is static, the initCmd.
694
					// Could try some hashtable tracking for the rest, later.
695
					if (i == 0) {
696
						initTimeMs_.Update(milliseconds);
697
						initTimeMs_.Format(line, sizeof(line));
698
					} else {
699
						snprintf(line, sizeof(line), "%s: %0.3f ms\n", frameData.profile.timestampDescriptions[i + 1].c_str(), milliseconds);
700
					}
701
					str << line;
702
				}
703
				frameData.profile.profileSummary = str.str();
704
			} else {
705
				frameData.profile.profileSummary = "(error getting GPU profile - not ready?)";
706
			}
707
		} else {
708
			std::stringstream str;
709
			char line[256];
710
			renderCPUTimeMs_.Update((frameData.profile.cpuEndTime - frameData.profile.cpuStartTime) * 1000.0);
711
			renderCPUTimeMs_.Format(line, sizeof(line));
712
			str << line;
713
			descUpdateTimeMs_.Update(frameData.profile.descWriteTime * 1000.0);
714
			descUpdateTimeMs_.Format(line, sizeof(line));
715
			str << line;
716
			snprintf(line, sizeof(line), "Descriptors written: %d\n", frameData.profile.descriptorsWritten);
717
			str << line;
718
			frameData.profile.profileSummary = str.str();
719
		}
720

721
#ifdef _DEBUG
722
		std::string cmdString;
723
		for (int i = 0; i < ARRAY_SIZE(frameData.profile.commandCounts); i++) {
724
			if (frameData.profile.commandCounts[i] > 0) {
725
				cmdString += StringFromFormat("%s: %d\n", VKRRenderCommandToString((VKRRenderCommand)i), frameData.profile.commandCounts[i]);
726
			}
727
		}
728
		memset(frameData.profile.commandCounts, 0, sizeof(frameData.profile.commandCounts));
729
		frameData.profile.profileSummary += cmdString;
730
#endif
731
	}
732

733
	frameData.profile.descriptorsWritten = 0;
734
	frameData.profile.descriptorsDeduped = 0;
735

736
	// Must be after the fence - this performs deletes.
737
	VLOG("PUSH: BeginFrame %d", curFrame);
738

739
	insideFrame_ = true;
740
	vulkan_->BeginFrame(enableLogProfiler ? GetInitCmd() : VK_NULL_HANDLE);
741

742
	frameData.profile.timestampDescriptions.clear();
743
	if (frameData.profile.timestampsEnabled) {
744
		// For various reasons, we need to always use an init cmd buffer in this case to perform the vkCmdResetQueryPool,
745
		// unless we want to limit ourselves to only measure the main cmd buffer.
746
		// Later versions of Vulkan have support for clearing queries on the CPU timeline, but we don't want to rely on that.
747
		// Reserve the first two queries for initCmd.
748
		frameData.profile.timestampDescriptions.push_back("initCmd Begin");
749
		frameData.profile.timestampDescriptions.push_back("initCmd");
750
		VkCommandBuffer initCmd = GetInitCmd();
751
	}
752
}
753

754
// Returns the init command buffer of the frame slot that is currently being recorded.
VkCommandBuffer VulkanRenderManager::GetInitCmd() {
	FrameData &currentFrameData = frameData_[vulkan_->GetCurFrame()];
	return currentFrameData.GetInitCmd(vulkan_);
}
758

759
void VulkanRenderManager::ReportBadStateForDraw() {
760
	const char *cause1 = "";
761
	char cause2[256];
762
	cause2[0] = '\0';
763
	if (!curRenderStep_) {
764
		cause1 = "No current render step";
765
	}
766
	if (curRenderStep_ && curRenderStep_->stepType != VKRStepType::RENDER) {
767
		cause1 = "Not a render step: ";
768
		std::string str = VulkanQueueRunner::StepToString(vulkan_, *curRenderStep_);
769
		truncate_cpy(cause2, str.c_str());
770
	}
771
	ERROR_LOG_REPORT_ONCE(baddraw, Log::G3D, "Can't draw: %s%s. Step count: %d", cause1, cause2, (int)steps_.size());
772
}
773

774
// Creates a pipeline object for `desc` (taking a reference on it) and arranges for its
// variants to be compiled.
//
// Inside a render step, unless loading from the cache, the pipeline is just added to
// pipelinesToCheck_. Otherwise the variants selected by `variantBitmask` (bit N = RenderPassType
// index N) are queued for the compile thread right away, skipping incompatible ones.
//
// Returns nullptr if the description lacks a vertex or fragment shader.
VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipelineDesc *desc, PipelineFlags pipelineFlags, uint32_t variantBitmask, VkSampleCountFlagBits sampleCount, bool cacheLoad, const char *tag) {
	if (!(desc->vertexShader && desc->fragmentShader)) {
		ERROR_LOG(Log::G3D, "Can't create graphics pipeline with missing vs/ps: %p %p", desc->vertexShader, desc->fragmentShader);
		return nullptr;
	}

	VKRGraphicsPipeline *result = new VKRGraphicsPipeline(pipelineFlags, tag);
	result->desc = desc;
	result->desc->AddRef();

	if (curRenderStep_ && !cacheLoad) {
		// The common case during gameplay.
		pipelinesToCheck_.push_back(result);
		return result;
	}

	// Presumably we're in initialization, loading the shader cache.
	// Look at variantBitmask to see what variants we should queue up.
	if (!variantBitmask) {
		WARN_LOG(Log::G3D, "WARNING: Will not compile any variants of pipeline, not in renderpass and empty variantBitmask");
	}

	RPKey key{
		VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,
		VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, VKRRenderPassStoreAction::DONT_CARE,
	};
	VKRRenderPass *compatibleRenderPass = queueRunner_.GetRenderPass(key);

	{
		std::unique_lock<std::mutex> lock(compileMutex_);
		bool queuedAny = false;
		for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
			const bool requested = (variantBitmask & (1 << i)) != 0;
			if (!requested) {
				continue;
			}
			const RenderPassType rpType = (RenderPassType)i;

			// Sanity check - don't compile incompatible types (could be caused by corrupt caches, changes in data structures, etc).
			if ((pipelineFlags & PipelineFlags::USES_DEPTH_STENCIL) && !RenderPassTypeHasDepth(rpType)) {
				WARN_LOG(Log::G3D, "Not compiling pipeline that requires depth, for non depth renderpass type");
				continue;
			}
			// Shouldn't hit this, these should have been filtered elsewhere. However, still a good check to do.
			if (sampleCount == VK_SAMPLE_COUNT_1_BIT && RenderPassTypeHasMultisample(rpType)) {
				WARN_LOG(Log::G3D, "Not compiling single sample pipeline for a multisampled render pass type");
				continue;
			}

			// The backbuffer variant is forced to single sample. Note that this overwrites
			// the parameter, so it also applies to any variant processed after this one.
			if (rpType == RenderPassType::BACKBUFFER) {
				sampleCount = VK_SAMPLE_COUNT_1_BIT;
			}

			result->pipeline[i] = Promise<VkPipeline>::CreateEmpty();
			compileQueue_.emplace_back(result, compatibleRenderPass->Get(vulkan_, rpType, sampleCount), rpType, sampleCount);
			queuedAny = true;
		}
		// Wake the compile thread only if there's something for it to do.
		if (queuedAny) {
			compileCond_.notify_one();
		}
	}
	return result;
}
828

829
void VulkanRenderManager::EndCurRenderStep() {
830
	if (!curRenderStep_)
831
		return;
832

833
	RPKey key{
834
		curRenderStep_->render.colorLoad, curRenderStep_->render.depthLoad, curRenderStep_->render.stencilLoad,
835
		curRenderStep_->render.colorStore, curRenderStep_->render.depthStore, curRenderStep_->render.stencilStore,
836
	};
837
	// Save the accumulated pipeline flags so we can use that to configure the render pass.
838
	// We'll often be able to avoid loading/saving the depth/stencil buffer.
839
	curRenderStep_->render.pipelineFlags = curPipelineFlags_;
840
	bool depthStencil = (curPipelineFlags_ & PipelineFlags::USES_DEPTH_STENCIL) != 0;
841
	RenderPassType rpType = depthStencil ? RenderPassType::HAS_DEPTH : RenderPassType::DEFAULT;
842

843
	if (curRenderStep_->render.framebuffer && (rpType & RenderPassType::HAS_DEPTH) && !curRenderStep_->render.framebuffer->HasDepth()) {
844
		WARN_LOG(Log::G3D, "Trying to render with a depth-writing pipeline to a framebuffer without depth: %s", curRenderStep_->render.framebuffer->Tag());
845
		rpType = RenderPassType::DEFAULT;
846
	}
847

848
	if (!curRenderStep_->render.framebuffer) {
849
		rpType = RenderPassType::BACKBUFFER;
850
	} else {
851
		// Framebuffers can be stereo, and if so, will control the render pass type to match.
852
		// Pipelines can be mono and render fine to stereo etc, so not checking them here.
853
		// Note that we don't support rendering to just one layer of a multilayer framebuffer!
854
		if (curRenderStep_->render.framebuffer->numLayers > 1) {
855
			rpType = (RenderPassType)(rpType | RenderPassType::MULTIVIEW);
856
		}
857

858
		if (curRenderStep_->render.framebuffer->sampleCount != VK_SAMPLE_COUNT_1_BIT) {
859
			rpType = (RenderPassType)(rpType | RenderPassType::MULTISAMPLE);
860
		}
861
	}
862

863
	VKRRenderPass *renderPass = queueRunner_.GetRenderPass(key);
864
	curRenderStep_->render.renderPassType = rpType;
865

866
	VkSampleCountFlagBits sampleCount = curRenderStep_->render.framebuffer ? curRenderStep_->render.framebuffer->sampleCount : VK_SAMPLE_COUNT_1_BIT;
867

868
	compileMutex_.lock();
869
	bool needsCompile = false;
870
	for (VKRGraphicsPipeline *pipeline : pipelinesToCheck_) {
871
		if (!pipeline) {
872
			// Not good, but let's try not to crash.
873
			continue;
874
		}
875
		if (!pipeline->pipeline[(size_t)rpType]) {
876
			pipeline->pipeline[(size_t)rpType] = Promise<VkPipeline>::CreateEmpty();
877
			_assert_(renderPass);
878
			compileQueue_.push_back(CompileQueueEntry(pipeline, renderPass->Get(vulkan_, rpType, sampleCount), rpType, sampleCount));
879
			needsCompile = true;
880
		}
881
	}
882
	if (needsCompile)
883
		compileCond_.notify_one();
884
	compileMutex_.unlock();
885
	pipelinesToCheck_.clear();
886

887
	// We don't do this optimization for very small targets, probably not worth it.
888
	if (!curRenderArea_.Empty() && (curWidth_ > 32 && curHeight_ > 32)) {
889
		curRenderStep_->render.renderArea = curRenderArea_.ToVkRect2D();
890
	} else {
891
		curRenderStep_->render.renderArea.offset = {};
892
		curRenderStep_->render.renderArea.extent = { (uint32_t)curWidth_, (uint32_t)curHeight_ };
893
	}
894
	curRenderArea_.Reset();
895

896
	// We no longer have a current render step.
897
	curRenderStep_ = nullptr;
898
	curPipelineFlags_ = (PipelineFlags)0;
899
}
900

901
// Makes fb (or the backbuffer, when fb is null) the current render target.
//
// color/depth/stencil are the load actions for the new pass, and clearColor/clearDepth/clearStencil
// the values used where an action is CLEAR. If the most recent step already renders to fb, the
// bind is folded into that step where possible (as an in-pass clear command if needed) instead of
// starting a new one. tag is stored on a newly created step.
void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRRenderPassLoadAction color, VKRRenderPassLoadAction depth, VKRRenderPassLoadAction stencil, uint32_t clearColor, float clearDepth, uint8_t clearStencil, const char *tag) {
	_dbg_assert_(insideFrame_);

	// Eliminate dupes (bind of the framebuffer we already are rendering to), instantly convert to a clear if possible.
	VKRStep *lastStep = steps_.empty() ? nullptr : steps_.back();
	if (lastStep && lastStep->stepType == VKRStepType::RENDER && lastStep->render.framebuffer == fb) {
		u32 clearMask = 0;
		if (color == VKRRenderPassLoadAction::CLEAR) {
			clearMask |= VK_IMAGE_ASPECT_COLOR_BIT;
		}
		if (depth == VKRRenderPassLoadAction::CLEAR) {
			clearMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
		}
		if (stencil == VKRRenderPassLoadAction::CLEAR) {
			clearMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
		}

		// Reuse the previous step when no clear is requested, or when it already has commands
		// (in which case the clear is simply appended as a command).
		//
		// A requested clear on a pass that is still empty falls through instead, so the
		// queuerunner gets the opportunity to merge, and a new renderpass is created below.
		const bool wantsClear = clearMask != 0;
		if (!wantsClear || !lastStep->commands.empty()) {
			curRenderStep_ = lastStep;

			// Recompute which dynamic state the reused step has already set.
			curStepHasViewport_ = false;
			curStepHasScissor_ = false;
			for (const auto &existing : lastStep->commands) {
				if (existing.cmd == VKRRenderCommand::VIEWPORT) {
					curStepHasViewport_ = true;
				} else if (existing.cmd == VKRRenderCommand::SCISSOR) {
					curStepHasScissor_ = true;
				}
			}

			if (wantsClear) {
				VkRenderData clearCmd{ VKRRenderCommand::CLEAR };
				clearCmd.clear.clearColor = clearColor;
				clearCmd.clear.clearZ = clearDepth;
				clearCmd.clear.clearStencil = clearStencil;
				clearCmd.clear.clearMask = clearMask;
				curRenderStep_->commands.push_back(clearCmd);
				curRenderArea_.SetRect(0, 0, curWidth_, curHeight_);
			}
			return;
		}
	}

#ifdef _DEBUG
	SanityCheckPassesOnAdd();
#endif

	// More redundant bind elimination.
	if (curRenderStep_) {
		if (curRenderStep_->commands.empty()) {
			const bool clearsAnything =
				curRenderStep_->render.colorLoad == VKRRenderPassLoadAction::CLEAR ||
				curRenderStep_->render.depthLoad == VKRRenderPassLoadAction::CLEAR ||
				curRenderStep_->render.stencilLoad == VKRRenderPassLoadAction::CLEAR;
			if (!clearsAnything) {
				// Can trivially kill the last empty render step.
				_dbg_assert_(steps_.back() == curRenderStep_);
				delete steps_.back();
				steps_.pop_back();
				curRenderStep_ = nullptr;
			}
			VLOG("Empty render step. Usually happens after uploading pixels..");
		}

		EndCurRenderStep();
	}

	// Sanity check that we don't have binds to the backbuffer before binds to other buffers. It must always be bound last.
	if (!steps_.empty() && steps_.back()->stepType == VKRStepType::RENDER && steps_.back()->render.framebuffer == nullptr && fb != nullptr) {
		_dbg_assert_(false);
	}

	// Older Mali drivers have issues when depth and stencil don't match load/clear/etc.
	// TODO: Determine which versions and do this only where necessary.
	u32 lateClearMask = 0;
	if (depth != stencil && vulkan_->GetPhysicalDeviceProperties().properties.vendorID == VULKAN_VENDOR_ARM) {
		if (stencil == VKRRenderPassLoadAction::DONT_CARE) {
			stencil = depth;
		} else if (depth == VKRRenderPassLoadAction::DONT_CARE) {
			depth = stencil;
		} else if (stencil == VKRRenderPassLoadAction::CLEAR) {
			depth = stencil;
			lateClearMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
		} else if (depth == VKRRenderPassLoadAction::CLEAR) {
			stencil = depth;
			lateClearMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
		}
	}

	VKRStep *newStep = new VKRStep{ VKRStepType::RENDER };
	newStep->render.framebuffer = fb;
	newStep->render.colorLoad = color;
	newStep->render.depthLoad = depth;
	newStep->render.stencilLoad = stencil;
	newStep->render.colorStore = VKRRenderPassStoreAction::STORE;
	newStep->render.depthStore = VKRRenderPassStoreAction::STORE;
	newStep->render.stencilStore = VKRRenderPassStoreAction::STORE;
	newStep->render.clearColor = clearColor;
	newStep->render.clearDepth = clearDepth;
	newStep->render.clearStencil = clearStencil;
	newStep->render.numDraws = 0;
	newStep->render.numReads = 0;
	// Framebuffer passes start with undefined final layouts; the backbuffer pass gets the attachment layouts.
	newStep->render.finalColorLayout = fb ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
	newStep->render.finalDepthStencilLayout = fb ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
	// pipelineFlags, renderArea and renderPassType get filled in when we finalize the step. Do not read from them before that.
	newStep->tag = tag;
	steps_.push_back(newStep);

	if (fb) {
		// If there's a KEEP, we naturally read from the framebuffer.
		const bool keepsContents =
			color == VKRRenderPassLoadAction::KEEP ||
			depth == VKRRenderPassLoadAction::KEEP ||
			stencil == VKRRenderPassLoadAction::KEEP;
		if (keepsContents) {
			newStep->dependencies.insert(fb);
		}
	}

	curRenderStep_ = newStep;
	curStepHasViewport_ = false;
	curStepHasScissor_ = false;

	// Track the target dimensions; the backbuffer swaps width/height under 90/270 degree rotation.
	if (fb) {
		curWidthRaw_ = fb->width;
		curHeightRaw_ = fb->height;
		curWidth_ = fb->width;
		curHeight_ = fb->height;
	} else {
		curWidthRaw_ = vulkan_->GetBackbufferWidth();
		curHeightRaw_ = vulkan_->GetBackbufferHeight();
		const bool sideways = g_display.rotation == DisplayRotation::ROTATE_90 ||
			g_display.rotation == DisplayRotation::ROTATE_270;
		if (sideways) {
			curWidth_ = curHeightRaw_;
			curHeight_ = curWidthRaw_;
		} else {
			curWidth_ = curWidthRaw_;
			curHeight_ = curHeightRaw_;
		}
	}

	// Any clear touches the full target.
	if (color == VKRRenderPassLoadAction::CLEAR || depth == VKRRenderPassLoadAction::CLEAR || stencil == VKRRenderPassLoadAction::CLEAR) {
		curRenderArea_.SetRect(0, 0, curWidth_, curHeight_);
	}

	// See above - we add a clear afterward if only one side for depth/stencil CLEAR/KEEP.
	if (lateClearMask != 0) {
		VkRenderData lateClear{ VKRRenderCommand::CLEAR };
		lateClear.clear.clearColor = clearColor;
		lateClear.clear.clearZ = clearDepth;
		lateClear.clear.clearStencil = clearStencil;
		lateClear.clear.clearMask = lateClearMask;
		curRenderStep_->commands.push_back(lateClear);
	}

	if (invalidationCallback_) {
		invalidationCallback_(InvalidationCallbackFlags::RENDER_PASS_STATE);
	}
}
1055

1056
// Queues a readback of a rectangle of src (or of the backbuffer, when src is null) and copies
// the result into pixels.
//
// aspectBits selects color, stencil or depth (checked in that order). x/y/w/h give the source
// rectangle, destFormat and pixelStride describe the destination buffer. With
// Draw::ReadbackMode::BLOCK the queued steps are flushed synchronously before copying; with
// OLD_DATA_OK the readback step is marked delayed and src is passed on to CopyReadbackBuffer.
//
// Returns false if the source can't be read back (backbuffer without transfer-src support,
// unsupported swapchain format, or a depth readback with no framebuffer), otherwise the result
// of VulkanQueueRunner::CopyReadbackBuffer.
bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAspectFlags aspectBits, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, Draw::ReadbackMode mode, const char *tag) {
	_dbg_assert_(insideFrame_);

	// Count this as a read on the most recent render step that targets src.
	for (int i = (int)steps_.size() - 1; i >= 0; i--) {
		if (steps_[i]->stepType == VKRStepType::RENDER && steps_[i]->render.framebuffer == src) {
			steps_[i]->render.numReads++;
			break;
		}
	}

	EndCurRenderStep();

	VKRStep *step = new VKRStep{ VKRStepType::READBACK };
	step->readback.aspectMask = aspectBits;
	step->readback.src = src;
	step->readback.srcRect.offset = { x, y };
	step->readback.srcRect.extent = { (uint32_t)w, (uint32_t)h };
	step->readback.delayed = mode == Draw::ReadbackMode::OLD_DATA_OK;
	step->dependencies.insert(src);
	step->tag = tag;
	steps_.push_back(step);

	if (mode == Draw::ReadbackMode::BLOCK) {
		FlushSync();
	}

	// Work out the format the data was read back in.
	Draw::DataFormat srcFormat = Draw::DataFormat::UNDEFINED;
	if (aspectBits & VK_IMAGE_ASPECT_COLOR_BIT) {
		if (src) {
			switch (src->color.format) {
			case VK_FORMAT_R8G8B8A8_UNORM: srcFormat = Draw::DataFormat::R8G8B8A8_UNORM; break;
			default: _assert_(false);
			}
		} else {
			// Backbuffer.
			if (!(vulkan_->GetSurfaceCapabilities().supportedUsageFlags & VK_IMAGE_USAGE_TRANSFER_SRC_BIT)) {
				ERROR_LOG(Log::G3D, "Copying from backbuffer not supported, can't take screenshots");
				return false;
			}
			switch (vulkan_->GetSwapchainFormat()) {
			case VK_FORMAT_B8G8R8A8_UNORM: srcFormat = Draw::DataFormat::B8G8R8A8_UNORM; break;
			case VK_FORMAT_R8G8B8A8_UNORM: srcFormat = Draw::DataFormat::R8G8B8A8_UNORM; break;
			// NOTE: If you add supported formats here, make sure to also support them in VulkanQueueRunner::CopyReadbackBuffer.
			default:
				ERROR_LOG(Log::G3D, "Unsupported backbuffer format for screenshots");
				return false;
			}
		}
	} else if (aspectBits & VK_IMAGE_ASPECT_STENCIL_BIT) {
		// Copies from stencil are always S8.
		srcFormat = Draw::DataFormat::S8;
	} else if (aspectBits & VK_IMAGE_ASPECT_DEPTH_BIT) {
		if (!src) {
			// src is null for the backbuffer, so there is no framebuffer to read the depth
			// format from. Bail out instead of dereferencing a null pointer.
			ERROR_LOG(Log::G3D, "Depth readback requested without a framebuffer, not supported");
			return false;
		}
		switch (src->depth.format) {
		case VK_FORMAT_D24_UNORM_S8_UINT: srcFormat = Draw::DataFormat::D24_S8; break;
		case VK_FORMAT_D32_SFLOAT_S8_UINT: srcFormat = Draw::DataFormat::D32F; break;
		case VK_FORMAT_D16_UNORM_S8_UINT: srcFormat = Draw::DataFormat::D16; break;
		default: _assert_(false);
		}
	} else {
		_assert_(false);
	}

	// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
	return queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()],
		mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
}
1122

1123
// Synchronously reads back a rectangle of one mip level of image into pixels.
//
// Ends the current render step, queues a READBACK_IMAGE step, flushes with FlushSync and then
// copies out of the readback buffer. destFormat is passed as both the source and the
// destination format of that copy.
void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag) {
	_dbg_assert_(insideFrame_);

	EndCurRenderStep();

	VKRStep *readStep = new VKRStep{ VKRStepType::READBACK_IMAGE };
	readStep->tag = tag;
	readStep->readback_image.image = image;
	readStep->readback_image.mipLevel = mipLevel;
	readStep->readback_image.srcRect.offset = { x, y };
	readStep->readback_image.srcRect.extent = { (uint32_t)w, (uint32_t)h };
	steps_.push_back(readStep);

	FlushSync();

	// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
	FrameData &curFrameData = frameData_[vulkan_->GetCurFrame()];
	queueRunner_.CopyReadbackBuffer(curFrameData, nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
}
1141

1142
// Marks every DRAW and DRAW_INDEXED command in cmds as REMOVED, leaving all other commands
// untouched. Used when a clear makes the earlier draws of a pass irrelevant.
static void RemoveDrawCommands(FastVec<VkRenderData> *cmds) {
	for (auto &entry : *cmds) {
		switch (entry.cmd) {
		case VKRRenderCommand::DRAW:
		case VKRRenderCommand::DRAW_INDEXED:
			entry.cmd = VKRRenderCommand::REMOVED;
			break;
		default:
			break;
		}
	}
}
1150

1151
// Removes state commands that never take effect: a VIEWPORT/SCISSOR/BLEND/STENCIL command that
// is overwritten by another of the same kind before any boundary command, and any tracked state
// command still pending at the end of the list. Commands are not erased, just marked REMOVED.
static void CleanupRenderCommands(FastVec<VkRenderData> *cmds) {
	// pendingIdx[k] is the index of the most recent not-yet-consumed command of kind k,
	// or kNone (all bits set, which is what the memset below produces) when there is none.
	const size_t kNone = (size_t)-1;
	size_t pendingIdx[(int)VKRRenderCommand::NUM_RENDER_COMMANDS];
	auto forgetPending = [&pendingIdx]() {
		memset(pendingIdx, -1, sizeof(pendingIdx));
	};
	forgetPending();

	// Find any duplicate state commands (likely from RemoveDrawCommands.)
	const size_t count = cmds->size();
	for (size_t pos = 0; pos < count; ++pos) {
		auto &entry = cmds->at(pos);
		size_t &slot = pendingIdx[(uint8_t)entry.cmd];

		switch (entry.cmd) {
		case VKRRenderCommand::REMOVED:
			continue;

		case VKRRenderCommand::VIEWPORT:
		case VKRRenderCommand::SCISSOR:
		case VKRRenderCommand::BLEND:
		case VKRRenderCommand::STENCIL:
			// The earlier command of this kind was overwritten before being used.
			if (slot != kNone) {
				cmds->at(slot).cmd = VKRRenderCommand::REMOVED;
			}
			break;

		case VKRRenderCommand::PUSH_CONSTANTS:
			// TODO: For now, we have to keep this one (it has an offset.)  Still tracked below.
			break;

		case VKRRenderCommand::CLEAR:
			// Ignore, doesn't participate in state.
			continue;

		case VKRRenderCommand::DRAW_INDEXED:
		case VKRRenderCommand::DRAW:
		default:
			// Boundary - must keep state before this.
			forgetPending();
			continue;
		}

		slot = pos;
	}

	// At this point, anything still pending can be cleaned up too.
	// Note that it's safe to remove the last unused PUSH_CONSTANTS here.
	for (size_t leftover : pendingIdx) {
		if (leftover != kNone) {
			cmds->at(leftover).cmd = VKRRenderCommand::REMOVED;
		}
	}
}
1201

1202
void VulkanRenderManager::Clear(uint32_t clearColor, float clearZ, int clearStencil, int clearMask) {
1203
	_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
1204
	if (!clearMask)
1205
		return;
1206

1207
	// If this is the first drawing command or clears everything, merge it into the pass.
1208
	int allAspects = VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
1209
	if (curRenderStep_->render.numDraws == 0 || clearMask == allAspects) {
1210
		curRenderStep_->render.clearColor = clearColor;
1211
		curRenderStep_->render.clearDepth = clearZ;
1212
		curRenderStep_->render.clearStencil = clearStencil;
1213
		curRenderStep_->render.colorLoad = (clearMask & VK_IMAGE_ASPECT_COLOR_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
1214
		curRenderStep_->render.depthLoad = (clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
1215
		curRenderStep_->render.stencilLoad = (clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
1216

1217
		if (clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
1218
			if (curRenderStep_->render.framebuffer && !curRenderStep_->render.framebuffer->HasDepth()) {
1219
				WARN_LOG(Log::G3D, "Trying to clear depth/stencil on a non-depth framebuffer: %s", curRenderStep_->render.framebuffer->Tag());
1220
			} else {
1221
				curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
1222
			}
1223
		}
1224

1225
		// In case there were commands already.
1226
		curRenderStep_->render.numDraws = 0;
1227
		RemoveDrawCommands(&curRenderStep_->commands);
1228
	} else {
1229
		VkRenderData data{ VKRRenderCommand::CLEAR };
1230
		data.clear.clearColor = clearColor;
1231
		data.clear.clearZ = clearZ;
1232
		data.clear.clearStencil = clearStencil;
1233
		data.clear.clearMask = clearMask;
1234
		curRenderStep_->commands.push_back(data);
1235
	}
1236

1237
	curRenderArea_.SetRect(0, 0, curWidth_, curHeight_);
1238
}
1239

1240
// Queues a COPY step that copies srcRect of src to dstPos in dst, for the aspects in aspectMask.
//
// The most recent render steps targeting src and dst get their still-undefined final layouts set
// to the transfer source/destination layouts, and the src one is counted as read. The new step
// depends on src, and also on dst unless the copy covers all of dst starting at the origin.
void VulkanRenderManager::CopyFramebuffer(VKRFramebuffer *src, VkRect2D srcRect, VKRFramebuffer *dst, VkOffset2D dstPos, VkImageAspectFlags aspectMask, const char *tag) {
#ifdef _DEBUG
	SanityCheckPassesOnAdd();
#endif

	_dbg_assert_msg_(srcRect.offset.x >= 0, "srcrect offset x (%d) < 0", srcRect.offset.x);
	_dbg_assert_msg_(srcRect.offset.y >= 0, "srcrect offset y (%d) < 0", srcRect.offset.y);
	_dbg_assert_msg_(srcRect.offset.x + srcRect.extent.width <= (uint32_t)src->width, "srcrect offset x (%d) + extent (%d) > width (%d)", srcRect.offset.x, srcRect.extent.width, (uint32_t)src->width);
	_dbg_assert_msg_(srcRect.offset.y + srcRect.extent.height <= (uint32_t)src->height, "srcrect offset y (%d) + extent (%d) > height (%d)", srcRect.offset.y, srcRect.extent.height, (uint32_t)src->height);

	_dbg_assert_msg_(srcRect.extent.width > 0, "copy srcwidth == 0");
	_dbg_assert_msg_(srcRect.extent.height > 0, "copy srcheight == 0");

	_dbg_assert_msg_(dstPos.x >= 0, "dstPos offset x (%d) < 0", dstPos.x);
	_dbg_assert_msg_(dstPos.y >= 0, "dstPos offset y (%d) < 0", dstPos.y);
	_dbg_assert_msg_(dstPos.x + srcRect.extent.width <= (uint32_t)dst->width, "dstPos + extent x > width");
	_dbg_assert_msg_(dstPos.y + srcRect.extent.height <= (uint32_t)dst->height, "dstPos + extent y > height");

	// Finds the latest queued render step that targets the given framebuffer, or null.
	auto latestRenderTo = [this](VKRFramebuffer *target) -> VKRStep * {
		for (size_t i = steps_.size(); i-- > 0;) {
			VKRStep *candidate = steps_[i];
			if (candidate->stepType == VKRStepType::RENDER && candidate->render.framebuffer == target) {
				return candidate;
			}
		}
		return nullptr;
	};

	const bool touchesColor = (aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0;
	const bool touchesDepthStencil = (aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0;

	// Pre-transition the source, but only where no other final layout was chosen already.
	if (VKRStep *srcPass = latestRenderTo(src)) {
		if (touchesColor && srcPass->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
			srcPass->render.finalColorLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
		}
		if (touchesDepthStencil && srcPass->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
			srcPass->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
		}
		srcPass->render.numReads++;
	}

	// Same for the destination, which is not counted as a read.
	if (VKRStep *dstPass = latestRenderTo(dst)) {
		if (touchesColor && dstPass->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
			dstPass->render.finalColorLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
		}
		if (touchesDepthStencil && dstPass->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
			dstPass->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
		}
	}

	EndCurRenderStep();

	VKRStep *copyStep = new VKRStep{ VKRStepType::COPY };
	copyStep->tag = tag;
	copyStep->copy.aspectMask = aspectMask;
	copyStep->copy.src = src;
	copyStep->copy.srcRect = srcRect;
	copyStep->copy.dst = dst;
	copyStep->copy.dstPos = dstPos;
	copyStep->dependencies.insert(src);

	// A copy that overwrites all of dst doesn't need dst's previous contents.
	const bool fillsDst = dst && srcRect.offset.x == 0 && srcRect.offset.y == 0 && srcRect.extent.width == dst->width && srcRect.extent.height == dst->height;
	const bool overwritesWholeDst = fillsDst && dstPos.x == 0 && dstPos.y == 0;
	if (!overwritesWholeDst)
		copyStep->dependencies.insert(dst);

	steps_.push_back(copyStep);
}
1307

1308
// Queues a BLIT step from srcRect of src to dstRect of dst for the aspects in aspectMask, using
// the given filter.
//
// The most recent render step targeting src is counted as read. If a depth/stencil blit is
// requested but either framebuffer has no depth image, nothing is queued. The new step depends
// on src, and also on dst unless dstRect covers all of dst.
void VulkanRenderManager::BlitFramebuffer(VKRFramebuffer *src, VkRect2D srcRect, VKRFramebuffer *dst, VkRect2D dstRect, VkImageAspectFlags aspectMask, VkFilter filter, const char *tag) {
#ifdef _DEBUG
	SanityCheckPassesOnAdd();
#endif

	_dbg_assert_msg_(srcRect.offset.x >= 0, "srcrect offset x (%d) < 0", srcRect.offset.x);
	_dbg_assert_msg_(srcRect.offset.y >= 0, "srcrect offset y (%d) < 0", srcRect.offset.y);
	_dbg_assert_msg_(srcRect.offset.x + srcRect.extent.width <= (uint32_t)src->width, "srcrect offset x (%d) + extent (%d) > width (%d)", srcRect.offset.x, srcRect.extent.width, (uint32_t)src->width);
	_dbg_assert_msg_(srcRect.offset.y + srcRect.extent.height <= (uint32_t)src->height, "srcrect offset y (%d) + extent (%d) > height (%d)", srcRect.offset.y, srcRect.extent.height, (uint32_t)src->height);

	_dbg_assert_msg_(srcRect.extent.width > 0, "blit srcwidth == 0");
	_dbg_assert_msg_(srcRect.extent.height > 0, "blit srcheight == 0");

	_dbg_assert_msg_(dstRect.offset.x >= 0, "dstrect offset x < 0");
	_dbg_assert_msg_(dstRect.offset.y >= 0, "dstrect offset y < 0");
	_dbg_assert_msg_(dstRect.offset.x + dstRect.extent.width <= (uint32_t)dst->width, "dstrect offset x + extent > width");
	_dbg_assert_msg_(dstRect.offset.y + dstRect.extent.height <= (uint32_t)dst->height, "dstrect offset y + extent > height");

	_dbg_assert_msg_(dstRect.extent.width > 0, "blit dstwidth == 0");
	_dbg_assert_msg_(dstRect.extent.height > 0, "blit dstheight == 0");

	// TODO: Seem to be missing final layouts here like in Copy...

	// Count the read on the latest render step that targets src.
	for (size_t i = steps_.size(); i-- > 0;) {
		VKRStep *candidate = steps_[i];
		if (candidate->stepType == VKRStepType::RENDER && candidate->render.framebuffer == src) {
			candidate->render.numReads++;
			break;
		}
	}

	EndCurRenderStep();

	// Sanity check. Added an assert to try to gather more info.
	// Got this assert in NPJH50443 FINAL FANTASY TYPE-0, but pretty rare. Moving back to debug assert.
	const bool wantsDepthStencil = (aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0;
	if (wantsDepthStencil) {
		_dbg_assert_msg_(src->depth.image != VK_NULL_HANDLE, "%s", src->Tag());
		_dbg_assert_msg_(dst->depth.image != VK_NULL_HANDLE, "%s", dst->Tag());

		const bool bothHaveDepth = src->depth.image && dst->depth.image;
		if (!bothHaveDepth) {
			// Something has gone wrong, but let's try to stumble along.
			return;
		}
	}

	VKRStep *blitStep = new VKRStep{ VKRStepType::BLIT };
	blitStep->tag = tag;
	blitStep->blit.aspectMask = aspectMask;
	blitStep->blit.src = src;
	blitStep->blit.srcRect = srcRect;
	blitStep->blit.dst = dst;
	blitStep->blit.dstRect = dstRect;
	blitStep->blit.filter = filter;
	blitStep->dependencies.insert(src);

	// A blit that covers all of dst doesn't need dst's previous contents.
	const bool fillsDst = dst && dstRect.offset.x == 0 && dstRect.offset.y == 0 && dstRect.extent.width == dst->width && dstRect.extent.height == dst->height;
	if (!fillsDst)
		blitStep->dependencies.insert(dst);

	steps_.push_back(blitStep);
}
1367

1368
// Registers fb as a texture input of the current render step and returns the image view to bind.
//
// aspectBit must be the color or the depth aspect. layer == -1 selects the all-layers view,
// any other value the view of that single layer. The binding parameter is not used here.
VkImageView VulkanRenderManager::BindFramebufferAsTexture(VKRFramebuffer *fb, int binding, VkImageAspectFlags aspectBit, int layer) {
	_dbg_assert_(curRenderStep_ != nullptr);

	// We don't support texturing from stencil, neither do we support texturing from depth|stencil together (nonsensical).
	_dbg_assert_(aspectBit == VK_IMAGE_ASPECT_COLOR_BIT || aspectBit == VK_IMAGE_ASPECT_DEPTH_BIT);

	const bool colorAspect = aspectBit == VK_IMAGE_ASPECT_COLOR_BIT;
	const bool depthAspect = aspectBit == VK_IMAGE_ASPECT_DEPTH_BIT;

	// Optimization: If possible, use final*Layout to put the texture into the correct layout "early".
	// Only the latest render step targeting fb is considered.
	for (size_t i = steps_.size(); i-- > 0;) {
		VKRStep *earlier = steps_[i];
		if (earlier->stepType != VKRStepType::RENDER || earlier->render.framebuffer != fb) {
			continue;
		}

		// If this framebuffer was rendered to earlier in this frame, pre-transition it to the
		// correct layout. If some other layout was already chosen, a copy after this is likely
		// involved. It's fine though, we'll just transition it right as we need it and lose a
		// tiny optimization.
		if (colorAspect) {
			if (earlier->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
				earlier->render.finalColorLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
			}
		} else if (depthAspect) {
			if (earlier->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
				earlier->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
			}
		}  // We don't (yet?) support texturing from stencil images.

		earlier->render.numReads++;
		break;
	}

	// Track dependencies fully.
	curRenderStep_->dependencies.insert(fb);

	// Add this pretransition unless we already have it.
	TransitionRequest request{ fb, aspectBit, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL };
	curRenderStep_->preTransitions.insert(request);  // Note that insert avoids inserting duplicates.

	if (layer == -1) {
		return colorAspect ? fb->color.texAllLayersView : fb->depth.texAllLayersView;
	}
	return colorAspect ? fb->color.texLayerViews[layer] : fb->depth.texLayerViews[layer];
}
1410

1411
// Called on main thread.
1412
// Sends the collected commands to the render thread. Submit-latency should be
1413
// measured from here, probably.
1414
void VulkanRenderManager::Finish() {
1415
	EndCurRenderStep();
1416

1417
	// Let's do just a bit of cleanup on render commands now.
1418
	// TODO: Should look into removing this.
1419
	for (auto &step : steps_) {
1420
		if (step->stepType == VKRStepType::RENDER) {
1421
			CleanupRenderCommands(&step->commands);
1422
		}
1423
	}
1424

1425
	int curFrame = vulkan_->GetCurFrame();
1426
	FrameData &frameData = frameData_[curFrame];
1427

1428
	if (!postInitBarrier_.empty()) {
1429
		VkCommandBuffer buffer = frameData.GetInitCmd(vulkan_);
1430
		postInitBarrier_.Flush(buffer);
1431
	}
1432

1433
	VLOG("PUSH: Frame[%d]", curFrame);
1434
	VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::SUBMIT);
1435
	task->frame = curFrame;
1436
	if (useRenderThread_) {
1437
		std::unique_lock<std::mutex> lock(pushMutex_);
1438
		renderThreadQueue_.push(task);
1439
		renderThreadQueue_.back()->steps = std::move(steps_);
1440
		pushCondVar_.notify_one();
1441
	} else {
1442
		// Just do it!
1443
		task->steps = std::move(steps_);
1444
		Run(*task);
1445
		delete task;
1446
	}
1447

1448
	steps_.clear();
1449
}
1450

1451
void VulkanRenderManager::Present() {
1452
	int curFrame = vulkan_->GetCurFrame();
1453

1454
	VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::PRESENT);
1455
	task->frame = curFrame;
1456
	if (useRenderThread_) {
1457
		std::unique_lock<std::mutex> lock(pushMutex_);
1458
		renderThreadQueue_.push(task);
1459
		pushCondVar_.notify_one();
1460
	} else {
1461
		// Just do it!
1462
		Run(*task);
1463
		delete task;
1464
	}
1465

1466
	vulkan_->EndFrame();
1467
	insideFrame_ = false;
1468
}
1469

1470
// Called on the render thread.
1471
//
1472
// Can be called again after a VKRRunType::SYNC on the same frame.
1473
void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
1474
	FrameData &frameData = frameData_[task.frame];
1475

1476
	if (task.runType == VKRRunType::PRESENT) {
1477
		if (!frameData.skipSwap) {
1478
			VkResult res = frameData.QueuePresent(vulkan_, frameDataShared_);
1479
			frameTimeHistory_[frameData.frameId].queuePresent = time_now_d();
1480
			if (res == VK_ERROR_OUT_OF_DATE_KHR) {
1481
				// We clearly didn't get this in vkAcquireNextImageKHR because of the skipSwap check above.
1482
				// Do the increment.
1483
				outOfDateFrames_++;
1484
			} else if (res == VK_SUBOPTIMAL_KHR) {
1485
				outOfDateFrames_++;
1486
			} else if (res != VK_SUCCESS) {
1487
				_assert_msg_(false, "vkQueuePresentKHR failed! result=%s", VulkanResultToString(res));
1488
			} else {
1489
				// Success
1490
				outOfDateFrames_ = 0;
1491
			}
1492
		} else {
1493
			// We only get here if vkAcquireNextImage returned VK_ERROR_OUT_OF_DATE.
1494
			outOfDateFrames_++;
1495
			frameData.skipSwap = false;
1496
		}
1497
		return;
1498
	}
1499

1500
	_dbg_assert_(!frameData.hasPresentCommands);
1501

1502
	if (!frameTimeHistory_[frameData.frameId].firstSubmit) {
1503
		frameTimeHistory_[frameData.frameId].firstSubmit = time_now_d();
1504
	}
1505
	frameData.Submit(vulkan_, FrameSubmitType::Pending, frameDataShared_);
1506

1507
	// Flush descriptors.
1508
	double descStart = time_now_d();
1509
	FlushDescriptors(task.frame);
1510
	frameData.profile.descWriteTime = time_now_d() - descStart;
1511

1512
	if (!frameData.hasMainCommands) {
1513
		// Effectively resets both main and present command buffers, since they both live in this pool.
1514
		// We always record main commands first, so we don't need to reset the present command buffer separately.
1515
		vkResetCommandPool(vulkan_->GetDevice(), frameData.cmdPoolMain, 0);
1516

1517
		VkCommandBufferBeginInfo begin{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
1518
		begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
1519
		VkResult res = vkBeginCommandBuffer(frameData.mainCmd, &begin);
1520
		frameData.hasMainCommands = true;
1521
		_assert_msg_(res == VK_SUCCESS, "vkBeginCommandBuffer failed! result=%s", VulkanResultToString(res));
1522
	}
1523

1524
	queueRunner_.PreprocessSteps(task.steps);
1525
	// Likely during shutdown, happens in headless.
1526
	if (task.steps.empty() && !frameData.hasAcquired)
1527
		frameData.skipSwap = true;
1528
	//queueRunner_.LogSteps(stepsOnThread, false);
1529
	if (IsVREnabled()) {
1530
		int passes = GetVRPassesCount();
1531
		for (int i = 0; i < passes; i++) {
1532
			PreVRFrameRender(i);
1533
			queueRunner_.RunSteps(task.steps, task.frame, frameData, frameDataShared_, i < passes - 1);
1534
			PostVRFrameRender();
1535
		}
1536
	} else {
1537
		queueRunner_.RunSteps(task.steps, task.frame, frameData, frameDataShared_);
1538
	}
1539

1540
	switch (task.runType) {
1541
	case VKRRunType::SUBMIT:
1542
		frameData.Submit(vulkan_, FrameSubmitType::FinishFrame, frameDataShared_);
1543
		break;
1544

1545
	case VKRRunType::SYNC:
1546
		// The submit will trigger the readbackFence, and also do the wait for it.
1547
		frameData.Submit(vulkan_, FrameSubmitType::Sync, frameDataShared_);
1548

1549
		if (useRenderThread_) {
1550
			std::unique_lock<std::mutex> lock(syncMutex_);
1551
			syncCondVar_.notify_one();
1552
		}
1553

1554
		// At this point the GPU is idle, and we can resume filling the command buffers for the
1555
		// current frame since and thus all previously enqueued command buffers have been
1556
		// processed. No need to switch to the next frame number, would just be confusing.
1557
		break;
1558

1559
	default:
1560
		_dbg_assert_(false);
1561
	}
1562

1563
	VLOG("PULL: Finished running frame %d", task.frame);
1564
}
1565

1566
// Called from main thread.
1567
void VulkanRenderManager::FlushSync() {
1568
	_dbg_assert_(!curRenderStep_);
1569

1570
	if (invalidationCallback_) {
1571
		invalidationCallback_(InvalidationCallbackFlags::COMMAND_BUFFER_STATE);
1572
	}
1573

1574
	int curFrame = vulkan_->GetCurFrame();
1575
	FrameData &frameData = frameData_[curFrame];
1576

1577
	if (!postInitBarrier_.empty()) {
1578
		VkCommandBuffer buffer = frameData.GetInitCmd(vulkan_);
1579
		postInitBarrier_.Flush(buffer);
1580
	}
1581

1582
	if (useRenderThread_) {
1583
		{
1584
			VLOG("PUSH: Frame[%d]", curFrame);
1585
			VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::SYNC);
1586
			task->frame = curFrame;
1587
			{
1588
				std::unique_lock<std::mutex> lock(pushMutex_);
1589
				renderThreadQueue_.push(task);
1590
				renderThreadQueue_.back()->steps = std::move(steps_);
1591
				pushCondVar_.notify_one();
1592
			}
1593
			steps_.clear();
1594
		}
1595

1596
		{
1597
			std::unique_lock<std::mutex> lock(syncMutex_);
1598
			// Wait for the flush to be hit, since we're syncing.
1599
			while (!frameData.syncDone) {
1600
				VLOG("PUSH: Waiting for frame[%d].syncDone = 1 (sync)", curFrame);
1601
				syncCondVar_.wait(lock);
1602
			}
1603
			frameData.syncDone = false;
1604
		}
1605
	} else {
1606
		VKRRenderThreadTask task(VKRRunType::SYNC);
1607
		task.frame = curFrame;
1608
		task.steps = std::move(steps_);
1609
		Run(task);
1610
		steps_.clear();
1611
	}
1612
}
1613

1614
void VulkanRenderManager::ResetStats() {
1615
	initTimeMs_.Reset();
1616
	totalGPUTimeMs_.Reset();
1617
	renderCPUTimeMs_.Reset();
1618
}
1619

1620
// Creates a pipeline layout consisting of a single descriptor set layout with one
// binding (descriptorCount 1) per entry in bindingTypes, plus one descriptor pool per
// in-flight frame. The result is registered in pipelineLayouts_ and must be released
// with DestroyPipelineLayout().
//
// bindingTypes/bindingTypesCount: binding i of the set gets the type bindingTypes[i].
//     The count must not exceed the capacity of VKRPipelineLayout::bindingTypes
//     (only checked by a debug assert).
// geoShadersEnabled: when true, UNIFORM_BUFFER_DYNAMIC_ALL bindings are also made
//     visible to the geometry stage.
// tag: debug name applied to the layout and to both Vulkan objects.
VKRPipelineLayout *VulkanRenderManager::CreatePipelineLayout(BindingType *bindingTypes, size_t bindingTypesCount, bool geoShadersEnabled, const char *tag) {
	VKRPipelineLayout *layout = new VKRPipelineLayout();
	layout->SetTag(tag);
	layout->bindingTypesCount = (uint32_t)bindingTypesCount;

	_dbg_assert_(bindingTypesCount <= ARRAY_SIZE(layout->bindingTypes));
	memcpy(layout->bindingTypes, bindingTypes, sizeof(BindingType) * bindingTypesCount);

	// Value-initialized so that an unrecognized binding type (which only trips a debug
	// assert below) never hands uninitialized descriptorType/stageFlags to the driver.
	VkDescriptorSetLayoutBinding bindings[VKRPipelineLayout::MAX_DESC_SET_BINDINGS]{};
	for (size_t i = 0; i < bindingTypesCount; i++) {
		VkDescriptorSetLayoutBinding &binding = bindings[i];
		binding.binding = (uint32_t)i;
		binding.descriptorCount = 1;
		binding.pImmutableSamplers = nullptr;

		switch (bindingTypes[i]) {
		case BindingType::COMBINED_IMAGE_SAMPLER:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
			binding.stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
			break;
		case BindingType::UNIFORM_BUFFER_DYNAMIC_VERTEX:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
			binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
			break;
		case BindingType::UNIFORM_BUFFER_DYNAMIC_ALL:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
			binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
			if (geoShadersEnabled) {
				binding.stageFlags |= VK_SHADER_STAGE_GEOMETRY_BIT;
			}
			break;
		case BindingType::STORAGE_BUFFER_VERTEX:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
			binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
			break;
		case BindingType::STORAGE_BUFFER_COMPUTE:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
			binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
			break;
		case BindingType::STORAGE_IMAGE_COMPUTE:
			binding.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
			binding.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
			break;
		default:
			_dbg_assert_(false);
			break;
		}
	}

	VkDescriptorSetLayoutCreateInfo dsl = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
	dsl.bindingCount = (uint32_t)bindingTypesCount;
	dsl.pBindings = bindings;
	VkResult res = vkCreateDescriptorSetLayout(vulkan_->GetDevice(), &dsl, nullptr, &layout->descriptorSetLayout);
	_assert_(VK_SUCCESS == res && layout->descriptorSetLayout);

	// The pipeline layout uses exactly one descriptor set: the one created above.
	VkPipelineLayoutCreateInfo pl = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
	VkDescriptorSetLayout setLayouts[1] = { layout->descriptorSetLayout };
	pl.setLayoutCount = ARRAY_SIZE(setLayouts);
	pl.pSetLayouts = setLayouts;
	res = vkCreatePipelineLayout(vulkan_->GetDevice(), &pl, nullptr, &layout->pipelineLayout);
	_assert_(VK_SUCCESS == res && layout->pipelineLayout);

	vulkan_->SetDebugName(layout->descriptorSetLayout, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT, tag);
	vulkan_->SetDebugName(layout->pipelineLayout, VK_OBJECT_TYPE_PIPELINE_LAYOUT, tag);

	for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
		// Some games go beyond 1024 and end up having to resize like GTA, but most stay below so we start there.
		layout->frameData[i].pool.Create(vulkan_, bindingTypes, (uint32_t)bindingTypesCount, 1024);
	}

	pipelineLayouts_.push_back(layout);
	return layout;
}
1692

1693
// Unregisters `layout` from pipelineLayouts_ (first match only) and queues a callback
// on the context's delete list that destroys the per-frame descriptor pools and both
// Vulkan layout objects, and finally frees the VKRPipelineLayout itself.
void VulkanRenderManager::DestroyPipelineLayout(VKRPipelineLayout *layout) {
	auto found = std::find(pipelineLayouts_.begin(), pipelineLayouts_.end(), layout);
	if (found != pipelineLayouts_.end()) {
		pipelineLayouts_.erase(found);
	}

	vulkan_->Delete().QueueCallback([](VulkanContext *vulkan, void *userdata) {
		VKRPipelineLayout *doomed = static_cast<VKRPipelineLayout *>(userdata);
		// The pools must be gone before the layout is deleted; the destructor asserts on it.
		for (int frame = 0; frame < VulkanContext::MAX_INFLIGHT_FRAMES; frame++) {
			doomed->frameData[frame].pool.DestroyImmediately();
		}
		vkDestroyPipelineLayout(vulkan->GetDevice(), doomed->pipelineLayout, nullptr);
		vkDestroyDescriptorSetLayout(vulkan->GetDevice(), doomed->descriptorSetLayout, nullptr);

		delete doomed;
	}, layout);
}
1711

1712
void VulkanRenderManager::FlushDescriptors(int frame) {
1713
	for (auto iter : pipelineLayouts_) {
1714
		iter->FlushDescSets(vulkan_, frame, &frameData_[frame].profile);
1715
	}
1716
}
1717

1718
void VulkanRenderManager::ResetDescriptorLists(int frame) {
1719
	for (auto iter : pipelineLayouts_) {
1720
		VKRPipelineLayout::FrameData &data = iter->frameData[frame];
1721

1722
		data.flushedDescriptors_ = 0;
1723
		data.descSets_.clear();
1724
		data.descData_.clear();
1725
	}
1726
}
1727

1728
// The descriptor pools have to be destroyed before the layout object is deleted
// (VulkanRenderManager::DestroyPipelineLayout does that in its deferred callback).
// Only the first frame's pool is checked here.
VKRPipelineLayout::~VKRPipelineLayout() {
	_assert_(frameData[0].pool.IsDestroyed());
}
1731

1732
// Allocates and writes Vulkan descriptor sets for every pending entry of the given
// frame that has not been flushed yet (entries from flushedDescriptors_ onward).
//
// vulkan:  context providing the device used for vkUpdateDescriptorSets.
// frame:   in-flight frame index, must be < VulkanContext::MAX_INFLIGHT_FRAMES.
// profile: receives the number of sets written and the number deduplicated.
//
// An entry whose packed descriptors are identical to those of the entry right before
// it reuses that entry's descriptor set instead of getting a new one.
void VKRPipelineLayout::FlushDescSets(VulkanContext *vulkan, int frame, QueueProfileContext *profile) {
	_dbg_assert_(frame < VulkanContext::MAX_INFLIGHT_FRAMES);

	FrameData &data = frameData[frame];

	VulkanDescSetPool &pool = data.pool;
	FastVec<PackedDescriptor> &descData = data.descData_;
	FastVec<PendingDescSet> &descSets = data.descSets_;

	pool.Reset();

	// Descriptor sets are allocated from the pool in batches of this size.
	VkDescriptorSet setCache[8];
	VkDescriptorSetLayout layoutsForAlloc[ARRAY_SIZE(setCache)];
	for (size_t i = 0; i < ARRAY_SIZE(setCache); i++) {
		layoutsForAlloc[i] = descriptorSetLayout;
	}
	size_t setsUsed = ARRAY_SIZE(setCache);  // To allocate immediately.

	// This will write all descriptors.
	// Initially, we just do a simple look-back comparing to the previous descriptor to avoid sequential dupes.
	// In theory, we could multithread this. Gotta be a lot of descriptors for that to be worth it though.

	// Initially, let's do naive single desc set writes.
	VkWriteDescriptorSet writes[MAX_DESC_SET_BINDINGS];
	VkDescriptorImageInfo imageInfo[MAX_DESC_SET_BINDINGS];  // just picked a practical number
	VkDescriptorBufferInfo bufferInfo[MAX_DESC_SET_BINDINGS];

	// Preinitialize fields that won't change.
	for (size_t i = 0; i < ARRAY_SIZE(writes); i++) {
		writes[i].descriptorCount = 1;
		writes[i].dstArrayElement = 0;
		writes[i].pTexelBufferView = nullptr;
		writes[i].pNext = nullptr;
		writes[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
	}

	size_t start = data.flushedDescriptors_;
	int writeCount = 0, dedupCount = 0;

	for (size_t index = start; index < descSets.size(); index++) {
		auto &d = descSets[index];

		// This is where we look up to see if we already have an identical descriptor previously in the array.
		// We could do a simple custom hash map here that doesn't handle collisions, since those won't matter.
		// Instead, for now we just check history one item backwards. Good enough, it seems.
		// Any entry after the first one handled in this call qualifies: its predecessor
		// had its set assigned in the previous iteration of this loop.
		if (index > start) {
			const auto &prev = descSets[index - 1];
			if (prev.count == d.count &&
				!memcmp(descData.data() + d.offset, descData.data() + prev.offset, d.count * sizeof(PackedDescriptor))) {
				d.set = prev.set;
				dedupCount++;
				continue;
			}
		}

		if (setsUsed < ARRAY_SIZE(setCache)) {
			d.set = setCache[setsUsed++];
		} else {
			// Allocate in small batches.
			bool success = pool.Allocate(setCache, ARRAY_SIZE(setCache), layoutsForAlloc);
			_dbg_assert_(success);
			d.set = setCache[0];
			setsUsed = 1;
		}

		// TODO: Build up bigger batches of writes.
		const PackedDescriptor *descs = descData.begin() + d.offset;
		int numWrites = 0;
		int numBuffers = 0;
		int numImages = 0;
		for (int i = 0; i < d.count; i++) {
			if (!descs[i].image.view) {  // This automatically also checks for an null buffer due to the union.
				// Nothing bound at this slot, so no write is emitted for binding i.
				continue;
			}
			switch (this->bindingTypes[i]) {
			case BindingType::COMBINED_IMAGE_SAMPLER:
				_dbg_assert_(descs[i].image.sampler != VK_NULL_HANDLE);
				_dbg_assert_(descs[i].image.view != VK_NULL_HANDLE);
				imageInfo[numImages].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
				imageInfo[numImages].imageView = descs[i].image.view;
				imageInfo[numImages].sampler = descs[i].image.sampler;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
				writes[numWrites].pImageInfo = &imageInfo[numImages];
				writes[numWrites].pBufferInfo = nullptr;
				numImages++;
				break;
			case BindingType::STORAGE_IMAGE_COMPUTE:
				_dbg_assert_(descs[i].image.view != VK_NULL_HANDLE);
				imageInfo[numImages].imageLayout = VK_IMAGE_LAYOUT_GENERAL;
				imageInfo[numImages].imageView = descs[i].image.view;
				imageInfo[numImages].sampler = VK_NULL_HANDLE;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
				writes[numWrites].pImageInfo = &imageInfo[numImages];
				writes[numWrites].pBufferInfo = nullptr;
				numImages++;
				break;
			case BindingType::STORAGE_BUFFER_VERTEX:
			case BindingType::STORAGE_BUFFER_COMPUTE:
				_dbg_assert_(descs[i].buffer.buffer != VK_NULL_HANDLE);
				bufferInfo[numBuffers].buffer = descs[i].buffer.buffer;
				bufferInfo[numBuffers].range = descs[i].buffer.range;
				bufferInfo[numBuffers].offset = descs[i].buffer.offset;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
				writes[numWrites].pBufferInfo = &bufferInfo[numBuffers];
				writes[numWrites].pImageInfo = nullptr;
				numBuffers++;
				break;
			case BindingType::UNIFORM_BUFFER_DYNAMIC_ALL:
			case BindingType::UNIFORM_BUFFER_DYNAMIC_VERTEX:
				_dbg_assert_(descs[i].buffer.buffer != VK_NULL_HANDLE);
				bufferInfo[numBuffers].buffer = descs[i].buffer.buffer;
				bufferInfo[numBuffers].range = descs[i].buffer.range;
				// Dynamic uniform buffers are always written with a base offset of 0 here.
				bufferInfo[numBuffers].offset = 0;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
				writes[numWrites].pBufferInfo = &bufferInfo[numBuffers];
				writes[numWrites].pImageInfo = nullptr;
				numBuffers++;
				break;
			}
			writes[numWrites].dstBinding = i;
			writes[numWrites].dstSet = d.set;
			numWrites++;
		}

		vkUpdateDescriptorSets(vulkan->GetDevice(), numWrites, writes, 0, nullptr);

		writeCount++;
	}

	// Remember how far we got so a later call only handles entries added since.
	data.flushedDescriptors_ = (int)descSets.size();
	profile->descriptorsWritten += writeCount;
	profile->descriptorsDeduped += dedupCount;
}
1865

1866
// Debug-only consistency check over the steps queued so far; compiles to nothing
// unless _DEBUG is set.
void VulkanRenderManager::SanityCheckPassesOnAdd() {
#if _DEBUG
	// Check that we don't have any previous passes that write to the backbuffer, that must ALWAYS be the last one.
	// Concretely: every RENDER step already queued must have a non-null framebuffer.
	const size_t stepCount = steps_.size();
	for (size_t i = 0; i < stepCount; i++) {
		if (steps_[i]->stepType != VKRStepType::RENDER) {
			continue;
		}
		_dbg_assert_(steps_[i]->render.framebuffer != nullptr);
	}
#endif
}
1876

1877
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

Product

Resources

Company