Path: blob/master/servers/rendering/multi_uma_buffer.h
20884 views
/**************************************************************************/1/* multi_uma_buffer.h */2/**************************************************************************/3/* This file is part of: */4/* GODOT ENGINE */5/* https://godotengine.org */6/**************************************************************************/7/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */8/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */9/* */10/* Permission is hereby granted, free of charge, to any person obtaining */11/* a copy of this software and associated documentation files (the */12/* "Software"), to deal in the Software without restriction, including */13/* without limitation the rights to use, copy, modify, merge, publish, */14/* distribute, sublicense, and/or sell copies of the Software, and to */15/* permit persons to whom the Software is furnished to do so, subject to */16/* the following conditions: */17/* */18/* The above copyright notice and this permission notice shall be */19/* included in all copies or substantial portions of the Software. */20/* */21/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */22/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */23/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */24/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */25/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */26/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */27/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/28/**************************************************************************/2930#pragma once3132#include "servers/rendering/rendering_server.h"3334class MultiUmaBufferBase {35protected:36LocalVector<RID> buffers;37uint32_t curr_idx = UINT32_MAX;38uint64_t last_frame_mapped = UINT64_MAX;39const uint32_t max_extra_buffers;40#ifdef DEBUG_ENABLED41const char *debug_name;42#endif4344MultiUmaBufferBase(uint32_t p_max_extra_buffers, const char *p_debug_name) :45max_extra_buffers(p_max_extra_buffers)46#ifdef DEBUG_ENABLED47,48debug_name(p_debug_name)49#endif50{51}5253#ifdef DEV_ENABLED54~MultiUmaBufferBase() {55DEV_ASSERT(buffers.is_empty() && "Forgot to call uninit()!");56}57#endif5859public:60void uninit() {61if (is_print_verbose_enabled()) {62print_line("MultiUmaBuffer '"63#ifdef DEBUG_ENABLED64+ String(debug_name) +65#else66"{DEBUG_ENABLED unavailable}"67#endif68"' used a total of " + itos(buffers.size()) +69" buffers. A large number may indicate a waste of VRAM and can be brought down by tweaking MAX_EXTRA_BUFFERS for this buffer.");70}7172RenderingDevice *rd = RD::RenderingDevice::get_singleton();7374for (RID buffer : buffers) {75if (buffer.is_valid()) {76rd->free_rid(buffer);77}78}7980buffers.clear();81}8283void shrink_to_max_extra_buffers() {84DEV_ASSERT(curr_idx == 0u && "This function can only be called after reset and before being upload_and_advance again!");8586RenderingDevice *rd = RD::RenderingDevice::get_singleton();8788uint32_t elem_count = buffers.size();8990if (elem_count > max_extra_buffers) {91if (is_print_verbose_enabled()) {92print_line("MultiUmaBuffer '"93#ifdef DEBUG_ENABLED94+ String(debug_name) +95#else96"{DEBUG_ENABLED unavailable}"97#endif98"' peaked to " + itos(elem_count) + " elements and shrinking it to " + itos(max_extra_buffers) +99". 
If you see this message often, then something is wrong with rendering or MAX_EXTRA_BUFFERS needs to be increased.");100}101}102103while (elem_count > max_extra_buffers) {104--elem_count;105if (buffers[elem_count].is_valid()) {106rd->free_rid(buffers[elem_count]);107}108buffers.remove_at(elem_count);109}110}111};112113enum class MultiUmaBufferType : uint8_t {114UNIFORM,115STORAGE,116VERTEX,117};118119/// Interface for making it easier to work with UMA.120///121/// # What is UMA?122///123/// It stands for Unified Memory Architecture. There are two kinds of UMA:124/// 1. HW UMA. This is the case of iGPUs (specially Android, iOS, Apple ARM-based macOS, PS4 & PS5)125/// The CPU and GPU share the same die and same memory. So regular RAM and VRAM are internally the126/// same thing. There may be some differences between them in practice due to cache synchronization127/// behaviors or the regular BW RAM may be purposely throttled (as is the case of PS4 & PS5).128/// 2. "Pretended UMA". On PC Desktop GPUs with ReBAR enabled can pretend VRAM behaves like normal129/// RAM, while internally the data is moved across the PCIe Bus. This can cause differences130/// in execution time of the routines that write to GPU buffers as the region is often uncached131/// (i.e. write-combined) and PCIe latency and BW is vastly different from regular RAM.132/// Without ReBAR, the amount of UMA memory is limited to 256MB (shared by the entire system).133///134/// Since often this type of memory is uncached, it is not well-suited for downloading GPU -> CPU,135/// but rather for uploading CPU -> GPU.136///137/// # When to use UMA buffers?138///139/// UMA buffers have various caveats and improper usage might lead to visual glitches. Therefore they140/// should be used sparingly, where it makes a difference. Does all of the following check?:141/// 1. Data is uploaded from CPU to GPU every (or almost every) frame.142/// 2. Data is always uploaded from scratch. 
///	   Partial uploads are unsupported.
///	3. If uploading multiple times per frame (e.g. for multiple passes), the number of uploads
///	   per frame is relatively stable (occasional spikes are fine if using MAX_EXTRA_BUFFERS).
///
/// # Why the caveats?
///
/// This is due to our inability to detect race conditions. If you write to a UMA buffer, submit
/// GPU commands and then write more data to it, we can't guarantee that you won't be writing to a
/// region the GPU is currently reading from. Tools like the validation layers cannot detect this
/// race condition at all, making it very hard to troubleshoot.
///
/// Therefore the safest approach is to use an interface that forces users to upload everything at once.
/// There is one exception for performance: map_raw_for_upload() will return a pointer, and it is your
/// responsibility to make sure you don't use that pointer again after submitting.
/// USE THIS API CALL SPARINGLY AND WITH CARE.
///
/// Since we forbid uploading more data after we've uploaded to it, this interface will create
/// more buffers. This means users will need more UniformSets (i.e. uniform_set_create).
///
/// # How to use
///
/// Example code 01:
///	MultiUmaBuffer<1> uma_buffer = MultiUmaBuffer<1>("Debug name displayed if run with --verbose");
///	uma_buffer.set_uniform_size(0, max_size_bytes);
///
///	for(uint32_t i = 0u; i < num_passes; ++i) {
///		uma_buffer.prepare_for_upload(); // Creates a new buffer (if none exists already)
///		                                 // of max_size_bytes. Must be called.
///		uma_buffer.upload(0, src_data, size_bytes);
///
///		if(!uniform_set[i]) {
///			RD::Uniform u;
///			u.binding = 1;
///			u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
///			u.append_id(uma_buffer._get(0u));
///			uniform_set[i] = rd->uniform_set_create( ... );
///		}
///	}
///
///	// On shutdown (or if you need to call set_size again).
///	uma_buffer.uninit();
///
/// Example code 02:
///
///	uma_buffer.prepare_for_upload();
///	RID rid = uma_buffer.get_for_upload(0u);
///	rd->buffer_update(rid, 0, sizeof(BakeParameters), &bake_parameters);
///	RD::Uniform u; // Skipping full initialization of u. See Example 01.
///	u.append_id(rid);
///
/// Example code 03:
///
///	uma_buffer.prepare_for_upload(); // Required before map_raw_for_upload(), as in the other examples.
///	void *dst_data = uma_buffer.map_raw_for_upload(0u);
///	memcpy(dst_data, src_data, size_bytes);
///	rd->buffer_flush(uma_buffer._get(0u));
///	RD::Uniform u; // Skipping full initialization of u. See Example 01.
///	u.append_id(uma_buffer._get(0u));
///
/// # Tricks
///
/// Godot's shadow mapping code calls uma_buffer.uniform_buffers._get(-p_pass_offset) (i.e. a negative value)
/// because for various reasons its shadow mapping code was written like this:
///
///	for( uint32_t i = 0u; i < num_passes; ++i ) {
///		uma_buffer.prepare_for_upload();
///		uma_buffer.upload(0, src_data, size_bytes);
///	}
///	for( uint32_t i = 0u; i < num_passes; ++i ) {
///		RD::Uniform u;
///		u.binding = 1;
///		u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
///		u.append_id(uma_buffer._get(-(num_passes - 1u - i)));
///		uniform_set[i] = rd->uniform_set_create( ... );
///	}
///
/// Every time prepare_for_upload() is called, uma_buffer._get(-idx) will return a different RID(*).
/// Thus with a negative value we can address previous ones. This is fine as long as the value idx
/// doesn't exceed the number of times the user called prepare_for_upload() for this frame.
///
/// (*) This RID will be returned again on the next frame after the same number of prepare_for_upload()
/// calls, unless the number of times it was called exceeded MAX_EXTRA_BUFFERS.
///
/// # Template parameters
///
/// ## NUM_BUFFERS
///
/// How many buffers we should track. e.g.
/// instead of doing this:
///	MultiUmaBuffer<1> omni_lights = /*...*/;
///	MultiUmaBuffer<1> spot_lights = /*...*/;
///	MultiUmaBuffer<1> directional_lights = /*...*/;
///
///	omni_lights.set_uniform_size(0u, omni_size);
///	spot_lights.set_uniform_size(0u, spot_size);
///	directional_lights.set_uniform_size(0u, dir_size);
///
///	omni_lights.prepare_for_upload();
///	spot_lights.prepare_for_upload();
///	directional_lights.prepare_for_upload();
///
/// You can do this:
///
///	MultiUmaBuffer<3> lights = /*...*/;
///
///	lights.set_uniform_size(0u, omni_size);
///	lights.set_uniform_size(1u, spot_size);
///	lights.set_uniform_size(2u, dir_size);
///
///	lights.prepare_for_upload();
///
/// This approach works as long as all buffers would call prepare_for_upload() at the same time.
/// It saves some overhead.
///
/// ## MAX_EXTRA_BUFFERS
///
/// Upper limit on the number of buffers per frame.
///
/// There are times where rendering might spike for exceptional reasons, calling prepare_for_upload()
/// too many times, never to do that again.
This will cause an increase in memory usage that will260/// never be reclaimed until shutdown.261///262/// MAX_EXTRA_BUFFERS can be used to handle such spikes, by deallocating the extra buffers.263/// Example:264/// MultiUmaBuffer<1, 6> buffer;265///266/// // Normal frame (assuming up to 6 passes is considered normal):267/// for(uint32_t i = 0u; i < 6u; ++i) {268/// buffer.prepare_for_upload();269/// ...270/// buffer.upload(...);271/// }272///273/// // Exceptional frame:274/// for(uint32_t i = 0u; i < 24u; ++i) {275/// buffer.prepare_for_upload();276/// ...277/// buffer.upload(...);278/// }279///280/// After the frame is done, those extra 18 buffers will be deleted.281/// Launching godot with --verbose will print diagnostic information.282template <uint32_t NUM_BUFFERS, uint32_t MAX_EXTRA_BUFFERS = UINT32_MAX>283class MultiUmaBuffer : public MultiUmaBufferBase {284struct BufferInfo {285uint32_t size_bytes = 0;286MultiUmaBufferType type = MultiUmaBufferType::UNIFORM;287};288BufferInfo buffer_info[NUM_BUFFERS];289#ifdef DEV_ENABLED290bool can_upload[NUM_BUFFERS] = {};291#endif292293void push() {294RenderingDevice *rd = RD::RenderingDevice::get_singleton();295for (uint32_t i = 0u; i < NUM_BUFFERS; ++i) {296const BufferInfo &info = buffer_info[i];297RID buffer;298switch (info.type) {299case MultiUmaBufferType::STORAGE:300buffer = rd->storage_buffer_create(info.size_bytes, Vector<uint8_t>(), BitField<RenderingDevice::StorageBufferUsage>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);301break;302case MultiUmaBufferType::VERTEX:303buffer = rd->vertex_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);304break;305case MultiUmaBufferType::UNIFORM:306default:307buffer = rd->uniform_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);308break;309}310buffers.push_back(buffer);311}312}313314public:315MultiUmaBuffer(const char *p_debug_name) :316MultiUmaBufferBase(MAX_EXTRA_BUFFERS, 
p_debug_name) {}317318uint32_t get_curr_idx() const { return curr_idx; }319320void set_size(uint32_t p_idx, uint32_t p_size_bytes, MultiUmaBufferType p_type) {321DEV_ASSERT(buffers.is_empty());322buffer_info[p_idx].size_bytes = p_size_bytes;323buffer_info[p_idx].type = p_type;324curr_idx = UINT32_MAX;325last_frame_mapped = UINT64_MAX;326}327328void set_size(uint32_t p_idx, uint32_t p_size_bytes, bool p_is_storage) {329set_size(p_idx, p_size_bytes, p_is_storage ? MultiUmaBufferType::STORAGE : MultiUmaBufferType::UNIFORM);330}331332void set_uniform_size(uint32_t p_idx, uint32_t p_size_bytes) {333set_size(p_idx, p_size_bytes, MultiUmaBufferType::UNIFORM);334}335336void set_storage_size(uint32_t p_idx, uint32_t p_size_bytes) {337set_size(p_idx, p_size_bytes, MultiUmaBufferType::STORAGE);338}339340void set_vertex_size(uint32_t p_idx, uint32_t p_size_bytes) {341set_size(p_idx, p_size_bytes, MultiUmaBufferType::VERTEX);342}343344uint32_t get_size(uint32_t p_idx) const { return buffer_info[p_idx].size_bytes; }345346// Gets the raw buffer. Use with care.347// If you call this function, make sure to have called prepare_for_upload() first.348// Do not call _get() then prepare_for_upload().349RID _get(uint32_t p_idx) {350return buffers[curr_idx * NUM_BUFFERS + p_idx];351}352353/**354* @param p_append True if you wish to append more data to existing buffer.355* @return False if it's possible to append. 
True if the internal buffer changed.356*/357bool prepare_for_map(bool p_append) {358RenderingDevice *rd = RD::RenderingDevice::get_singleton();359const uint64_t frames_drawn = rd->get_frames_drawn();360361if (last_frame_mapped == frames_drawn) {362if (!p_append) {363++curr_idx;364}365} else {366p_append = false;367curr_idx = 0u;368if (max_extra_buffers != UINT32_MAX) {369shrink_to_max_extra_buffers();370}371}372last_frame_mapped = frames_drawn;373if (curr_idx * NUM_BUFFERS >= buffers.size()) {374push();375}376377#ifdef DEV_ENABLED378if (!p_append) {379for (size_t i = 0u; i < NUM_BUFFERS; ++i) {380can_upload[i] = true;381}382}383#endif384return !p_append;385}386387void prepare_for_upload() {388prepare_for_map(false);389}390391void *map_raw_for_upload(uint32_t p_idx) {392#ifdef DEV_ENABLED393DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");394can_upload[p_idx] = false;395#endif396RenderingDevice *rd = RD::RenderingDevice::get_singleton();397return rd->buffer_persistent_map_advance(buffers[curr_idx * NUM_BUFFERS + p_idx]);398}399400RID get_for_upload(uint32_t p_idx) {401#ifdef DEV_ENABLED402DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");403can_upload[p_idx] = false;404#endif405return buffers[curr_idx * NUM_BUFFERS + p_idx];406}407408void upload(uint32_t p_idx, const void *p_src_data, uint32_t p_size_bytes) {409#ifdef DEV_ENABLED410DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");411can_upload[p_idx] = false;412#endif413RenderingDevice *rd = RD::RenderingDevice::get_singleton();414rd->buffer_update(buffers[curr_idx * NUM_BUFFERS + p_idx], 0, p_size_bytes, p_src_data, true);415}416};417418419