CoCalc -- multi_uma

GitHub Repository: godotengine/godot
Path: blob/master/servers/rendering/multi_uma_buffer.h
²⁰⁸⁸⁴ views
1
/**************************************************************************/
2
/*  multi_uma_buffer.h                                                    */
3
/**************************************************************************/
4
/*                         This file is part of:                          */
5
/*                             GODOT ENGINE                               */
6
/*                        https://godotengine.org                         */
7
/**************************************************************************/
8
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
10
/*                                                                        */
11
/* Permission is hereby granted, free of charge, to any person obtaining  */
12
/* a copy of this software and associated documentation files (the        */
13
/* "Software"), to deal in the Software without restriction, including    */
14
/* without limitation the rights to use, copy, modify, merge, publish,    */
15
/* distribute, sublicense, and/or sell copies of the Software, and to     */
16
/* permit persons to whom the Software is furnished to do so, subject to  */
17
/* the following conditions:                                              */
18
/*                                                                        */
19
/* The above copyright notice and this permission notice shall be         */
20
/* included in all copies or substantial portions of the Software.        */
21
/*                                                                        */
22
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
23
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
24
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
26
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
27
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
28
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
29
/**************************************************************************/
30

31
#pragma once
32

33
#include "servers/rendering/rendering_server.h"
34

35
class MultiUmaBufferBase {
36
protected:
37
	LocalVector<RID> buffers;
38
	uint32_t curr_idx = UINT32_MAX;
39
	uint64_t last_frame_mapped = UINT64_MAX;
40
	const uint32_t max_extra_buffers;
41
#ifdef DEBUG_ENABLED
42
	const char *debug_name;
43
#endif
44

45
	MultiUmaBufferBase(uint32_t p_max_extra_buffers, const char *p_debug_name) :
46
			max_extra_buffers(p_max_extra_buffers)
47
#ifdef DEBUG_ENABLED
48
			,
49
			debug_name(p_debug_name)
50
#endif
51
	{
52
	}
53

54
#ifdef DEV_ENABLED
55
	~MultiUmaBufferBase() {
56
		DEV_ASSERT(buffers.is_empty() && "Forgot to call uninit()!");
57
	}
58
#endif
59

60
public:
61
	void uninit() {
62
		if (is_print_verbose_enabled()) {
63
			print_line("MultiUmaBuffer '"
64
#ifdef DEBUG_ENABLED
65
					+ String(debug_name) +
66
#else
67
					   "{DEBUG_ENABLED unavailable}"
68
#endif
69
					"' used a total of " + itos(buffers.size()) +
70
					" buffers. A large number may indicate a waste of VRAM and can be brought down by tweaking MAX_EXTRA_BUFFERS for this buffer.");
71
		}
72

73
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
74

75
		for (RID buffer : buffers) {
76
			if (buffer.is_valid()) {
77
				rd->free_rid(buffer);
78
			}
79
		}
80

81
		buffers.clear();
82
	}
83

84
	void shrink_to_max_extra_buffers() {
85
		DEV_ASSERT(curr_idx == 0u && "This function can only be called after reset and before being upload_and_advance again!");
86

87
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
88

89
		uint32_t elem_count = buffers.size();
90

91
		if (elem_count > max_extra_buffers) {
92
			if (is_print_verbose_enabled()) {
93
				print_line("MultiUmaBuffer '"
94
#ifdef DEBUG_ENABLED
95
						+ String(debug_name) +
96
#else
97
						   "{DEBUG_ENABLED unavailable}"
98
#endif
99
						"' peaked to " + itos(elem_count) + " elements and shrinking it to " + itos(max_extra_buffers) +
100
						". If you see this message often, then something is wrong with rendering or MAX_EXTRA_BUFFERS needs to be increased.");
101
			}
102
		}
103

104
		while (elem_count > max_extra_buffers) {
105
			--elem_count;
106
			if (buffers[elem_count].is_valid()) {
107
				rd->free_rid(buffers[elem_count]);
108
			}
109
			buffers.remove_at(elem_count);
110
		}
111
	}
112
};
113

114
/// Kind of RenderingDevice buffer that MultiUmaBuffer::push() creates for a slot.
enum class MultiUmaBufferType : uint8_t {
	UNIFORM = 0, ///< Created with uniform_buffer_create(). Also the default for a slot.
	STORAGE = 1, ///< Created with storage_buffer_create().
	VERTEX = 2, ///< Created with vertex_buffer_create().
};
119

120
/// Interface for making it easier to work with UMA.
121
///
122
/// # What is UMA?
123
///
124
/// It stands for Unified Memory Architecture. There are two kinds of UMA:
125
///	 1. HW UMA. This is the case of iGPUs (especially Android, iOS, Apple ARM-based macOS, PS4 & PS5)
126
///		The CPU and GPU share the same die and same memory. So regular RAM and VRAM are internally the
127
///		same thing. There may be some differences between them in practice due to cache synchronization
128
///		behaviors or the regular BW RAM may be purposely throttled (as is the case of PS4 & PS5).
129
///  2. "Pretended UMA". On PC, desktop GPUs with ReBAR enabled can pretend VRAM behaves like normal
130
///		RAM, while internally the data is moved across the PCIe Bus. This can cause differences
131
///		in execution time of the routines that write to GPU buffers as the region is often uncached
132
///		(i.e. write-combined) and PCIe latency and BW is vastly different from regular RAM.
133
///		Without ReBAR, the amount of UMA memory is limited to 256MB (shared by the entire system).
134
///
135
/// Since often this type of memory is uncached, it is not well-suited for downloading GPU -> CPU,
136
/// but rather for uploading CPU -> GPU.
137
///
138
/// # When to use UMA buffers?
139
///
140
/// UMA buffers have various caveats and improper usage might lead to visual glitches. Therefore they
141
/// should be used sparingly, where it makes a difference. Do all of the following apply?
142
///	  1. Data is uploaded from CPU to GPU every (or almost every) frame.
143
///   2. Data is always uploaded from scratch. Partial uploads are unsupported.
144
///	  3. If uploading multiple times per frame (e.g. for multiple passes), the number of times
145
///      per frame is relatively stable (occasional spikes are fine if using MAX_EXTRA_BUFFERS).
146
///
147
/// # Why the caveats?
148
///
149
///	This is due to our inability to detect race conditions. If you write to an UMA buffer, submit
150
///	GPU commands and then write more data to it, we can't guarantee that you won't be writing to a
151
/// region the GPU is currently reading from. Tools like the validation layers cannot detect this
152
/// race condition at all, making it very hard to troubleshoot.
153
///
154
/// Therefore the safest approach is to use an interface that forces users to upload everything at once.
155
/// There is one exception for performance: map_raw_for_upload() will return a pointer, and it is your
156
/// responsibility to make sure you don't use that pointer again after submitting.
157
/// USE THIS API CALL SPARINGLY AND WITH CARE.
158
///
159
/// Since we forbid uploading more data after we've uploaded to it, this Interface will create
160
/// more buffers. This means users will need more UniformSets (i.e. uniform_set_create).
161
///
162
/// # How to use
163
///
164
/// Example code 01:
165
///		MultiUmaBuffer<1> uma_buffer = MultiUmaBuffer<1>("Debug name displayed if run with --verbose");
166
///		uma_buffer.set_uniform_size(0, max_size_bytes);
167
///
168
///		for(uint32_t i = 0u; i < num_passes; ++i) {
169
///			uma_buffer.prepare_for_upload(); // Creates a new buffer (if none exists already)
170
///											 // of max_size_bytes. Must be called.
171
///			uma_buffer.upload(0, src_data, size_bytes);
172
///
173
///			if(!uniform_set[i]) {
174
///				RD::Uniform u;
175
///				u.binding = 1;
176
///				u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
177
///				u.append_id(uma_buffer._get(0u));
178
///				uniform_set[i] = rd->uniform_set_create( ... );
179
///			}
180
///		}
181
///
182
///	  // On shutdown (or if you need to call set_size again).
183
///	  uma_buffer.uninit();
184
///
185
/// Example code 02:
186
///
187
///		uma_buffer.prepare_for_upload();
188
///		RID rid = uma_buffer.get_for_upload(0u);
189
///		rd->buffer_update(rid, 0, sizeof(BakeParameters), &bake_parameters);
190
///		RD::Uniform u; // Skipping full initialization of u. See Example 01.
191
///		u.append_id(rid);
192
///
193
/// Example code 03:
194
///
195
///		uma_buffer.prepare_for_upload(); // Must be called before mapping.
///		void *dst_data = uma_buffer.map_raw_for_upload(0u);
196
///		memcpy(dst_data, src_data, size_bytes);
197
///		rd->buffer_flush(uma_buffer._get(0u));
198
///		RD::Uniform u; // Skipping full initialization of u. See Example 01.
199
///		u.append_id(uma_buffer._get(0u));
200
///
201
/// # Tricks
202
///
203
///	Godot's shadow mapping code calls uma_buffer.uniform_buffers._get(-p_pass_offset) (i.e. a negative value)
204
/// because for various reasons its shadow mapping code was written like this:
205
///
206
///		for( uint32_t i = 0u; i < num_passes; ++i ) {
207
///			uma_buffer.prepare_for_upload();
208
///			uma_buffer.upload(0, src_data, size_bytes);
209
///		}
210
///		for( uint32_t i = 0u; i < num_passes; ++i ) {
211
///			RD::Uniform u;
212
///			u.binding = 1;
213
///			u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER_DYNAMIC;
214
///			u.append_id(uma_buffer._get(-(num_passes - 1u - i)));
215
///			uniform_set[i] = rd->uniform_set_create( ... );
216
///		}
217
///
218
/// Every time prepare_for_upload() is called, uma_buffer._get(-idx) will return a different RID(*).
219
/// Thus with a negative value we can address previous ones. This is fine as long as the value idx
220
/// doesn't exceed the number of times the user called prepare_for_upload() for this frame.
221
///
222
/// (*)This RID will be returned again on the next frame after the same amount of prepare_for_upload()
223
/// calls; unless the number of times it was called exceeded MAX_EXTRA_BUFFERS.
224
///
225
/// # Template parameters
226
///
227
///	## NUM_BUFFERS
228
///
229
/// How many buffers we should track. e.g. instead of doing this:
230
///		MultiUmaBuffer<1> omni_lights = /*...*/;
231
///		MultiUmaBuffer<1> spot_lights = /*...*/;
232
///		MultiUmaBuffer<1> directional_lights = /*...*/;
233
///
234
///		omni_lights.set_uniform_size(0u, omni_size);
235
///		spot_lights.set_uniform_size(0u, spot_size);
236
///		directional_lights.set_uniform_size(0u, dir_size);
237
///
238
///		omni_lights.prepare_for_upload();
239
///		spot_lights.prepare_for_upload();
240
///		directional_lights.prepare_for_upload();
241
///
242
/// You can do this:
243
///
244
///		MultiUmaBuffer<3> lights = /*...*/;
245
///
246
///		lights.set_uniform_size(0u, omni_size);
247
///		lights.set_uniform_size(1u, spot_size);
248
///		lights.set_uniform_size(2u, dir_size);
249
///
250
///		lights.prepare_for_upload();
251
///
252
/// This approach works as long as all buffers would call prepare_for_upload() at the same time.
253
/// It saves some overhead.
254
///
255
/// ## MAX_EXTRA_BUFFERS
256
///
257
/// Upper limit on the number of buffers per frame.
258
///
259
/// There are times where rendering might spike for exceptional reasons, calling prepare_for_upload()
260
/// too many times, never to do that again. This will cause an increase in memory usage that will
261
/// never be reclaimed until shutdown.
262
///
263
/// MAX_EXTRA_BUFFERS can be used to handle such spikes, by deallocating the extra buffers.
264
/// Example:
265
///		MultiUmaBuffer<1, 6> buffer;
266
///
267
///		// Normal frame (assuming up to 6 passes is considered normal):
268
///		for(uint32_t i = 0u; i < 6u; ++i) {
269
///			buffer.prepare_for_upload();
270
///			...
271
///			buffer.upload(...);
272
///		}
273
///
274
///		// Exceptional frame:
275
///		for(uint32_t i = 0u; i < 24u; ++i) {
276
///			buffer.prepare_for_upload();
277
///			...
278
///			buffer.upload(...);
279
///		}
280
///
281
///	After the frame is done, those extra 18 buffers will be deleted.
282
/// Launching godot with --verbose will print diagnostic information.
283
template <uint32_t NUM_BUFFERS, uint32_t MAX_EXTRA_BUFFERS = UINT32_MAX>
284
class MultiUmaBuffer : public MultiUmaBufferBase {
285
	struct BufferInfo {
286
		uint32_t size_bytes = 0;
287
		MultiUmaBufferType type = MultiUmaBufferType::UNIFORM;
288
	};
289
	BufferInfo buffer_info[NUM_BUFFERS];
290
#ifdef DEV_ENABLED
291
	bool can_upload[NUM_BUFFERS] = {};
292
#endif
293

294
	void push() {
295
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
296
		for (uint32_t i = 0u; i < NUM_BUFFERS; ++i) {
297
			const BufferInfo &info = buffer_info[i];
298
			RID buffer;
299
			switch (info.type) {
300
				case MultiUmaBufferType::STORAGE:
301
					buffer = rd->storage_buffer_create(info.size_bytes, Vector<uint8_t>(), BitField<RenderingDevice::StorageBufferUsage>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
302
					break;
303
				case MultiUmaBufferType::VERTEX:
304
					buffer = rd->vertex_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
305
					break;
306
				case MultiUmaBufferType::UNIFORM:
307
				default:
308
					buffer = rd->uniform_buffer_create(info.size_bytes, Vector<uint8_t>(), RD::BUFFER_CREATION_DYNAMIC_PERSISTENT_BIT);
309
					break;
310
			}
311
			buffers.push_back(buffer);
312
		}
313
	}
314

315
public:
316
	MultiUmaBuffer(const char *p_debug_name) :
317
			MultiUmaBufferBase(MAX_EXTRA_BUFFERS, p_debug_name) {}
318

319
	uint32_t get_curr_idx() const { return curr_idx; }
320

321
	void set_size(uint32_t p_idx, uint32_t p_size_bytes, MultiUmaBufferType p_type) {
322
		DEV_ASSERT(buffers.is_empty());
323
		buffer_info[p_idx].size_bytes = p_size_bytes;
324
		buffer_info[p_idx].type = p_type;
325
		curr_idx = UINT32_MAX;
326
		last_frame_mapped = UINT64_MAX;
327
	}
328

329
	void set_size(uint32_t p_idx, uint32_t p_size_bytes, bool p_is_storage) {
330
		set_size(p_idx, p_size_bytes, p_is_storage ? MultiUmaBufferType::STORAGE : MultiUmaBufferType::UNIFORM);
331
	}
332

333
	void set_uniform_size(uint32_t p_idx, uint32_t p_size_bytes) {
334
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::UNIFORM);
335
	}
336

337
	void set_storage_size(uint32_t p_idx, uint32_t p_size_bytes) {
338
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::STORAGE);
339
	}
340

341
	void set_vertex_size(uint32_t p_idx, uint32_t p_size_bytes) {
342
		set_size(p_idx, p_size_bytes, MultiUmaBufferType::VERTEX);
343
	}
344

345
	uint32_t get_size(uint32_t p_idx) const { return buffer_info[p_idx].size_bytes; }
346

347
	// Gets the raw buffer. Use with care.
348
	// If you call this function, make sure to have called prepare_for_upload() first.
349
	// Do not call _get() then prepare_for_upload().
350
	RID _get(uint32_t p_idx) {
351
		return buffers[curr_idx * NUM_BUFFERS + p_idx];
352
	}
353

354
	/**
355
	 * @param p_append	True if you wish to append more data to existing buffer.
356
	 * @return			False if it's possible to append. True if the internal buffer changed.
357
	 */
358
	bool prepare_for_map(bool p_append) {
359
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
360
		const uint64_t frames_drawn = rd->get_frames_drawn();
361

362
		if (last_frame_mapped == frames_drawn) {
363
			if (!p_append) {
364
				++curr_idx;
365
			}
366
		} else {
367
			p_append = false;
368
			curr_idx = 0u;
369
			if (max_extra_buffers != UINT32_MAX) {
370
				shrink_to_max_extra_buffers();
371
			}
372
		}
373
		last_frame_mapped = frames_drawn;
374
		if (curr_idx * NUM_BUFFERS >= buffers.size()) {
375
			push();
376
		}
377

378
#ifdef DEV_ENABLED
379
		if (!p_append) {
380
			for (size_t i = 0u; i < NUM_BUFFERS; ++i) {
381
				can_upload[i] = true;
382
			}
383
		}
384
#endif
385
		return !p_append;
386
	}
387

388
	void prepare_for_upload() {
389
		prepare_for_map(false);
390
	}
391

392
	void *map_raw_for_upload(uint32_t p_idx) {
393
#ifdef DEV_ENABLED
394
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
395
		can_upload[p_idx] = false;
396
#endif
397
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
398
		return rd->buffer_persistent_map_advance(buffers[curr_idx * NUM_BUFFERS + p_idx]);
399
	}
400

401
	RID get_for_upload(uint32_t p_idx) {
402
#ifdef DEV_ENABLED
403
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
404
		can_upload[p_idx] = false;
405
#endif
406
		return buffers[curr_idx * NUM_BUFFERS + p_idx];
407
	}
408

409
	void upload(uint32_t p_idx, const void *p_src_data, uint32_t p_size_bytes) {
410
#ifdef DEV_ENABLED
411
		DEV_ASSERT(can_upload[p_idx] && "Forgot to prepare_for_upload first! Or called get_for_upload/upload() twice.");
412
		can_upload[p_idx] = false;
413
#endif
414
		RenderingDevice *rd = RD::RenderingDevice::get_singleton();
415
		rd->buffer_update(buffers[curr_idx * NUM_BUFFERS + p_idx], 0, p_size_bytes, p_src_data, true);
416
	}
417
};
418

419
Product

Resources

Company