GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/kernels/subdiv/tessellation_cache.h
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif
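
/* Note: CACHE_STATS(x) expands to nothing in both branches, so the counters in
   SharedTessellationCacheStats below are only updated if this macro is
   redefined (e.g. to expand to x) for profiling builds. */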

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
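
  /* Per-thread work state. 'counter' acts as a per-thread reference count on
     the cache: lockThread()/unlockThread() add and subtract 1 per use, while a
     value of THREAD_BLOCK_ATOMIC_ADD or more indicates that a cache flush or
     resize is in progress and new lock attempts have to back off
     (see SharedLazyTessellationCache::lockThreadLoop()). */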
  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };
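
  /* Shared lazy tessellation cache: one large memory block split into
     NUM_CACHE_SEGMENTS segments that are recycled as 'localTime' advances.
     Cached data is referenced by its byte offset relative to the cache base
     pointer, which must fit into REF_TAG_MASK; the total cache size is
     therefore bounded by MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1. */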
  class __aligned(64) SharedLazyTessellationCache
  {
  public:

    static const size_t NUM_CACHE_SEGMENTS = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }
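
    /* A Tag packs a reference to cached data into a single 64-bit value: the
       low bits (masked by REF_TAG_MASK) hold the byte offset of the data
       relative to the cache base pointer, while the bits above
       COMMIT_INDEX_SHIFT hold the commit/time index used for validity checks
       (see extractCommitIndex() and validTag()). */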
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
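
    /* Typical use (sketch, with hypothetical names): fetch the tessellated data
       for 'entry', building it on a miss. The constructor must allocate its
       result inside this cache (e.g. via SharedLazyTessellationCache::malloc)
       and must not return nullptr. On return the calling thread still holds its
       per-thread cache lock, which has to be released once the data is no
       longer accessed:

         float* grid = SharedLazyTessellationCache::lookup(entry, globalTime,
           [&] () -> float* { return buildGrid(patch); });  // buildGrid/patch: hypothetical
         ...
         SharedLazyTessellationCache::unlock();
    */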

    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }
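
    /* malloc() rounds the request up to whole BLOCK_SIZE blocks; when alloc()
       reports the current segment as full ((size_t)-1), the thread releases its
       cache lock, lets allocNextSegment() switch or flush segments, re-locks
       and retries. */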
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      /* blocks are BLOCK_SIZE = 64 bytes, i.e. 16 floats, wide */
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}