GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/kernels/subdiv/tessellation_cache.h
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif
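
/* Note: CACHE_STATS(x) expands to nothing in both branches, so the counters in
   SharedTessellationCacheStats below are only updated if this macro is
   redefined (e.g. to expand to x) for profiling builds. */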

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
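
  /* Per-thread work state. 'counter' acts as a per-thread reference count on
     the cache: lockThread()/unlockThread() add and subtract 1 per use, while a
     value of THREAD_BLOCK_ATOMIC_ADD or more indicates that a cache flush or
     resize is in progress and new lock attempts have to back off
     (see SharedLazyTessellationCache::lockThreadLoop()). */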
  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };
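
  /* Shared lazy tessellation cache: one large memory block split into
     NUM_CACHE_SEGMENTS segments that are recycled as 'localTime' advances.
     Cached data is referenced by its byte offset relative to the cache base
     pointer, which must fit into REF_TAG_MASK; the total cache size is
     therefore bounded by MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1. */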
  class __aligned(64) SharedLazyTessellationCache
  {
  public:

    static const size_t NUM_CACHE_SEGMENTS = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }
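
    /* A Tag packs a reference to cached data into a single 64-bit value: the
       low bits (masked by REF_TAG_MASK) hold the byte offset of the data
       relative to the cache base pointer, while the bits above
       COMMIT_INDEX_SHIFT hold the commit/time index used for validity checks
       (see extractCommitIndex() and validTag()). */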
    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };

  private:

    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
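
    /* Typical use (sketch, with hypothetical names): fetch the tessellated data
       for 'entry', building it on a miss. The constructor must allocate its
       result inside this cache (e.g. via SharedLazyTessellationCache::malloc)
       and must not return nullptr. On return the calling thread still holds its
       per-thread cache lock, which has to be released once the data is no
       longer accessed:

         float* grid = SharedLazyTessellationCache::lookup(entry, globalTime,
           [&] () -> float* { return buildGrid(patch); });  // buildGrid/patch: hypothetical
         ...
         SharedLazyTessellationCache::unlock();
    */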

    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }
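
    /* malloc() rounds the request up to whole BLOCK_SIZE blocks; when alloc()
       reports the current segment as full ((size_t)-1), the thread releases its
       cache lock, lets allocNextSegment() switch or flush segments, re-locks
       and retries. */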
    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      /* blocks are BLOCK_SIZE = 64 bytes, i.e. 16 floats, wide */
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}