Path: blob/master/thirdparty/embree/kernels/subdiv/tessellation_cache.h
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

  struct __aligned(64) ThreadWorkState
  {
    ALIGNED_STRUCT_(64);

    std::atomic<size_t> counter;
    ThreadWorkState* next;
    bool allocated;

    __forceinline ThreadWorkState(bool allocated = false)
      : counter(0), next(nullptr), allocated(allocated)
    {
      assert( ((size_t)this % 64) == 0 );
    }
  };
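
  /* Note (illustrative, not part of the original header): ThreadWorkState::counter
     acts as a per-thread reference count. lockThread() below adds +1 per reader and
     unlockThread() adds -1, while the flush/reset code (assumed to live in the
     accompanying .cpp file) raises the counter by THREAD_BLOCK_ATOMIC_ADD (4) to
     block new readers. A reader whose fetch_add in lockThreadLoop() returns a value
     >= 4 therefore undoes its increment and waits in waitForUsersLessEqual() until
     the counter drops back to 0. */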
  class __aligned(64) SharedLazyTessellationCache
  {
  public:

    static const size_t NUM_CACHE_SEGMENTS              = 8;
    static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
    static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
    static const size_t REF_TAG_MASK                    = 0xffffffffff;
#else
    static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
#endif
    static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
    static const size_t BLOCK_SIZE                      = 64;

    /*! Per thread tessellation ref cache */
    static __thread ThreadWorkState* init_t_state;
    static ThreadWorkState* current_t_state;

    static __forceinline ThreadWorkState *threadState()
    {
      if (unlikely(!init_t_state))
        /* sets init_t_state, can't return pointer due to macosx icc bug */
        SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
      return init_t_state;
    }

    struct Tag
    {
      __forceinline Tag() : data(0) {}

      __forceinline Tag(void* ptr, size_t combinedTime) {
        init(ptr,combinedTime);
      }

      __forceinline Tag(size_t ptr, size_t combinedTime) {
        init((void*)ptr,combinedTime);
      }

      __forceinline void init(void* ptr, size_t combinedTime)
      {
        if (ptr == nullptr) {
          data = 0;
          return;
        }
        int64_t new_root_ref = (int64_t) ptr;
        new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
        assert( new_root_ref <= (int64_t)REF_TAG_MASK );
        new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
        data = new_root_ref;
      }

      __forceinline int64_t get() const { return data.load(); }
      __forceinline void set( int64_t v ) { data.store(v); }
      __forceinline void reset() { data.store(0); }

    private:
      atomic<int64_t> data;
    };

    static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

    struct CacheEntry
    {
      Tag tag;
      SpinLock mutex;
    };
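
    /* Illustrative packing example (hypothetical values, not from the original
       header): a tag stores the cache-relative byte offset of a patch in the low
       40 bits (REF_TAG_MASK on 64-bit builds) and the combined time in the bits
       above COMMIT_INDEX_SHIFT (32+8 = 40). For an offset of 0x1000 and a
       combined time of 3:

         int64_t tag = ((int64_t)3 << COMMIT_INDEX_SHIFT) | 0x1000; // what Tag::init() produces
         extractCommitIndex(tag);   // == 3
         tag & REF_TAG_MASK;        // == 0x1000, added back to getDataPtr() in lookup()
    */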
  private:

    float *data;
    bool hugepages;
    size_t size;
    size_t maxBlocks;
    ThreadWorkState *threadWorkState;

    __aligned(64) std::atomic<size_t> localTime;
    __aligned(64) std::atomic<size_t> next_block;
    __aligned(64) SpinLock reset_state;
    __aligned(64) SpinLock linkedlist_mtx;
    __aligned(64) std::atomic<size_t> switch_block_threshold;
    __aligned(64) std::atomic<size_t> numRenderThreads;

  public:

    SharedLazyTessellationCache();
    ~SharedLazyTessellationCache();

    void getNextRenderThreadWorkState();

    __forceinline size_t maxAllocSize() const {
      return switch_block_threshold;
    }

    __forceinline size_t getCurrentIndex() { return localTime.load(); }
    __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

    __forceinline size_t getTime(const size_t globalTime) {
      return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
    }

    __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
    __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

    __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

    static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
    static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
    static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
    static __forceinline size_t getState() { return threadState()->counter.load(); }
    static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

    static __forceinline size_t getTCacheTime(const size_t globalTime) {
      return sharedLazyTessellationCache.getTime(globalTime);
    }

    /* per thread lock */
    __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
    {
      while(1)
      {
        size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
        if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
        {
          /* lock failed wait until sync phase is over */
          sharedLazyTessellationCache.unlockThread(t_state,-1);
          sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
        }
        else
          break;
      }
    }

    static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = entry.tag.get();
      CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

      if (likely(subdiv_patch_root_ref != 0))
      {
        const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
        const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

        if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
        {
          CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
          return (void*) subdiv_patch_root;
        }
      }
      CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
      return nullptr;
    }

    template<typename Constructor>
    static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
    {
      ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

      while (true)
      {
        sharedLazyTessellationCache.lockThreadLoop(t_state);
        void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
        if (patch) return (decltype(constructor())) patch;

        if (entry.mutex.try_lock())
        {
          if (!validTag(entry.tag,globalTime))
          {
            auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
            auto ret = constructor(); // thread is locked here!
            assert(ret);
            /* this should never return nullptr */
            auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
            auto time = before ? timeBefore : timeAfter;
            __memory_barrier();
            entry.tag = SharedLazyTessellationCache::Tag(ret,time);
            __memory_barrier();
            entry.mutex.unlock();
            return ret;
          }
          entry.mutex.unlock();
        }
        SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
      }
    }
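
    /* A minimal usage sketch of the templated lookup() above (illustrative only;
       the 16-float payload and the fill code are hypothetical, the calls are the
       API from this header). On a miss the constructor runs while the calling
       thread holds its per-thread lock, so the returned memory stays valid until
       the caller releases that lock:

         SharedLazyTessellationCache::CacheEntry entry;   // typically stored per patch
         float* grid = SharedLazyTessellationCache::lookup(entry, globalTime, [&] () -> float* {
           float* ptr = (float*) SharedLazyTessellationCache::malloc(16*sizeof(float));
           // ... tessellate the patch into ptr (hypothetical) ...
           return ptr;
         });
         // ... read from grid ...
         SharedLazyTessellationCache::unlock();           // release the per-thread lock
    */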
    __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
    {
#if FORCE_SIMPLE_FLUSH == 1
      return i == getTime(globalTime);
#else
      return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
    }

    static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
    {
      return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
    }

    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
    {
      const int64_t subdiv_patch_root_ref = tag.get();
      if (subdiv_patch_root_ref == 0) return false;
      const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
      return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
    }

    void waitForUsersLessEqual(ThreadWorkState *const t_state,
                               const unsigned int users);

    __forceinline size_t alloc(const size_t blocks)
    {
      if (unlikely(blocks >= switch_block_threshold))
        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

      assert(blocks < switch_block_threshold);
      size_t index = next_block.fetch_add(blocks);
      if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
      return index;
    }

    static __forceinline void* malloc(const size_t bytes)
    {
      size_t block_index = -1;
      ThreadWorkState *const t_state = threadState();
      while (true)
      {
        block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
        if (block_index == (size_t)-1)
        {
          sharedLazyTessellationCache.unlockThread(t_state);
          sharedLazyTessellationCache.allocNextSegment();
          sharedLazyTessellationCache.lockThread(t_state);
          continue;
        }
        break;
      }
      return sharedLazyTessellationCache.getBlockPtr(block_index);
    }

    __forceinline void *getBlockPtr(const size_t block_index)
    {
      assert(block_index < maxBlocks);
      assert(data);
      assert(block_index*16 <= size);
      return (void*)&data[block_index*16];
    }

    __forceinline void*  getDataPtr()      { return data; }
    __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
    __forceinline size_t getMaxBlocks()    { return maxBlocks; }
    __forceinline size_t getSize()         { return size; }

    void allocNextSegment();
    void realloc(const size_t newSize);

    void reset();

    static SharedLazyTessellationCache sharedLazyTessellationCache;
  };
}
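
/* Allocation arithmetic (illustrative, not part of the original header): malloc()
   rounds a byte count up to 64-byte blocks, and getBlockPtr() converts a block
   index back into an address by stepping 16 floats (16 * 4 bytes = BLOCK_SIZE)
   through the float* data array. For a hypothetical request of 100 bytes:

     size_t blocks = (100 + BLOCK_SIZE - 1) / BLOCK_SIZE;  // == 2 blocks (128 bytes reserved)
     // alloc(2) claims block indices [index, index+2) via next_block.fetch_add(2)
     // getBlockPtr(index) == (void*)&data[index*16]
*/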