Path: blob/master/thirdparty/embree/kernels/common/alloc.h
9905 views
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "../builders/primref.h"

#include "../../common/tasking/taskscheduler.h"

namespace embree
{
  /*! Fast scalable memory allocator used by the BVH builders.
   *
   *  Memory is handed out from large blocks (up to ~2MB). Each thread
   *  owns small thread-local sub-allocators (ThreadLocal) that bump-allocate
   *  from a private block; new blocks are fetched from per-slot lists or a
   *  global free list under a mutex. Blocks can come from aligned malloc,
   *  OS allocation (optionally huge pages), USM (device memory), or be
   *  shared (user-provided) memory. */
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
    /* NOTE(review): macro instead of a static const so it can reference
     * maxAlignment before the class is complete; ~2MB minus alignment slack. */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    /*! number of slots used to reduce contention on the global block lists */
    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the threads memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1); // bytes needed to reach alignment
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs; // roll back the speculative bump on failure

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); // partial=true: may return less than requested
        bytesWasted += end-cur; // remainder of old block is abandoned
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); // partial=false: full block requested
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;   //!< enclosing pair of thread-local allocators
      char* ptr;              //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return; // fast path: already bound
        Lock<MutexSys> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        /* flush statistics of the previously bound allocator before rebinding */
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this); // register with the allocator so it can unbind us later
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<MutexSys> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        /* account remaining thread-local statistics back to the allocator */
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      MutexSys mutex;                    //!< protects bind/unbind against concurrent unbind
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };

    /*! construction of the allocator
     *  \param device          device used for memory monitoring and USM allocation
     *  \param osAllocation    use OS allocation (os_malloc) for blocks instead of aligned malloc
     *  \param useUSM          allocate blocks as USM (device) memory; incompatible with osAllocation
     *  \param blockAllocation whether new blocks may be allocated on demand */
    FastAllocator (Device* device,
                   bool osAllocation,
                   bool useUSM = false,
                   bool blockAllocation = true)
      : device(device)
      , slotMask(0)
      , defaultBlockSize(PAGE_SIZE)
      , estimatedSize(0)
      , growSize(PAGE_SIZE)
      , maxGrowSize(maxAllocationSize)
      , usedBlocks(nullptr)
      , freeBlocks(nullptr)
      , useUSM(useUSM)
      , blockAllocation(blockAllocation)
      , use_single_mode(false)
      , log2_grow_size_scale(0)
      , bytesUsed(0)
      , bytesFree(0)
      , bytesWasted(0)
      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
      , primrefarray(device,0)
    {
      if (osAllocation && useUSM)
        abort(); //throw std::runtime_error("USM allocation cannot be combined with OS allocation.");

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        //assert(!slotMutex[i].isLocked());
      }
    }

    /*! destructor frees all allocated memory */
    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    /*! takes ownership of the primref array so its memory can be reused for nodes */
    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    /*! returns ownership of the primref array to the caller */
    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    /*! switches between OS allocation and aligned malloc for future blocks */
    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators, creating them lazily per thread */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<MutexSys> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc)); // global list owns the allocator
      }
      return alloc;
    }

  public:

    /*! registers a thread-local allocator pair with this allocator (called from bind) */
    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<MutexSys> lock(s_thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    /*! Lightweight handle combining the allocator with a thread's two
     *  local sub-allocators; cheap to copy into builder tasks. */
    struct CachedAllocator
    {
      /*! null construction; only nullptr may be passed */
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      /*! in single mode both handles point at alloc0 */
      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      /*! true when bound to an allocator */
      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      /*! allocates from the first thread-local allocator */
      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      /*! allocates from the first thread-local allocator */
      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      /*! allocates from the second thread-local allocator (alloc0 in single mode) */
      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    /*! returns a cached allocator for the calling thread */
    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };

    /*! moves all per-slot thread blocks onto the global used block list */
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }

    /*! rounds up to a multiple of 128 bytes */
    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots by still guaranteeing
       * the mainAllocationOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      /* device-level overrides of the heuristics above */
      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);

    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    /*! doubles the grow-size scale (capped at 2^16) and returns the new scale factor */
    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr == nullptr && !blockAllocation)
            abort(); //throw std::bad_alloc();
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<MutexSys> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) { // recheck under lock: another thread may have installed a block
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<MutexSys> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              /* pop a block off the free list and publish it for this slot */
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<MutexSys> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      /* align the user pointer up to maxAlignment; the skipped bytes count as waste */
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }

    /* special allocation only used from morton builder only a single time for each build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    /*! Used/free/wasted byte statistics, filterable by allocation type. */
    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      /*! gathers statistics over all blocks of the allocator matching atype/huge_pages */
      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); // free blocks are entirely free
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      /*! formats the statistics as a human readable string */
      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    /*! returns statistics for the given allocation type */
    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    /*! returns total used bytes (accumulated from unbound thread locals) */
    size_t getUsedBytes() {
      return bytesUsed;
    }

    /*! returns total wasted bytes (accumulated from unbound thread locals) */
    size_t getWastedBytes() {
      return bytesWasted;
    }

    /*! Aggregated statistics broken down per allocation type. */
    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)

        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)

        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      /*! prints all statistics to stdout */
      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    /*! prints configuration and the used/free block lists to stdout */
    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize
                << ", slotMask = " << slotMask
                << ", use_single_mode = " << use_single_mode
                << ", maxGrowSize = " << maxGrowSize
                << ", defaultBlockSize = " << defaultBlockSize
                << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    /*! A large memory block that thread-local allocators carve up.
     *  The Block header is placement-constructed at the start of the
     *  allocation; usable memory begins at data[]. Blocks form singly
     *  linked lists via next. */
    struct Block
    {
      /*! allocates aligned memory either from USM (device) or the host heap */
      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
      {
        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
        else        return alignedMalloc (bytesAllocate, bytesAlignment);
      }

      /*! frees memory allocated by blockAlignedMalloc */
      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
      {
        if (useUSM) return device->free(ptr);
        else        return alignedFree(ptr);
      }

      /*! allocates and placement-constructs a new block of the given type */
      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages with using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      /*! unlinks all SHARED blocks from the list and returns the new head */
      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else                         prev_next = &block->next;
        }
        return head;
      }

      /*! frees this block and all blocks linked after it */
      void clear_list(Device* device, bool useUSM)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device, useUSM);
          block = next;
        }
      }

      /*! frees a single block according to its allocation type */
      void clear_block (Device* device, bool useUSM)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          blockAlignedFree(device, useUSM, this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
          /* shared blocks are owned by the user; nothing to free */
        }
      }

      /*! thread-safe bump allocation inside this block; may return less than
       *  requested when partial is true (bytes_in is updated accordingly) */
      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; // cheap pre-check before fetch_add
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;   // recheck: another thread may have raced
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i); // clamp a partial allocation to what is left

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); // report newly committed reserve memory
        }
        return &data[i];
      }

      /*! pointer to the current allocation position */
      void* ptr() {
        return &data[cur];
      }

      /*! rewinds the block so its memory can be handed out again */
      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd); // cur may overshoot reserveEnd after failed fetch_add races
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      /*! checks whether this block matches the queried allocation type */
      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else                           return atype_i == atype;
      }

      /*! sums used bytes over this block list, filtered by type */
      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      /*! sums free bytes over this block list, filtered by type */
      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      /*! sums wasted bytes over this block list, filtered by type */
      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      /*! sums allocated bytes over this block list, filtered by type */
      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      /*! prints a compact one-block summary: type letter + [used, free, wasted] */
      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  public:
    static const size_t blockHeaderSize = offsetof(Block,data[0]);

  private:
    Device* device;           //!< device for memory monitoring and USM allocation
    size_t slotMask;          //!< mask selecting the thread block slot from the thread ID
    size_t defaultBlockSize;  //!< block size handed to thread-local allocators
    size_t estimatedSize;     //!< estimated final size of the build
    size_t growSize;          //!< size of the next main block to allocate
    size_t maxGrowSize;       //!< upper bound for growSize scaling

    MutexSys mutex;                                              //!< protects the global block lists
    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];             //!< per-slot locks for parallel block creation
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    bool useUSM;                     //!< allocate blocks as USM (device) memory
    bool blockAllocation = true;     //!< whether new blocks may be allocated on demand
    bool use_single_mode;            //!< use only a single thread-local allocator per thread

    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;                  //!< per-thread allocator pair
    static MutexSys s_thread_local_allocators_lock;                         //!< protects the global allocator list
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; //!< owns all thread-local allocator pairs

    std::vector<ThreadLocal2*> thread_local_allocators; //!< allocators currently bound to this instance
    AllocationType atype;                               //!< allocation mode used for new blocks

    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}