// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "../builders/primref.h"

#include "../../common/tasking/taskscheduler.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
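        /* bump allocation: 'ofs' pads 'cur' up to the next multiple of 'align'
         * (the bit trick assumes 'align' is a power of two) */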
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char* ptr;              //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Per-thread structure holding two thread-local allocators. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
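      /* Flushes the statistics of the previously bound allocator (if any),
       * re-initializes alloc0/alloc1 with the new allocator's block size, and
       * registers this ThreadLocal2 with the allocator via join(). */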
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<MutexSys> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<MutexSys> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      MutexSys mutex;
      std::atomic<FastAllocator*> alloc;  //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };

    FastAllocator (Device* device,
                   bool osAllocation,
                   bool useUSM = false,
                   bool blockAllocation = true)
      : device(device)
      , slotMask(0)
      , defaultBlockSize(PAGE_SIZE)
      , estimatedSize(0)
      , growSize(PAGE_SIZE)
      , maxGrowSize(maxAllocationSize)
      , usedBlocks(nullptr)
      , freeBlocks(nullptr)
      , useUSM(useUSM)
      , blockAllocation(blockAllocation)
      , use_single_mode(false)
      , log2_grow_size_scale(0)
      , bytesUsed(0)
      , bytesFree(0)
      , bytesWasted(0)
      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
      , primrefarray(device,0)
    {
      if (osAllocation && useUSM)
        abort(); //throw std::runtime_error("USM allocation cannot be combined with OS allocation.");

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        //assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<MutexSys> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<MutexSys> lock(s_thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

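    /* Caches pointers to the calling thread's two thread-local allocators so a
     * builder does not have to look them up for every allocation; operator()
     * and malloc0 allocate from alloc0, malloc1 from alloc1 (or from alloc0
     * when use_single_mode is set). */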
    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20;  //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;   //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size and the number of used block slots */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most 1/mainAllocOverhead of the estimated size gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead constraint */
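      /* slotMask selects how many of the threadUsedBlocks/threadBlocks slots
       * are used: 0x0 = 1 slot, 0x1 = 2, 0x3 = 4, 0x7 = 8 (threads are mapped
       * to a slot via threadID & slotMask in FastAllocator::malloc) */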
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
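    /* bytesAllocate bytes are allocated (and charged to the memory monitor) up
     * front; for OS allocations the larger bytesReserve region is merely
     * reserved and only charged once it is actually used (see Block::malloc);
     * bytesEstimate drives the grow-size heuristic above */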
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator from a size estimate only */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to the beginning of the free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }

    /*! thread safe allocation of memory */
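    /* Each thread is mapped to one of the used-block slots via
     * (threadID & slotMask); when no free blocks are available a slot is
     * refilled under its own slotMutex, so threads do not all contend on the
     * single global mutex. */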
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr == nullptr && !blockAllocation)
            abort(); //throw std::bad_alloc();
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<MutexSys> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<MutexSys> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
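    /* Registers caller-owned memory as a SHARED block: a Block header is
     * placement-constructed at the aligned start of the buffer and prepended
     * to freeBlocks; such blocks are dropped again in reset() via
     * Block::remove_shared_blocks() and are never freed by the allocator. */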
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<MutexSys> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }

    /* special allocation, only used by the Morton builder, a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)
        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)
        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize
                << ", slotMask = " << slotMask
                << ", use_single_mode = " << use_single_mode
                << ", maxGrowSize = " << maxGrowSize
                << ", defaultBlockSize = " << defaultBlockSize
                << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

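    /* A Block is a header directly followed by its payload: the usable memory
     * starts at data[0], the 'align' member pads the header so that data[] is
     * aligned to maxAlignment, and blockHeaderSize (offsetof(Block,data[0]))
     * below is the per-block overhead. Blocks form a singly linked list via
     * 'next'. */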
    struct Block
    {
      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
      {
        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
        else        return alignedMalloc (bytesAllocate, bytesAlignment);
      }

      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
      {
        if (useUSM) return device->free(ptr);
        else        return alignedFree(ptr);
      }

      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(Device* device, bool useUSM)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device, useUSM);
          block = next;
        }
      }

      void clear_block (Device* device, bool useUSM)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          blockAlignedFree(device, useUSM, this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
        }
      }

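      /* Lock-free bump allocation within a block: 'cur' is advanced with a
       * fetch_add; when 'partial' is set a caller may be granted fewer bytes
       * than requested (the granted size is written back through bytes_in),
       * otherwise the allocation simply fails once the block is exhausted. */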
      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if (atype_i == ANY_TYPE ) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  public:
    static const size_t blockHeaderSize = offsetof(Block,data[0]);

  private:
    Device* device;
    size_t slotMask;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;

    MutexSys mutex;
    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    bool useUSM;
    bool blockAllocation = true;
    bool use_single_mode;

    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;
    static MutexSys s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;

    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;

    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}