Path: blob/master/thirdparty/embree/kernels/common/alloc.h
9905 views
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "../builders/primref.h"

#include "../../common/tasking/taskscheduler.h"

namespace embree
{
  /*! Fast scalable memory allocator used by the BVH builders.
   *
   *  Memory is handed out from large blocks (up to ~2MB). Each thread
   *  owns small thread-local sub-allocators (ThreadLocal) that bump-allocate
   *  from a private block; new blocks are fetched from per-slot lists or a
   *  global free list under a mutex. Blocks can come from aligned malloc,
   *  OS allocation (optionally huge pages), USM (device memory), or be
   *  shared (user-provided) memory. */
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
    /* NOTE(review): macro instead of a static const so it can reference
     * maxAlignment before the class is complete; ~2MB minus alignment slack. */
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    /*! number of slots used to reduce contention on the global block lists */
    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the threads memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1); // bytes needed to reach alignment
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs; // roll back the speculative bump on failure

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true); // partial=true: may return less than requested
        bytesWasted += end-cur; // remainder of old block is abandoned
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false); // partial=false: full block requested
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;   //!< enclosing pair of thread-local allocators
      char* ptr;              //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return; // fast path: already bound
        Lock<MutexSys> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        /* flush statistics of the previously bound allocator before rebinding */
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this); // register with the allocator so it can unbind us later
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<MutexSys> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        /* account remaining thread-local statistics back to the allocator */
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      MutexSys mutex;                    //!< protects bind/unbind against concurrent unbind
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };

    /*! construction of the allocator
     *  \param device          device used for memory monitoring and USM allocation
     *  \param osAllocation    use OS allocation (os_malloc) for blocks instead of aligned malloc
     *  \param useUSM          allocate blocks as USM (device) memory; incompatible with osAllocation
     *  \param blockAllocation whether new blocks may be allocated on demand */
    FastAllocator (Device* device,
                   bool osAllocation,
                   bool useUSM = false,
                   bool blockAllocation = true)
      : device(device)
      , slotMask(0)
      , defaultBlockSize(PAGE_SIZE)
      , estimatedSize(0)
      , growSize(PAGE_SIZE)
      , maxGrowSize(maxAllocationSize)
      , usedBlocks(nullptr)
      , freeBlocks(nullptr)
      , useUSM(useUSM)
      , blockAllocation(blockAllocation)
      , use_single_mode(false)
      , log2_grow_size_scale(0)
      , bytesUsed(0)
      , bytesFree(0)
      , bytesWasted(0)
      , atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC)
      , primrefarray(device,0)
    {
      if (osAllocation && useUSM)
        abort(); //throw std::runtime_error("USM allocation cannot be combined with OS allocation.");

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        //assert(!slotMutex[i].isLocked());
      }
    }

    /*! destructor frees all allocated memory */
    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    /*! takes ownership of the primref array so its memory can be reused for nodes */
    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    /*! returns ownership of the primref array to the caller */
    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    /*! switches between OS allocation and aligned malloc for future blocks */
    void setOSallocation(bool flag)
    {
      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators, creating them lazily per thread */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<MutexSys> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc)); // global list owns the allocator
      }
      return alloc;
    }

  public:

    /*! registers a thread-local allocator pair with this allocator (called from bind) */
    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<MutexSys> lock(s_thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    /*! Lightweight handle combining the allocator with a thread's two
     *  local sub-allocators; cheap to copy into builder tasks. */
    struct CachedAllocator
    {
      /*! null construction; only nullptr may be passed */
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      /*! in single mode both handles point at alloc0 */
      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      /*! true when bound to an allocator */
      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      /*! allocates from the first thread-local allocator */
      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      /*! allocates from the first thread-local allocator */
      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      /*! allocates from the second thread-local allocator (alloc0 in single mode) */
      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    /*! returns a cached allocator for the calling thread */
    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };

    /*! moves all per-slot thread blocks onto the global used block list */
    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }

    /*! rounds up to a multiple of 128 bytes */
    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots by still guaranteeing
       * the mainAllocationOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 if for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      /* device-level overrides of the heuristics above */
      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,useUSM,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);

    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device,useUSM); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device,useUSM); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    /*! doubles the grow-size scale (capped at 2^16) and returns the new scale factor */
    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr == nullptr && !blockAllocation)
            abort(); //throw std::bad_alloc();
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<MutexSys> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) { // recheck under lock: another thread may have installed a block
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<MutexSys> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              /* pop a block off the free list and publish it for this slot */
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,useUSM,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<MutexSys> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      /* align the user pointer up to maxAlignment; the skipped bytes count as waste */
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }

    /* special allocation only used from morton builder only a single time for each build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    /*! Used/free/wasted byte statistics, filterable by allocation type. */
    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      /*! gathers statistics over all blocks of the allocator matching atype/huge_pages */
      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages); // free blocks are entirely free
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      /*! formats the statistics as a human readable string */
      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    /*! returns statistics for the given allocation type */
    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    /*! returns total used bytes (accumulated from unbound thread locals) */
    size_t getUsedBytes() {
      return bytesUsed;
    }

    /*! returns total wasted bytes (accumulated from unbound thread locals) */
    size_t getWastedBytes() {
      return bytesWasted;
    }

    /*! Aggregated statistics broken down per allocation type. */
    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)

        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,EMBREE_OS_MALLOC,false),
          stat_2M(alloc,EMBREE_OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)

        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      /*! prints all statistics to stdout */
      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    /*! prints configuration and the used/free block lists to stdout */
    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize
                << ", slotMask = " << slotMask
                << ", use_single_mode = " << use_single_mode
                << ", maxGrowSize = " << maxGrowSize
                << ", defaultBlockSize = " << defaultBlockSize
                << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    /*! A large memory block that thread-local allocators carve up.
     *  The Block header is placement-constructed at the start of the
     *  allocation; usable memory begins at data[]. Blocks form singly
     *  linked lists via next. */
    struct Block
    {
      /*! allocates aligned memory either from USM (device) or the host heap */
      __forceinline static void* blockAlignedMalloc(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesAlignment)
      {
        if (useUSM) return device->malloc(bytesAllocate, bytesAlignment);
        else        return alignedMalloc (bytesAllocate, bytesAlignment);
      }

      /*! frees memory allocated by blockAlignedMalloc */
      __forceinline static void blockAlignedFree(Device* device, bool useUSM, void* ptr)
      {
        if (useUSM) return device->free(ptr);
        else        return alignedFree(ptr);
      }

      /*! allocates and placement-constructs a new block of the given type */
      static Block* create(Device* device, bool useUSM, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages with using os_malloc */
        if (atype == EMBREE_OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = blockAlignedMalloc(device,useUSM,bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == EMBREE_OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      /*! unlinks all SHARED blocks from the list and returns the new head */
      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else                         prev_next = &block->next;
        }
        return head;
      }

      /*! frees this block and all blocks linked after it */
      void clear_list(Device* device, bool useUSM)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device, useUSM);
          block = next;
        }
      }

      /*! frees a single block according to its allocation type */
      void clear_block (Device* device, bool useUSM)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          blockAlignedFree(device, useUSM, this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
          /* shared blocks are owned by the user; nothing to free */
        }
      }

      /*! thread-safe bump allocation inside this block; may return less than
       *  requested when partial is true (bytes_in is updated accordingly) */
      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr; // cheap pre-check before fetch_add
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;   // recheck: another thread may have raced
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i); // clamp a partial allocation to what is left

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); // report newly committed reserve memory
        }
        return &data[i];
      }

      /*! pointer to the current allocation position */
      void* ptr() {
        return &data[cur];
      }

      /*! rewinds the block so its memory can be handed out again */
      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd); // cur may overshoot reserveEnd after failed fetch_add races
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      /*! checks whether this block matches the queried allocation type */
      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else                           return atype_i == atype;
      }

      /*! sums used bytes over this block list, filtered by type */
      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      /*! sums free bytes over this block list, filtered by type */
      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      /*! sums wasted bytes over this block list, filtered by type */
      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      /*! sums allocated bytes over this block list, filtered by type */
      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      /*! prints a compact one-block summary: type letter + [used, free, wasted] */
      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  public:
    static const size_t blockHeaderSize = offsetof(Block,data[0]);

  private:
    Device* device;           //!< device for memory monitoring and USM allocation
    size_t slotMask;          //!< mask selecting the thread block slot from the thread ID
    size_t defaultBlockSize;  //!< block size handed to thread-local allocators
    size_t estimatedSize;     //!< estimated final size of the build
    size_t growSize;          //!< size of the next main block to allocate
    size_t maxGrowSize;       //!< upper bound for growSize scaling

    MutexSys mutex;                                              //!< protects the global block lists
    MutexSys slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];             //!< per-slot locks for parallel block creation
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    bool useUSM;                     //!< allocate blocks as USM (device) memory
    bool blockAllocation = true;     //!< whether new blocks may be allocated on demand
    bool use_single_mode;            //!< use only a single thread-local allocator per thread

    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;

    static __thread ThreadLocal2* thread_local_allocator2;                  //!< per-thread allocator pair
    static MutexSys s_thread_local_allocators_lock;                         //!< protects the global allocator list
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; //!< owns all thread-local allocator pairs

    std::vector<ThreadLocal2*> thread_local_allocators; //!< allocators currently bound to this instance
    AllocationType atype;                               //!< allocation mode used for new blocks

    mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes
  };
}