Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_alloc.cpp
35258 views
/*1* kmp_alloc.cpp -- private/shared dynamic memory allocation and management2*/34//===----------------------------------------------------------------------===//5//6// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.7// See https://llvm.org/LICENSE.txt for license information.8// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception9//10//===----------------------------------------------------------------------===//1112#include "kmp.h"13#include "kmp_io.h"14#include "kmp_wrapper_malloc.h"1516// Disable bget when it is not used17#if KMP_USE_BGET1819/* Thread private buffer management code */2021typedef int (*bget_compact_t)(size_t, int);22typedef void *(*bget_acquire_t)(size_t);23typedef void (*bget_release_t)(void *);2425/* NOTE: bufsize must be a signed datatype */2627#if KMP_OS_WINDOWS28#if KMP_ARCH_X86 || KMP_ARCH_ARM29typedef kmp_int32 bufsize;30#else31typedef kmp_int64 bufsize;32#endif33#else34typedef ssize_t bufsize;35#endif // KMP_OS_WINDOWS3637/* The three modes of operation are, fifo search, lifo search, and best-fit */3839typedef enum bget_mode {40bget_mode_fifo = 0,41bget_mode_lifo = 1,42bget_mode_best = 243} bget_mode_t;4445static void bpool(kmp_info_t *th, void *buffer, bufsize len);46static void *bget(kmp_info_t *th, bufsize size);47static void *bgetz(kmp_info_t *th, bufsize size);48static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);49static void brel(kmp_info_t *th, void *buf);50static void bectl(kmp_info_t *th, bget_compact_t compact,51bget_acquire_t acquire, bget_release_t release,52bufsize pool_incr);5354/* BGET CONFIGURATION */55/* Buffer allocation size quantum: all buffers allocated are a56multiple of this size. This MUST be a power of two. */5758/* On IA-32 architecture with Linux* OS, malloc() does not59ensure 16 byte alignment */6061#if KMP_ARCH_X86 || !KMP_HAVE_QUAD6263#define SizeQuant 864#define AlignType double6566#else6768#define SizeQuant 1669#define AlignType _Quad7071#endif7273// Define this symbol to enable the bstats() function which calculates the74// total free space in the buffer pool, the largest available buffer, and the75// total space currently allocated.76#define BufStats 17778#ifdef KMP_DEBUG7980// Define this symbol to enable the bpoold() function which dumps the buffers81// in a buffer pool.82#define BufDump 18384// Define this symbol to enable the bpoolv() function for validating a buffer85// pool.86#define BufValid 18788// Define this symbol to enable the bufdump() function which allows dumping the89// contents of an allocated or free buffer.90#define DumpData 19192#ifdef NOT_USED_NOW9394// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants95// who attempt to use pointers into released buffers.96#define FreeWipe 19798// Use a best fit algorithm when searching for space for an allocation request.99// This uses memory more efficiently, but allocation will be much slower.100#define BestFit 1101102#endif /* NOT_USED_NOW */103#endif /* KMP_DEBUG */104105static bufsize bget_bin_size[] = {1060,107// 1 << 6, /* .5 Cache line */1081 << 7, /* 1 Cache line, new */1091 << 8, /* 2 Cache lines */1101 << 9, /* 4 Cache lines, new */1111 << 10, /* 8 Cache lines */1121 << 11, /* 16 Cache lines, new */1131 << 12, 1 << 13, /* new */1141 << 14, 1 << 15, /* new */1151 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /* 1MB */1161 << 21, /* 2MB */1171 << 22, /* 4MB */1181 << 23, /* 8MB */1191 << 24, /* 16MB */1201 << 25, /* 32MB */121};122123#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / 
sizeof(bufsize))124125struct bfhead;126127// Declare the interface, including the requested buffer size type, bufsize.128129/* Queue links */130typedef struct qlinks {131struct bfhead *flink; /* Forward link */132struct bfhead *blink; /* Backward link */133} qlinks_t;134135/* Header in allocated and free buffers */136typedef struct bhead2 {137kmp_info_t *bthr; /* The thread which owns the buffer pool */138bufsize prevfree; /* Relative link back to previous free buffer in memory or1390 if previous buffer is allocated. */140bufsize bsize; /* Buffer size: positive if free, negative if allocated. */141} bhead2_t;142143/* Make sure the bhead structure is a multiple of SizeQuant in size. */144typedef union bhead {145KMP_ALIGN(SizeQuant)146AlignType b_align;147char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];148bhead2_t bb;149} bhead_t;150#define BH(p) ((bhead_t *)(p))151152/* Header in directly allocated buffers (by acqfcn) */153typedef struct bdhead {154bufsize tsize; /* Total size, including overhead */155bhead_t bh; /* Common header */156} bdhead_t;157#define BDH(p) ((bdhead_t *)(p))158159/* Header in free buffers */160typedef struct bfhead {161bhead_t bh; /* Common allocated/free header */162qlinks_t ql; /* Links on free list */163} bfhead_t;164#define BFH(p) ((bfhead_t *)(p))165166typedef struct thr_data {167bfhead_t freelist[MAX_BGET_BINS];168#if BufStats169size_t totalloc; /* Total space currently allocated */170long numget, numrel; /* Number of bget() and brel() calls */171long numpblk; /* Number of pool blocks */172long numpget, numprel; /* Number of block gets and rels */173long numdget, numdrel; /* Number of direct gets and rels */174#endif /* BufStats */175176/* Automatic expansion block management functions */177bget_compact_t compfcn;178bget_acquire_t acqfcn;179bget_release_t relfcn;180181bget_mode_t mode; /* what allocation mode to use? */182183bufsize exp_incr; /* Expansion block size */184bufsize pool_len; /* 0: no bpool calls have been made185-1: not all pool blocks are the same size186>0: (common) block size for all bpool calls made so far187*/188bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */189} thr_data_t;190191/* Minimum allocation quantum: */192#define QLSize (sizeof(qlinks_t))193#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)194#define MaxSize \195(bufsize)( \196~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))197// Maximum for the requested size.198199/* End sentinel: value placed in bsize field of dummy block delimiting200end of pool block. The most negative number which will fit in a201bufsize, defined in a way that the compiler will accept. */202203#define ESent \204((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))205206/* Thread Data management routines */207static int bget_get_bin(bufsize size) {208// binary chop bins209int lo = 0, hi = MAX_BGET_BINS - 1;210211KMP_DEBUG_ASSERT(size > 0);212213while ((hi - lo) > 1) {214int mid = (lo + hi) >> 1;215if (size < bget_bin_size[mid])216hi = mid - 1;217else218lo = mid;219}220221KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));222223return lo;224}225226static void set_thr_data(kmp_info_t *th) {227int i;228thr_data_t *data;229230data = (thr_data_t *)((!th->th.th_local.bget_data)231? 
__kmp_allocate(sizeof(*data))232: th->th.th_local.bget_data);233234memset(data, '\0', sizeof(*data));235236for (i = 0; i < MAX_BGET_BINS; ++i) {237data->freelist[i].ql.flink = &data->freelist[i];238data->freelist[i].ql.blink = &data->freelist[i];239}240241th->th.th_local.bget_data = data;242th->th.th_local.bget_list = 0;243#if !USE_CMP_XCHG_FOR_BGET244#ifdef USE_QUEUING_LOCK_FOR_BGET245__kmp_init_lock(&th->th.th_local.bget_lock);246#else247__kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);248#endif /* USE_LOCK_FOR_BGET */249#endif /* ! USE_CMP_XCHG_FOR_BGET */250}251252static thr_data_t *get_thr_data(kmp_info_t *th) {253thr_data_t *data;254255data = (thr_data_t *)th->th.th_local.bget_data;256257KMP_DEBUG_ASSERT(data != 0);258259return data;260}261262/* Walk the free list and release the enqueued buffers */263static void __kmp_bget_dequeue(kmp_info_t *th) {264void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);265266if (p != 0) {267#if USE_CMP_XCHG_FOR_BGET268{269volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);270while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,271CCAST(void *, old_value), nullptr)) {272KMP_CPU_PAUSE();273old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);274}275p = CCAST(void *, old_value);276}277#else /* ! USE_CMP_XCHG_FOR_BGET */278#ifdef USE_QUEUING_LOCK_FOR_BGET279__kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));280#else281__kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);282#endif /* USE_QUEUING_LOCK_FOR_BGET */283284p = (void *)th->th.th_local.bget_list;285th->th.th_local.bget_list = 0;286287#ifdef USE_QUEUING_LOCK_FOR_BGET288__kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));289#else290__kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);291#endif292#endif /* USE_CMP_XCHG_FOR_BGET */293294/* Check again to make sure the list is not empty */295while (p != 0) {296void *buf = p;297bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));298299KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);300KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==301(kmp_uintptr_t)th); // clear possible mark302KMP_DEBUG_ASSERT(b->ql.blink == 0);303304p = (void *)b->ql.flink;305306brel(th, buf);307}308}309}310311/* Chain together the free buffers by using the thread owner field */312static void __kmp_bget_enqueue(kmp_info_t *th, void *buf313#ifdef USE_QUEUING_LOCK_FOR_BGET314,315kmp_int32 rel_gtid316#endif317) {318bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));319320KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);321KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==322(kmp_uintptr_t)th); // clear possible mark323324b->ql.blink = 0;325326KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",327__kmp_gtid_from_thread(th)));328329#if USE_CMP_XCHG_FOR_BGET330{331volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);332/* the next pointer must be set before setting bget_list to buf to avoid333exposing a broken list to other threads, even for an instant. */334b->ql.flink = BFH(CCAST(void *, old_value));335336while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,337CCAST(void *, old_value), buf)) {338KMP_CPU_PAUSE();339old_value = TCR_PTR(th->th.th_local.bget_list);340/* the next pointer must be set before setting bget_list to buf to avoid341exposing a broken list to other threads, even for an instant. */342b->ql.flink = BFH(CCAST(void *, old_value));343}344}345#else /* ! 
USE_CMP_XCHG_FOR_BGET */346#ifdef USE_QUEUING_LOCK_FOR_BGET347__kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);348#else349__kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);350#endif351352b->ql.flink = BFH(th->th.th_local.bget_list);353th->th.th_local.bget_list = (void *)buf;354355#ifdef USE_QUEUING_LOCK_FOR_BGET356__kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);357#else358__kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);359#endif360#endif /* USE_CMP_XCHG_FOR_BGET */361}362363/* insert buffer back onto a new freelist */364static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {365int bin;366367KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);368KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);369370bin = bget_get_bin(b->bh.bb.bsize);371372KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==373&thr->freelist[bin]);374KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==375&thr->freelist[bin]);376377b->ql.flink = &thr->freelist[bin];378b->ql.blink = thr->freelist[bin].ql.blink;379380thr->freelist[bin].ql.blink = b;381b->ql.blink->ql.flink = b;382}383384/* unlink the buffer from the old freelist */385static void __kmp_bget_remove_from_freelist(bfhead_t *b) {386KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);387KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);388389b->ql.blink->ql.flink = b->ql.flink;390b->ql.flink->ql.blink = b->ql.blink;391}392393/* GET STATS -- check info on free list */394static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {395thr_data_t *thr = get_thr_data(th);396int bin;397398*total_free = *max_free = 0;399400for (bin = 0; bin < MAX_BGET_BINS; ++bin) {401bfhead_t *b, *best;402403best = &thr->freelist[bin];404b = best->ql.flink;405406while (b != &thr->freelist[bin]) {407*total_free += (b->bh.bb.bsize - sizeof(bhead_t));408if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))409best = b;410411/* Link to next buffer */412b = b->ql.flink;413}414415if (*max_free < best->bh.bb.bsize)416*max_free = best->bh.bb.bsize;417}418419if (*max_free > (bufsize)sizeof(bhead_t))420*max_free -= sizeof(bhead_t);421}422423/* BGET -- Allocate a buffer. */424static void *bget(kmp_info_t *th, bufsize requested_size) {425thr_data_t *thr = get_thr_data(th);426bufsize size = requested_size;427bfhead_t *b;428void *buf;429int compactseq = 0;430int use_blink = 0;431/* For BestFit */432bfhead_t *best;433434if (size < 0 || size + sizeof(bhead_t) > MaxSize) {435return NULL;436}437438__kmp_bget_dequeue(th); /* Release any queued buffers */439440if (size < (bufsize)SizeQ) { // Need at least room for the queue links.441size = SizeQ;442}443#if defined(SizeQuant) && (SizeQuant > 1)444size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));445#endif446447size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.448KMP_DEBUG_ASSERT(size >= 0);449KMP_DEBUG_ASSERT(size % SizeQuant == 0);450451use_blink = (thr->mode == bget_mode_lifo);452453/* If a compact function was provided in the call to bectl(), wrap454a loop around the allocation process to allow compaction to455intervene in case we don't find a suitable buffer in the chain. */456457for (;;) {458int bin;459460for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {461/* Link to next buffer */462b = (use_blink ? 
thr->freelist[bin].ql.blink463: thr->freelist[bin].ql.flink);464465if (thr->mode == bget_mode_best) {466best = &thr->freelist[bin];467468/* Scan the free list searching for the first buffer big enough469to hold the requested size buffer. */470while (b != &thr->freelist[bin]) {471if (b->bh.bb.bsize >= (bufsize)size) {472if ((best == &thr->freelist[bin]) ||473(b->bh.bb.bsize < best->bh.bb.bsize)) {474best = b;475}476}477478/* Link to next buffer */479b = (use_blink ? b->ql.blink : b->ql.flink);480}481b = best;482}483484while (b != &thr->freelist[bin]) {485if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {486487// Buffer is big enough to satisfy the request. Allocate it to the488// caller. We must decide whether the buffer is large enough to split489// into the part given to the caller and a free buffer that remains490// on the free list, or whether the entire buffer should be removed491// from the free list and given to the caller in its entirety. We492// only split the buffer if enough room remains for a header plus the493// minimum quantum of allocation.494if ((b->bh.bb.bsize - (bufsize)size) >495(bufsize)(SizeQ + (sizeof(bhead_t)))) {496bhead_t *ba, *bn;497498ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));499bn = BH(((char *)ba) + size);500501KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);502503/* Subtract size from length of free block. */504b->bh.bb.bsize -= (bufsize)size;505506/* Link allocated buffer to the previous free buffer. */507ba->bb.prevfree = b->bh.bb.bsize;508509/* Plug negative size into user buffer. */510ba->bb.bsize = -size;511512/* Mark this buffer as owned by this thread. */513TCW_PTR(ba->bb.bthr,514th); // not an allocated address (do not mark it)515/* Mark buffer after this one not preceded by free block. */516bn->bb.prevfree = 0;517518// unlink buffer from old freelist, and reinsert into new freelist519__kmp_bget_remove_from_freelist(b);520__kmp_bget_insert_into_freelist(thr, b);521#if BufStats522thr->totalloc += (size_t)size;523thr->numget++; /* Increment number of bget() calls */524#endif525buf = (void *)((((char *)ba) + sizeof(bhead_t)));526KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);527return buf;528} else {529bhead_t *ba;530531ba = BH(((char *)b) + b->bh.bb.bsize);532533KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);534535/* The buffer isn't big enough to split. Give the whole536shebang to the caller and remove it from the free list. */537538__kmp_bget_remove_from_freelist(b);539#if BufStats540thr->totalloc += (size_t)b->bh.bb.bsize;541thr->numget++; /* Increment number of bget() calls */542#endif543/* Negate size to mark buffer allocated. */544b->bh.bb.bsize = -(b->bh.bb.bsize);545546/* Mark this buffer as owned by this thread. */547TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)548/* Zero the back pointer in the next buffer in memory549to indicate that this buffer is allocated. */550ba->bb.prevfree = 0;551552/* Give user buffer starting at queue links. */553buf = (void *)&(b->ql);554KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);555return buf;556}557}558559/* Link to next buffer */560b = (use_blink ? b->ql.blink : b->ql.flink);561}562}563564/* We failed to find a buffer. If there's a compact function defined,565notify it of the size requested. If it returns TRUE, try the allocation566again. */567568if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {569break;570}571}572573/* No buffer available with requested size free. */574575/* Don't give up yet -- look in the reserve supply. 
*/576if (thr->acqfcn != 0) {577if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {578/* Request is too large to fit in a single expansion block.579Try to satisfy it by a direct buffer acquisition. */580bdhead_t *bdh;581582size += sizeof(bdhead_t) - sizeof(bhead_t);583584KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));585586/* richryan */587bdh = BDH((*thr->acqfcn)((bufsize)size));588if (bdh != NULL) {589590// Mark the buffer special by setting size field of its header to zero.591bdh->bh.bb.bsize = 0;592593/* Mark this buffer as owned by this thread. */594TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,595// because direct buffer never goes to free list596bdh->bh.bb.prevfree = 0;597bdh->tsize = size;598#if BufStats599thr->totalloc += (size_t)size;600thr->numget++; /* Increment number of bget() calls */601thr->numdget++; /* Direct bget() call count */602#endif603buf = (void *)(bdh + 1);604KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);605return buf;606}607608} else {609610/* Try to obtain a new expansion block */611void *newpool;612613KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));614615/* richryan */616newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);617KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);618if (newpool != NULL) {619bpool(th, newpool, thr->exp_incr);620buf = bget(621th, requested_size); /* This can't, I say, can't get into a loop. */622return buf;623}624}625}626627/* Still no buffer available */628629return NULL;630}631632/* BGETZ -- Allocate a buffer and clear its contents to zero. We clear633the entire contents of the buffer to zero, not just the634region requested by the caller. */635636static void *bgetz(kmp_info_t *th, bufsize size) {637char *buf = (char *)bget(th, size);638639if (buf != NULL) {640bhead_t *b;641bufsize rsize;642643b = BH(buf - sizeof(bhead_t));644rsize = -(b->bb.bsize);645if (rsize == 0) {646bdhead_t *bd;647648bd = BDH(buf - sizeof(bdhead_t));649rsize = bd->tsize - (bufsize)sizeof(bdhead_t);650} else {651rsize -= sizeof(bhead_t);652}653654KMP_DEBUG_ASSERT(rsize >= size);655656(void)memset(buf, 0, (bufsize)rsize);657}658return ((void *)buf);659}660661/* BGETR -- Reallocate a buffer. This is a minimal implementation,662simply in terms of brel() and bget(). It could be663enhanced to allow the buffer to grow into adjacent free664blocks and to avoid moving data unnecessarily. */665666static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {667void *nbuf;668bufsize osize; /* Old size of buffer */669bhead_t *b;670671nbuf = bget(th, size);672if (nbuf == NULL) { /* Acquire new buffer */673return NULL;674}675if (buf == NULL) {676return nbuf;677}678b = BH(((char *)buf) - sizeof(bhead_t));679osize = -b->bb.bsize;680if (osize == 0) {681/* Buffer acquired directly through acqfcn. */682bdhead_t *bd;683684bd = BDH(((char *)buf) - sizeof(bdhead_t));685osize = bd->tsize - (bufsize)sizeof(bdhead_t);686} else {687osize -= sizeof(bhead_t);688}689690KMP_DEBUG_ASSERT(osize > 0);691692(void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */693(size_t)((size < osize) ? size : osize));694brel(th, buf);695696return nbuf;697}698699/* BREL -- Release a buffer. */700static void brel(kmp_info_t *th, void *buf) {701thr_data_t *thr = get_thr_data(th);702bfhead_t *b, *bn;703kmp_info_t *bth;704705KMP_DEBUG_ASSERT(buf != NULL);706KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);707708b = BFH(((char *)buf) - sizeof(bhead_t));709710if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? 
*/711bdhead_t *bdh;712713bdh = BDH(((char *)buf) - sizeof(bdhead_t));714KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);715#if BufStats716thr->totalloc -= (size_t)bdh->tsize;717thr->numdrel++; /* Number of direct releases */718thr->numrel++; /* Increment number of brel() calls */719#endif /* BufStats */720#ifdef FreeWipe721(void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));722#endif /* FreeWipe */723724KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));725726KMP_DEBUG_ASSERT(thr->relfcn != 0);727(*thr->relfcn)((void *)bdh); /* Release it directly. */728return;729}730731bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &732~1); // clear possible mark before comparison733if (bth != th) {734/* Add this buffer to be released by the owning thread later */735__kmp_bget_enqueue(bth, buf736#ifdef USE_QUEUING_LOCK_FOR_BGET737,738__kmp_gtid_from_thread(th)739#endif740);741return;742}743744/* Buffer size must be negative, indicating that the buffer is allocated. */745if (b->bh.bb.bsize >= 0) {746bn = NULL;747}748KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);749750/* Back pointer in next buffer must be zero, indicating the same thing: */751752KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);753754#if BufStats755thr->numrel++; /* Increment number of brel() calls */756thr->totalloc += (size_t)b->bh.bb.bsize;757#endif758759/* If the back link is nonzero, the previous buffer is free. */760761if (b->bh.bb.prevfree != 0) {762/* The previous buffer is free. Consolidate this buffer with it by adding763the length of this buffer to the previous free buffer. Note that we764subtract the size in the buffer being released, since it's negative to765indicate that the buffer is allocated. */766bufsize size = b->bh.bb.bsize;767768/* Make the previous buffer the one we're working on. */769KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==770b->bh.bb.prevfree);771b = BFH(((char *)b) - b->bh.bb.prevfree);772b->bh.bb.bsize -= size;773774/* unlink the buffer from the old freelist */775__kmp_bget_remove_from_freelist(b);776} else {777/* The previous buffer isn't allocated. Mark this buffer size as positive778(i.e. free) and fall through to place the buffer on the free list as an779isolated free block. */780b->bh.bb.bsize = -b->bh.bb.bsize;781}782783/* insert buffer back onto a new freelist */784__kmp_bget_insert_into_freelist(thr, b);785786/* Now we look at the next buffer in memory, located by advancing from787the start of this buffer by its size, to see if that buffer is788free. If it is, we combine this buffer with the next one in789memory, dechaining the second buffer from the free list. */790bn = BFH(((char *)b) + b->bh.bb.bsize);791if (bn->bh.bb.bsize > 0) {792793/* The buffer is free. Remove it from the free list and add794its size to that of our buffer. */795KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==796bn->bh.bb.bsize);797798__kmp_bget_remove_from_freelist(bn);799800b->bh.bb.bsize += bn->bh.bb.bsize;801802/* unlink the buffer from the old freelist, and reinsert it into the new803* freelist */804__kmp_bget_remove_from_freelist(b);805__kmp_bget_insert_into_freelist(thr, b);806807/* Finally, advance to the buffer that follows the newly808consolidated free block. We must set its backpointer to the809head of the consolidated free block. We know the next block810must be an allocated block because the process of recombination811guarantees that two free blocks will never be contiguous in812memory. 
*/813bn = BFH(((char *)b) + b->bh.bb.bsize);814}815#ifdef FreeWipe816(void)memset(((char *)b) + sizeof(bfhead_t), 0x55,817(size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));818#endif819KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);820821/* The next buffer is allocated. Set the backpointer in it to point822to this buffer; the previous free buffer in memory. */823824bn->bh.bb.prevfree = b->bh.bb.bsize;825826/* If a block-release function is defined, and this free buffer827constitutes the entire block, release it. Note that pool_len828is defined in such a way that the test will fail unless all829pool blocks are the same size. */830if (thr->relfcn != 0 &&831b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {832#if BufStats833if (thr->numpblk !=8341) { /* Do not release the last buffer until finalization time */835#endif836837KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);838KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);839KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==840b->bh.bb.bsize);841842/* Unlink the buffer from the free list */843__kmp_bget_remove_from_freelist(b);844845KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));846847(*thr->relfcn)(b);848#if BufStats849thr->numprel++; /* Nr of expansion block releases */850thr->numpblk--; /* Total number of blocks */851KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);852853// avoid leaving stale last_pool pointer around if it is being dealloced854if (thr->last_pool == b)855thr->last_pool = 0;856} else {857thr->last_pool = b;858}859#endif /* BufStats */860}861}862863/* BECTL -- Establish automatic pool expansion control */864static void bectl(kmp_info_t *th, bget_compact_t compact,865bget_acquire_t acquire, bget_release_t release,866bufsize pool_incr) {867thr_data_t *thr = get_thr_data(th);868869thr->compfcn = compact;870thr->acqfcn = acquire;871thr->relfcn = release;872thr->exp_incr = pool_incr;873}874875/* BPOOL -- Add a region of memory to the buffer pool. */876static void bpool(kmp_info_t *th, void *buf, bufsize len) {877/* int bin = 0; */878thr_data_t *thr = get_thr_data(th);879bfhead_t *b = BFH(buf);880bhead_t *bn;881882__kmp_bget_dequeue(th); /* Release any queued buffers */883884#ifdef SizeQuant885len &= ~((bufsize)(SizeQuant - 1));886#endif887if (thr->pool_len == 0) {888thr->pool_len = len;889} else if (len != thr->pool_len) {890thr->pool_len = -1;891}892#if BufStats893thr->numpget++; /* Number of block acquisitions */894thr->numpblk++; /* Number of blocks total */895KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);896#endif /* BufStats */897898/* Since the block is initially occupied by a single free buffer,899it had better not be (much) larger than the largest buffer900whose size we can store in bhead.bb.bsize. */901KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));902903/* Clear the backpointer at the start of the block to indicate that904there is no free block prior to this one. That blocks905recombination when the first block in memory is released. */906b->bh.bb.prevfree = 0;907908/* Create a dummy allocated buffer at the end of the pool. This dummy909buffer is seen when a buffer at the end of the pool is released and910blocks recombination of the last buffer with the dummy buffer at911the end. The length in the dummy buffer is set to the largest912negative number to denote the end of the pool for diagnostic913routines (this specific value is not counted on by the actual914allocation and release functions). 
*/915len -= sizeof(bhead_t);916b->bh.bb.bsize = (bufsize)len;917/* Set the owner of this buffer */918TCW_PTR(b->bh.bb.bthr,919(kmp_info_t *)((kmp_uintptr_t)th |9201)); // mark the buffer as allocated address921922/* Chain the new block to the free list. */923__kmp_bget_insert_into_freelist(thr, b);924925#ifdef FreeWipe926(void)memset(((char *)b) + sizeof(bfhead_t), 0x55,927(size_t)(len - sizeof(bfhead_t)));928#endif929bn = BH(((char *)b) + len);930bn->bb.prevfree = (bufsize)len;931/* Definition of ESent assumes two's complement! */932KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));933934bn->bb.bsize = ESent;935}936937/* BFREED -- Dump the free lists for this thread. */938static void bfreed(kmp_info_t *th) {939int bin = 0, count = 0;940int gtid = __kmp_gtid_from_thread(th);941thr_data_t *thr = get_thr_data(th);942943#if BufStats944__kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC945" get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC946" pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC947" prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC948" drel=%" KMP_INT64_SPEC "\n",949gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,950(kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,951(kmp_int64)thr->numpget, (kmp_int64)thr->numprel,952(kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);953#endif954955for (bin = 0; bin < MAX_BGET_BINS; ++bin) {956bfhead_t *b;957958for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];959b = b->ql.flink) {960bufsize bs = b->bh.bb.bsize;961962KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);963KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);964KMP_DEBUG_ASSERT(bs > 0);965966count += 1;967968__kmp_printf_no_lock(969"__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,970(long)bs);971#ifdef FreeWipe972{973char *lerr = ((char *)b) + sizeof(bfhead_t);974if ((bs > sizeof(bfhead_t)) &&975((*lerr != 0x55) ||976(memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=9770))) {978__kmp_printf_no_lock("__kmp_printpool: T#%d (Contents of above "979"free block have been overstored.)\n",980gtid);981}982}983#endif984}985}986987if (count == 0)988__kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);989}990991void __kmp_initialize_bget(kmp_info_t *th) {992KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));993994set_thr_data(th);995996bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,997(bufsize)__kmp_malloc_pool_incr);998}9991000void __kmp_finalize_bget(kmp_info_t *th) {1001thr_data_t *thr;1002bfhead_t *b;10031004KMP_DEBUG_ASSERT(th != 0);10051006#if BufStats1007thr = (thr_data_t *)th->th.th_local.bget_data;1008KMP_DEBUG_ASSERT(thr != NULL);1009b = thr->last_pool;10101011/* If a block-release function is defined, and this free buffer constitutes1012the entire block, release it. Note that pool_len is defined in such a way1013that the test will fail unless all pool blocks are the same size. 
*/10141015// Deallocate the last pool if one exists because we no longer do it in brel()1016if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&1017b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {1018KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);1019KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);1020KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==1021b->bh.bb.bsize);10221023/* Unlink the buffer from the free list */1024__kmp_bget_remove_from_freelist(b);10251026KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));10271028(*thr->relfcn)(b);1029thr->numprel++; /* Nr of expansion block releases */1030thr->numpblk--; /* Total number of blocks */1031KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);1032}1033#endif /* BufStats */10341035/* Deallocate bget_data */1036if (th->th.th_local.bget_data != NULL) {1037__kmp_free(th->th.th_local.bget_data);1038th->th.th_local.bget_data = NULL;1039}1040}10411042void kmpc_set_poolsize(size_t size) {1043bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,1044(bget_release_t)free, (bufsize)size);1045}10461047size_t kmpc_get_poolsize(void) {1048thr_data_t *p;10491050p = get_thr_data(__kmp_get_thread());10511052return p->exp_incr;1053}10541055void kmpc_set_poolmode(int mode) {1056thr_data_t *p;10571058if (mode == bget_mode_fifo || mode == bget_mode_lifo ||1059mode == bget_mode_best) {1060p = get_thr_data(__kmp_get_thread());1061p->mode = (bget_mode_t)mode;1062}1063}10641065int kmpc_get_poolmode(void) {1066thr_data_t *p;10671068p = get_thr_data(__kmp_get_thread());10691070return p->mode;1071}10721073void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {1074kmp_info_t *th = __kmp_get_thread();1075bufsize a, b;10761077__kmp_bget_dequeue(th); /* Release any queued buffers */10781079bcheck(th, &a, &b);10801081*maxmem = a;1082*allmem = b;1083}10841085void kmpc_poolprint(void) {1086kmp_info_t *th = __kmp_get_thread();10871088__kmp_bget_dequeue(th); /* Release any queued buffers */10891090bfreed(th);1091}10921093#endif // #if KMP_USE_BGET10941095void *kmpc_malloc(size_t size) {1096void *ptr;1097ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));1098if (ptr != NULL) {1099// save allocated pointer just before one returned to user1100*(void **)ptr = ptr;1101ptr = (void **)ptr + 1;1102}1103return ptr;1104}11051106#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)11071108void *kmpc_aligned_malloc(size_t size, size_t alignment) {1109void *ptr;1110void *ptr_allocated;1111KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big1112if (!IS_POWER_OF_TWO(alignment)) {1113// AC: do we need to issue a warning here?1114errno = EINVAL;1115return NULL;1116}1117size = size + sizeof(void *) + alignment;1118ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);1119if (ptr_allocated != NULL) {1120// save allocated pointer just before one returned to user1121ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &1122~(alignment - 1));1123*((void **)ptr - 1) = ptr_allocated;1124} else {1125ptr = NULL;1126}1127return ptr;1128}11291130void *kmpc_calloc(size_t nelem, size_t elsize) {1131void *ptr;1132ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));1133if (ptr != NULL) {1134// save allocated pointer just before one returned to user1135*(void **)ptr = ptr;1136ptr = (void **)ptr + 1;1137}1138return ptr;1139}11401141void *kmpc_realloc(void *ptr, size_t size) {1142void *result = NULL;1143if (ptr == NULL) {1144// If pointer is NULL, realloc 
behaves like malloc.1145result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));1146// save allocated pointer just before one returned to user1147if (result != NULL) {1148*(void **)result = result;1149result = (void **)result + 1;1150}1151} else if (size == 0) {1152// If size is 0, realloc behaves like free.1153// The thread must be registered by the call to kmpc_malloc() or1154// kmpc_calloc() before.1155// So it should be safe to call __kmp_get_thread(), not1156// __kmp_entry_thread().1157KMP_ASSERT(*((void **)ptr - 1));1158brel(__kmp_get_thread(), *((void **)ptr - 1));1159} else {1160result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),1161(bufsize)(size + sizeof(ptr)));1162if (result != NULL) {1163*(void **)result = result;1164result = (void **)result + 1;1165}1166}1167return result;1168}11691170// NOTE: the library must have already been initialized by a previous allocate1171void kmpc_free(void *ptr) {1172if (!__kmp_init_serial) {1173return;1174}1175if (ptr != NULL) {1176kmp_info_t *th = __kmp_get_thread();1177__kmp_bget_dequeue(th); /* Release any queued buffers */1178// extract allocated pointer and free it1179KMP_ASSERT(*((void **)ptr - 1));1180brel(th, *((void **)ptr - 1));1181}1182}11831184void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {1185void *ptr;1186KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,1187(int)size KMP_SRC_LOC_PARM));1188ptr = bget(th, (bufsize)size);1189KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));1190return ptr;1191}11921193void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,1194size_t elsize KMP_SRC_LOC_DECL) {1195void *ptr;1196KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,1197(int)nelem, (int)elsize KMP_SRC_LOC_PARM));1198ptr = bgetz(th, (bufsize)(nelem * elsize));1199KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));1200return ptr;1201}12021203void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,1204size_t size KMP_SRC_LOC_DECL) {1205KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,1206ptr, (int)size KMP_SRC_LOC_PARM));1207ptr = bgetr(th, ptr, (bufsize)size);1208KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));1209return ptr;1210}12111212void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {1213KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,1214ptr KMP_SRC_LOC_PARM));1215if (ptr != NULL) {1216__kmp_bget_dequeue(th); /* Release any queued buffers */1217brel(th, ptr);1218}1219KE_TRACE(30, ("<- __kmp_thread_free()\n"));1220}12211222/* OMP 5.0 Memory Management support */1223static const char *kmp_mk_lib_name;1224static void *h_memkind;1225/* memkind experimental API: */1226// memkind_alloc1227static void *(*kmp_mk_alloc)(void *k, size_t sz);1228// memkind_free1229static void (*kmp_mk_free)(void *kind, void *ptr);1230// memkind_check_available1231static int (*kmp_mk_check)(void *kind);1232// kinds we are going to use1233static void **mk_default;1234static void **mk_interleave;1235static void **mk_hbw;1236static void **mk_hbw_interleave;1237static void **mk_hbw_preferred;1238static void **mk_hugetlb;1239static void **mk_hbw_hugetlb;1240static void **mk_hbw_preferred_hugetlb;1241static void **mk_dax_kmem;1242static void **mk_dax_kmem_all;1243static void **mk_dax_kmem_preferred;1244static void *(*kmp_target_alloc_host)(size_t size, int device);1245static void *(*kmp_target_alloc_shared)(size_t size, int device);1246static void 
*(*kmp_target_alloc_device)(size_t size, int device);1247static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);1248static void *(*kmp_target_unlock_mem)(void *ptr, int device);1249static void *(*kmp_target_free_host)(void *ptr, int device);1250static void *(*kmp_target_free_shared)(void *ptr, int device);1251static void *(*kmp_target_free_device)(void *ptr, int device);1252static bool __kmp_target_mem_available;1253#define KMP_IS_TARGET_MEM_SPACE(MS) \1254(MS == llvm_omp_target_host_mem_space || \1255MS == llvm_omp_target_shared_mem_space || \1256MS == llvm_omp_target_device_mem_space)1257#define KMP_IS_TARGET_MEM_ALLOC(MA) \1258(MA == llvm_omp_target_host_mem_alloc || \1259MA == llvm_omp_target_shared_mem_alloc || \1260MA == llvm_omp_target_device_mem_alloc)12611262#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN1263static inline void chk_kind(void ***pkind) {1264KMP_DEBUG_ASSERT(pkind);1265if (*pkind) // symbol found1266if (kmp_mk_check(**pkind)) // kind not available or error1267*pkind = NULL;1268}1269#endif12701271void __kmp_init_memkind() {1272// as of 2018-07-31 memkind does not support Windows*, exclude it for now1273#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN1274// use of statically linked memkind is problematic, as it depends on libnuma1275kmp_mk_lib_name = "libmemkind.so";1276h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);1277if (h_memkind) {1278kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");1279kmp_mk_alloc =1280(void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");1281kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");1282mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");1283if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&1284!kmp_mk_check(*mk_default)) {1285__kmp_memkind_available = 1;1286mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");1287chk_kind(&mk_interleave);1288mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");1289chk_kind(&mk_hbw);1290mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");1291chk_kind(&mk_hbw_interleave);1292mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");1293chk_kind(&mk_hbw_preferred);1294mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");1295chk_kind(&mk_hugetlb);1296mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");1297chk_kind(&mk_hbw_hugetlb);1298mk_hbw_preferred_hugetlb =1299(void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");1300chk_kind(&mk_hbw_preferred_hugetlb);1301mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM");1302chk_kind(&mk_dax_kmem);1303mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL");1304chk_kind(&mk_dax_kmem_all);1305mk_dax_kmem_preferred =1306(void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED");1307chk_kind(&mk_dax_kmem_preferred);1308KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));1309return; // success1310}1311dlclose(h_memkind); // failure1312}1313#else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)1314kmp_mk_lib_name = "";1315#endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)1316h_memkind = NULL;1317kmp_mk_check = NULL;1318kmp_mk_alloc = NULL;1319kmp_mk_free = NULL;1320mk_default = NULL;1321mk_interleave = NULL;1322mk_hbw = NULL;1323mk_hbw_interleave = NULL;1324mk_hbw_preferred = NULL;1325mk_hugetlb = NULL;1326mk_hbw_hugetlb = NULL;1327mk_hbw_preferred_hugetlb = NULL;1328mk_dax_kmem = NULL;1329mk_dax_kmem_all = NULL;1330mk_dax_kmem_preferred = NULL;1331}13321333void __kmp_fini_memkind() {1334#if 
KMP_OS_UNIX && KMP_DYNAMIC_LIB1335if (__kmp_memkind_available)1336KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));1337if (h_memkind) {1338dlclose(h_memkind);1339h_memkind = NULL;1340}1341kmp_mk_check = NULL;1342kmp_mk_alloc = NULL;1343kmp_mk_free = NULL;1344mk_default = NULL;1345mk_interleave = NULL;1346mk_hbw = NULL;1347mk_hbw_interleave = NULL;1348mk_hbw_preferred = NULL;1349mk_hugetlb = NULL;1350mk_hbw_hugetlb = NULL;1351mk_hbw_preferred_hugetlb = NULL;1352mk_dax_kmem = NULL;1353mk_dax_kmem_all = NULL;1354mk_dax_kmem_preferred = NULL;1355#endif1356}13571358void __kmp_init_target_mem() {1359*(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");1360*(void **)(&kmp_target_alloc_shared) =1361KMP_DLSYM("llvm_omp_target_alloc_shared");1362*(void **)(&kmp_target_alloc_device) =1363KMP_DLSYM("llvm_omp_target_alloc_device");1364*(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");1365*(void **)(&kmp_target_free_shared) =1366KMP_DLSYM("llvm_omp_target_free_shared");1367*(void **)(&kmp_target_free_device) =1368KMP_DLSYM("llvm_omp_target_free_device");1369__kmp_target_mem_available =1370kmp_target_alloc_host && kmp_target_alloc_shared &&1371kmp_target_alloc_device && kmp_target_free_host &&1372kmp_target_free_shared && kmp_target_free_device;1373// lock/pin and unlock/unpin target calls1374*(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");1375*(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");1376}13771378omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,1379int ntraits,1380omp_alloctrait_t traits[]) {1381// OpenMP 5.0 only allows predefined memspaces1382KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||1383ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||1384ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms));1385kmp_allocator_t *al;1386int i;1387al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed1388al->memspace = ms; // not used currently1389for (i = 0; i < ntraits; ++i) {1390switch (traits[i].key) {1391case omp_atk_sync_hint:1392case omp_atk_access:1393break;1394case omp_atk_pinned:1395al->pinned = true;1396break;1397case omp_atk_alignment:1398__kmp_type_convert(traits[i].value, &(al->alignment));1399KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));1400break;1401case omp_atk_pool_size:1402al->pool_size = traits[i].value;1403break;1404case omp_atk_fallback:1405al->fb = (omp_alloctrait_value_t)traits[i].value;1406KMP_DEBUG_ASSERT(1407al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb ||1408al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb);1409break;1410case omp_atk_fb_data:1411al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);1412break;1413case omp_atk_partition:1414al->memkind = RCAST(void **, traits[i].value);1415break;1416default:1417KMP_ASSERT2(0, "Unexpected allocator trait");1418}1419}1420if (al->fb == 0) {1421// set default allocator1422al->fb = omp_atv_default_mem_fb;1423al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;1424} else if (al->fb == omp_atv_allocator_fb) {1425KMP_ASSERT(al->fb_data != NULL);1426} else if (al->fb == omp_atv_default_mem_fb) {1427al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;1428}1429if (__kmp_memkind_available) {1430// Let's use memkind library if available1431if (ms == omp_high_bw_mem_space) {1432if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) {1433al->memkind = mk_hbw_interleave;1434} else if 
(mk_hbw_preferred) {1435// AC: do not try to use MEMKIND_HBW for now, because memkind library1436// cannot reliably detect exhaustion of HBW memory.1437// It could be possible using hbw_verify_memory_region() but memkind1438// manual says: "Using this function in production code may result in1439// serious performance penalty".1440al->memkind = mk_hbw_preferred;1441} else {1442// HBW is requested but not available --> return NULL allocator1443__kmp_free(al);1444return omp_null_allocator;1445}1446} else if (ms == omp_large_cap_mem_space) {1447if (mk_dax_kmem_all) {1448// All pmem nodes are visited1449al->memkind = mk_dax_kmem_all;1450} else if (mk_dax_kmem) {1451// Only closest pmem node is visited1452al->memkind = mk_dax_kmem;1453} else {1454__kmp_free(al);1455return omp_null_allocator;1456}1457} else {1458if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) {1459al->memkind = mk_interleave;1460} else {1461al->memkind = mk_default;1462}1463}1464} else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {1465__kmp_free(al);1466return omp_null_allocator;1467} else {1468if (ms == omp_high_bw_mem_space) {1469// cannot detect HBW memory presence without memkind library1470__kmp_free(al);1471return omp_null_allocator;1472}1473}1474return (omp_allocator_handle_t)al;1475}14761477void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {1478if (allocator > kmp_max_mem_alloc)1479__kmp_free(allocator);1480}14811482void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {1483if (allocator == omp_null_allocator)1484allocator = omp_default_mem_alloc;1485__kmp_threads[gtid]->th.th_def_allocator = allocator;1486}14871488omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {1489return __kmp_threads[gtid]->th.th_def_allocator;1490}14911492typedef struct kmp_mem_desc { // Memory block descriptor1493void *ptr_alloc; // Pointer returned by allocator1494size_t size_a; // Size of allocated memory block (initial+descriptor+align)1495size_t size_orig; // Original size requested1496void *ptr_align; // Pointer to aligned memory, returned1497kmp_allocator_t *allocator; // allocator1498} kmp_mem_desc_t;1499static int alignment = sizeof(void *); // align to pointer size by default15001501// external interfaces are wrappers over internal implementation1502void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {1503KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));1504void *ptr = __kmp_alloc(gtid, 0, size, allocator);1505KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));1506return ptr;1507}15081509void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,1510omp_allocator_handle_t allocator) {1511KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,1512(int)size, allocator));1513void *ptr = __kmp_alloc(gtid, algn, size, allocator);1514KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));1515return ptr;1516}15171518void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,1519omp_allocator_handle_t allocator) {1520KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,1521(int)size, allocator));1522void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);1523KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));1524return ptr;1525}15261527void *__kmpc_realloc(int gtid, void *ptr, size_t size,1528omp_allocator_handle_t allocator,1529omp_allocator_handle_t free_allocator) {1530KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", 
gtid, ptr, (int)size,1531allocator, free_allocator));1532void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);1533KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));1534return nptr;1535}15361537void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {1538KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));1539___kmpc_free(gtid, ptr, allocator);1540KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));1541return;1542}15431544// internal implementation, called from inside the library1545void *__kmp_alloc(int gtid, size_t algn, size_t size,1546omp_allocator_handle_t allocator) {1547void *ptr = NULL;1548kmp_allocator_t *al;1549KMP_DEBUG_ASSERT(__kmp_init_serial);1550if (size == 0)1551return NULL;1552if (allocator == omp_null_allocator)1553allocator = __kmp_threads[gtid]->th.th_def_allocator;1554kmp_int32 default_device =1555__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;15561557al = RCAST(kmp_allocator_t *, allocator);15581559int sz_desc = sizeof(kmp_mem_desc_t);1560kmp_mem_desc_t desc;1561kmp_uintptr_t addr; // address returned by allocator1562kmp_uintptr_t addr_align; // address to return to caller1563kmp_uintptr_t addr_descr; // address of memory block descriptor1564size_t align = alignment; // default alignment1565if (allocator > kmp_max_mem_alloc && al->alignment > align)1566align = al->alignment; // alignment required by allocator trait1567if (align < algn)1568align = algn; // max of allocator trait, parameter and sizeof(void*)1569desc.size_orig = size;1570desc.size_a = size + sz_desc + align;1571bool is_pinned = false;1572if (allocator > kmp_max_mem_alloc)1573is_pinned = al->pinned;15741575// Use default allocator if libmemkind is not available1576int use_default_allocator = (__kmp_memkind_available) ? 
false : true;15771578if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {1579// Use size input directly as the memory may not be accessible on host.1580// Use default device for now.1581if (__kmp_target_mem_available) {1582kmp_int32 device =1583__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;1584if (allocator == llvm_omp_target_host_mem_alloc)1585ptr = kmp_target_alloc_host(size, device);1586else if (allocator == llvm_omp_target_shared_mem_alloc)1587ptr = kmp_target_alloc_shared(size, device);1588else // allocator == llvm_omp_target_device_mem_alloc1589ptr = kmp_target_alloc_device(size, device);1590return ptr;1591} else {1592KMP_INFORM(TargetMemNotAvailable);1593}1594}15951596if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {1597if (__kmp_target_mem_available) {1598kmp_int32 device =1599__kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;1600if (al->memspace == llvm_omp_target_host_mem_space)1601ptr = kmp_target_alloc_host(size, device);1602else if (al->memspace == llvm_omp_target_shared_mem_space)1603ptr = kmp_target_alloc_shared(size, device);1604else // al->memspace == llvm_omp_target_device_mem_space1605ptr = kmp_target_alloc_device(size, device);1606return ptr;1607} else {1608KMP_INFORM(TargetMemNotAvailable);1609}1610}16111612if (__kmp_memkind_available) {1613if (allocator < kmp_max_mem_alloc) {1614// pre-defined allocator1615if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {1616ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);1617} else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) {1618ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a);1619} else {1620ptr = kmp_mk_alloc(*mk_default, desc.size_a);1621}1622} else if (al->pool_size > 0) {1623// custom allocator with pool size requested1624kmp_uint64 used =1625KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);1626if (used + desc.size_a > al->pool_size) {1627// not enough space, need to go fallback path1628KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);1629if (al->fb == omp_atv_default_mem_fb) {1630al = (kmp_allocator_t *)omp_default_mem_alloc;1631ptr = kmp_mk_alloc(*mk_default, desc.size_a);1632} else if (al->fb == omp_atv_abort_fb) {1633KMP_ASSERT(0); // abort fallback requested1634} else if (al->fb == omp_atv_allocator_fb) {1635KMP_ASSERT(al != al->fb_data);1636al = al->fb_data;1637ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);1638if (is_pinned && kmp_target_lock_mem)1639kmp_target_lock_mem(ptr, size, default_device);1640return ptr;1641} // else ptr == NULL;1642} else {1643// pool has enough space1644ptr = kmp_mk_alloc(*al->memkind, desc.size_a);1645if (ptr == NULL) {1646if (al->fb == omp_atv_default_mem_fb) {1647al = (kmp_allocator_t *)omp_default_mem_alloc;1648ptr = kmp_mk_alloc(*mk_default, desc.size_a);1649} else if (al->fb == omp_atv_abort_fb) {1650KMP_ASSERT(0); // abort fallback requested1651} else if (al->fb == omp_atv_allocator_fb) {1652KMP_ASSERT(al != al->fb_data);1653al = al->fb_data;1654ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);1655if (is_pinned && kmp_target_lock_mem)1656kmp_target_lock_mem(ptr, size, default_device);1657return ptr;1658}1659}1660}1661} else {1662// custom allocator, pool size not requested1663ptr = kmp_mk_alloc(*al->memkind, desc.size_a);1664if (ptr == NULL) {1665if (al->fb == omp_atv_default_mem_fb) {1666al = (kmp_allocator_t *)omp_default_mem_alloc;1667ptr = kmp_mk_alloc(*mk_default, desc.size_a);1668} else if (al->fb == omp_atv_abort_fb) {1669KMP_ASSERT(0); // 
abort fallback requested1670} else if (al->fb == omp_atv_allocator_fb) {1671KMP_ASSERT(al != al->fb_data);1672al = al->fb_data;1673ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);1674if (is_pinned && kmp_target_lock_mem)1675kmp_target_lock_mem(ptr, size, default_device);1676return ptr;1677}1678}1679}1680} else if (allocator < kmp_max_mem_alloc) {1681// pre-defined allocator1682if (allocator == omp_high_bw_mem_alloc) {1683KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");1684} else if (allocator == omp_large_cap_mem_alloc) {1685KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");1686} else if (allocator == omp_const_mem_alloc) {1687KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");1688} else if (allocator == omp_low_lat_mem_alloc) {1689KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");1690} else if (allocator == omp_cgroup_mem_alloc) {1691KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");1692} else if (allocator == omp_pteam_mem_alloc) {1693KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");1694} else if (allocator == omp_thread_mem_alloc) {1695KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");1696} else { // default allocator requested1697use_default_allocator = true;1698}1699if (use_default_allocator) {1700ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);1701use_default_allocator = false;1702}1703} else if (al->pool_size > 0) {1704// custom allocator with pool size requested1705kmp_uint64 used =1706KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);1707if (used + desc.size_a > al->pool_size) {1708// not enough space, need to go fallback path1709KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);1710if (al->fb == omp_atv_default_mem_fb) {1711al = (kmp_allocator_t *)omp_default_mem_alloc;1712ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);1713} else if (al->fb == omp_atv_abort_fb) {1714KMP_ASSERT(0); // abort fallback requested1715} else if (al->fb == omp_atv_allocator_fb) {1716KMP_ASSERT(al != al->fb_data);1717al = al->fb_data;1718ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);1719if (is_pinned && kmp_target_lock_mem)1720kmp_target_lock_mem(ptr, size, default_device);1721return ptr;1722} // else ptr == NULL;1723} else {1724// pool has enough space1725ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);1726if (ptr == NULL && al->fb == omp_atv_abort_fb) {1727KMP_ASSERT(0); // abort fallback requested1728} // no sense to look for another fallback because of same internal alloc1729}1730} else {1731// custom allocator, pool size not requested1732ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);1733if (ptr == NULL && al->fb == omp_atv_abort_fb) {1734KMP_ASSERT(0); // abort fallback requested1735} // no sense to look for another fallback because of same internal alloc1736}1737KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));1738if (ptr == NULL)1739return NULL;17401741if (is_pinned && kmp_target_lock_mem)1742kmp_target_lock_mem(ptr, desc.size_a, default_device);17431744addr = (kmp_uintptr_t)ptr;1745addr_align = (addr + sz_desc + align - 1) & ~(align - 1);1746addr_descr = addr_align - sz_desc;17471748desc.ptr_alloc = ptr;1749desc.ptr_align = (void *)addr_align;1750desc.allocator = al;1751*((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents1752KMP_MB();17531754return desc.ptr_align;1755}17561757void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,1758omp_allocator_handle_t allocator) {1759void *ptr 
void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
                   omp_allocator_handle_t allocator) {
  void *ptr = NULL;
  kmp_allocator_t *al;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (allocator == omp_null_allocator)
    allocator = __kmp_threads[gtid]->th.th_def_allocator;

  al = RCAST(kmp_allocator_t *, allocator);

  if (nmemb == 0 || size == 0)
    return ptr;

  if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) {
    if (al->fb == omp_atv_abort_fb) {
      KMP_ASSERT(0);
    }
    return ptr;
  }

  ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);

  if (ptr) {
    memset(ptr, 0x00, nmemb * size);
  }
  return ptr;
}

void *__kmp_realloc(int gtid, void *ptr, size_t size,
                    omp_allocator_handle_t allocator,
                    omp_allocator_handle_t free_allocator) {
  void *nptr = NULL;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (size == 0) {
    if (ptr != NULL)
      ___kmpc_free(gtid, ptr, free_allocator);
    return nptr;
  }

  nptr = __kmp_alloc(gtid, 0, size, allocator);

  if (nptr != NULL && ptr != NULL) {
    kmp_mem_desc_t desc;
    kmp_uintptr_t addr_align; // address to return to caller
    kmp_uintptr_t addr_descr; // address of memory block descriptor

    addr_align = (kmp_uintptr_t)ptr;
    addr_descr = addr_align - sizeof(kmp_mem_desc_t);
    desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor

    KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
    KMP_DEBUG_ASSERT(desc.size_orig > 0);
    KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a);
    KMP_MEMCPY((char *)nptr, (char *)ptr,
               (size_t)((size < desc.size_orig) ? size : desc.size_orig));
  }

  if (nptr != NULL) {
    ___kmpc_free(gtid, ptr, free_allocator);
  }

  return nptr;
}

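// Release a block obtained from __kmp_alloc/__kmp_calloc/__kmp_realloc.
// Freeing NULL is a no-op. Target-memory allocations are returned through the
// matching kmp_target_free_* hook; for everything else the descriptor stored
// just below the caller's pointer supplies the original allocation address and
// the owning allocator, pool accounting is rolled back, and the block goes
// back to memkind or the thread heap.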
void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
  if (ptr == NULL)
    return;

  kmp_allocator_t *al;
  omp_allocator_handle_t oal;
  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
  kmp_mem_desc_t desc;
  kmp_uintptr_t addr_align; // address to return to caller
  kmp_uintptr_t addr_descr; // address of memory block descriptor
  if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
                                     (allocator > kmp_max_mem_alloc &&
                                      KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
    kmp_int32 device =
        __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
    if (allocator == llvm_omp_target_host_mem_alloc) {
      kmp_target_free_host(ptr, device);
    } else if (allocator == llvm_omp_target_shared_mem_alloc) {
      kmp_target_free_shared(ptr, device);
    } else if (allocator == llvm_omp_target_device_mem_alloc) {
      kmp_target_free_device(ptr, device);
    }
    return;
  }

  addr_align = (kmp_uintptr_t)ptr;
  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor

  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
  if (allocator) {
    KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
  }
  al = desc.allocator;
  oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
  KMP_DEBUG_ASSERT(al);

  if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
    kmp_int32 device =
        __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
    kmp_target_unlock_mem(desc.ptr_alloc, device);
  }

  if (__kmp_memkind_available) {
    if (oal < kmp_max_mem_alloc) {
      // pre-defined allocator
      if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
        kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
      } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
        kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
      } else {
        kmp_mk_free(*mk_default, desc.ptr_alloc);
      }
    } else {
      if (al->pool_size > 0) { // custom allocator with pool size requested
        kmp_uint64 used =
            KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
        (void)used; // to suppress compiler warning
        KMP_DEBUG_ASSERT(used >= desc.size_a);
      }
      kmp_mk_free(*al->memkind, desc.ptr_alloc);
    }
  } else {
    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
      kmp_uint64 used =
          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
      (void)used; // to suppress compiler warning
      KMP_DEBUG_ASSERT(used >= desc.size_a);
    }
    __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
  }
}

/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
   memory leaks, but it may be useful for debugging memory corruptions, use of
   freed pointers, etc. */
/* #define LEAK_MEMORY */
struct kmp_mem_descr { // Memory block descriptor.
  void *ptr_allocated; // Pointer returned by malloc(), subject for free().
  size_t size_allocated; // Size of allocated memory block.
  void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
  size_t size_aligned; // Size of aligned memory block.
};
typedef struct kmp_mem_descr kmp_mem_descr_t;

/* Allocate memory on requested boundary, fill allocated memory with 0x00.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
static void *___kmp_allocate_align(size_t size,
                                   size_t alignment KMP_SRC_LOC_DECL) {
  /* __kmp_allocate() allocates (by call to malloc()) a bigger memory block
     than requested in order to return a properly aligned pointer. The original
     pointer returned by malloc() and the size of the allocated block are saved
     in a descriptor just before the aligned pointer. This information is used
     by __kmp_free() -- it has to pass the original pointer, not the aligned
     one, to free().

          +---------+------------+-----------------------------------+---------+
          | padding | descriptor | aligned block                     | padding |
          +---------+------------+-----------------------------------+---------+
          ^                      ^
          |                      |
          |                      +- Aligned pointer returned to caller
          +- Pointer returned by malloc()

     Aligned block is filled with zeros, paddings are filled with 0xEF. */
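
  /* Worked example (illustrative numbers only): assume malloc() returns
     0x1008, alignment is 64 and sizeof(kmp_mem_descr_t) is 32 (two pointers
     plus two size_t fields on a 64-bit target). Then
        addr_aligned = (0x1008 + 32 + 64) & ~63 = 0x1040
        addr_descr   = 0x1040 - 32             = 0x1020
     so the descriptor always sits immediately below the pointer handed back
     to the caller, and __kmp_free() can recover it with one subtraction. */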

  kmp_mem_descr_t descr;
  kmp_uintptr_t addr_allocated; // Address returned by malloc().
  kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
  kmp_uintptr_t addr_descr; // Address of memory block descriptor.

  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
                (int)size, (int)alignment KMP_SRC_LOC_PARM));

  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big.
  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
  // Make sure kmp_uintptr_t is enough to store addresses.

  descr.size_aligned = size;
  descr.size_allocated =
      descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;

#if KMP_DEBUG
  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
#else
  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
#endif
  KE_TRACE(10, (" malloc( %d ) returned %p\n", (int)descr.size_allocated,
                descr.ptr_allocated));
  if (descr.ptr_allocated == NULL) {
    KMP_FATAL(OutOfHeapMemory);
  }

  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
  addr_aligned =
      (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
  addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);

  descr.ptr_aligned = (void *)addr_aligned;

  KE_TRACE(26, (" ___kmp_allocate_align: "
                "ptr_allocated=%p, size_allocated=%d, "
                "ptr_aligned=%p, size_aligned=%d\n",
                descr.ptr_allocated, (int)descr.size_allocated,
                descr.ptr_aligned, (int)descr.size_aligned));

  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
                   addr_allocated + descr.size_allocated);
  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
#ifdef KMP_DEBUG
  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
  // Fill allocated memory block with 0xEF.
#endif
  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
  // Fill the aligned memory block (which is intended for use by the caller)
  // with 0x00. Do not put this filling under KMP_DEBUG condition! Many callers
  // expect zeroed memory. (Padding bytes remain filled with 0xEF in the
  // debugging library.)
  *((kmp_mem_descr_t *)addr_descr) = descr;

  KMP_MB();

  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
  return descr.ptr_aligned;
} // func ___kmp_allocate_align

/* Allocate memory on cache line boundary, fill allocated memory with 0x00.
   Do not call this func directly! Use __kmp_allocate macro instead.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
                (int)size KMP_SRC_LOC_PARM));
  ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
  return ptr;
} // func ___kmp_allocate

/* Allocate memory on page boundary, fill allocated memory with 0x00.
   Do not call this func directly! Use __kmp_page_allocate macro instead.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
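// Note that the "page" boundary used below is a fixed 8 KB quantum
// (page_size), not the operating system's page size.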
void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
  int page_size = 8 * 1024;
  void *ptr;

  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
                (int)size KMP_SRC_LOC_PARM));
  ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
  return ptr;
} // ___kmp_page_allocate

/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
   In debug mode, fill the memory block with 0xEF before call to free(). */
void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
  kmp_mem_descr_t descr;
#if KMP_DEBUG
  kmp_uintptr_t addr_allocated; // Address returned by malloc().
  kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
#endif
  KE_TRACE(25,
           ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
  KMP_ASSERT(ptr != NULL);

  descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));

  KE_TRACE(26, (" __kmp_free: "
                "ptr_allocated=%p, size_allocated=%d, "
                "ptr_aligned=%p, size_aligned=%d\n",
                descr.ptr_allocated, (int)descr.size_allocated,
                descr.ptr_aligned, (int)descr.size_aligned));
#if KMP_DEBUG
  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
  addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
                   addr_allocated + descr.size_allocated);
  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
  // Fill memory block with 0xEF, it helps catch using freed memory.
#endif

#ifndef LEAK_MEMORY
  KE_TRACE(10, (" free( %p )\n", descr.ptr_allocated));
#ifdef KMP_DEBUG
  _free_src_loc(descr.ptr_allocated, _file_, _line_);
#else
  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
#endif
#endif
  KMP_MB();
  KE_TRACE(25, ("<- __kmp_free() returns\n"));
} // func ___kmp_free

#if USE_FAST_MEMORY == 3
// Allocate fast memory by first scanning the thread's free lists
// If a chunk the right size exists, grab it off the free list.
// Otherwise allocate normally using kmp_thread_malloc.

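// The four free lists cache blocks of 2, 4, 16 and 64 cache lines (with
// DCACHE_LINE == 128 that is 256 B, 512 B, 2 KB and 8 KB). For example, a
// 300-byte request needs ceil(300/128) = 3 lines and is rounded up to the
// 4-line bucket (index 1), i.e. 512 bytes; anything above 64 lines (> 8 KB)
// bypasses the free lists and goes straight to bget().
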
// AC: How to choose the limit? Just get 16 for now...
#define KMP_FREE_LIST_LIMIT 16

// Always use 128 bytes for determining buckets for caching memory blocks
#define DCACHE_LINE 128

void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  size_t num_lines, idx;
  int index;
  void *alloc_ptr;
  size_t alloc_size;
  kmp_mem_descr_t *descr;

  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
                __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));

  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
  idx = num_lines - 1;
  KMP_DEBUG_ASSERT(idx >= 0);
  if (idx < 2) {
    index = 0; // idx is [ 0, 1 ], use first free list
    num_lines = 2; // 1, 2 cache lines or less than cache line
  } else if ((idx >>= 2) == 0) {
    index = 1; // idx is [ 2, 3 ], use second free list
    num_lines = 4; // 3, 4 cache lines
  } else if ((idx >>= 2) == 0) {
    index = 2; // idx is [ 4, 15 ], use third free list
    num_lines = 16; // 5, 6, ..., 16 cache lines
  } else if ((idx >>= 2) == 0) {
    index = 3; // idx is [ 16, 63 ], use fourth free list
    num_lines = 64; // 17, 18, ..., 64 cache lines
  } else {
    goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
  }

  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
  if (ptr != NULL) {
    // pop the head of no-sync free list
    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
    KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
                                                      sizeof(kmp_mem_descr_t)))
                                     ->ptr_aligned);
    goto end;
  }
  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
  if (ptr != NULL) {
    // no-sync free list is empty, use sync free list (filled in by other
    // threads only)
    // pop the head of the sync free list, push NULL instead
    while (!KMP_COMPARE_AND_STORE_PTR(
        &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
      KMP_CPU_PAUSE();
      ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
    }
    // push the rest of chain into no-sync free list (can be NULL if there was
    // the only block)
    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
    KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
                                                      sizeof(kmp_mem_descr_t)))
                                     ->ptr_aligned);
    goto end;
  }

alloc_call:
  // haven't found block in the free lists, thus allocate it
  size = num_lines * DCACHE_LINE;

  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
                "alloc_size %d\n",
                __kmp_gtid_from_thread(this_thr), alloc_size));
  alloc_ptr = bget(this_thr, (bufsize)alloc_size);

  // align ptr to DCACHE_LINE
  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
                  DCACHE_LINE) &
                 ~(DCACHE_LINE - 1));
  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));

  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
  // we don't need size_allocated
  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
  // (it is already saved in bget buffer,
  // but we may want to use another allocator in future)
  descr->size_aligned = size;

end:
  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
                __kmp_gtid_from_thread(this_thr), ptr));
  return ptr;
} // func __kmp_fast_allocate

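// Blocks freed by a thread other than their allocating thread are not handed
// back one at a time: they accumulate on the local th_free_list_other list
// until adding another block would exceed KMP_FREE_LIST_LIMIT or a block with
// a different owner shows up, at which point the whole chain is pushed onto
// the owner's th_free_list_sync list with a compare-and-swap loop.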
// Free fast memory and place it on the thread's free list if it is of
// the correct size.
void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
  kmp_mem_descr_t *descr;
  kmp_info_t *alloc_thr;
  size_t size;
  size_t idx;
  int index;

  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
                __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
  KMP_ASSERT(ptr != NULL);

  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));

  KE_TRACE(26, (" __kmp_fast_free: size_aligned=%d\n",
                (int)descr->size_aligned));

  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines

  idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
  if (idx == size) {
    index = 0; // 2 cache lines
  } else if ((idx <<= 1) == size) {
    index = 1; // 4 cache lines
  } else if ((idx <<= 2) == size) {
    index = 2; // 16 cache lines
  } else if ((idx <<= 2) == size) {
    index = 3; // 64 cache lines
  } else {
    KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
    goto free_call; // 65 or more cache lines ( > 8KB )
  }

  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
  if (alloc_thr == this_thr) {
    // push block to self no-sync free list, linking previous head (LIFO)
    *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
    this_thr->th.th_free_lists[index].th_free_list_self = ptr;
  } else {
    void *head = this_thr->th.th_free_lists[index].th_free_list_other;
    if (head == NULL) {
      // Create new free list
      this_thr->th.th_free_lists[index].th_free_list_other = ptr;
      *((void **)ptr) = NULL; // mark the tail of the list
      descr->size_allocated = (size_t)1; // head of the list keeps its length
    } else {
      // need to check the existing "other" list's owner thread and the size
      // of the queue
      kmp_mem_descr_t *dsc =
          (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
      // allocating thread, same for all queue nodes
      kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
      size_t q_sz =
          dsc->size_allocated + 1; // new size in case we add current task
      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
        // we can add current task to "other" list, no sync needed
        *((void **)ptr) = head;
        descr->size_allocated = q_sz;
        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
      } else {
        // either the queue's owner thread is changing or the size limit was
        // exceeded: return the old queue to its allocating thread (q_th)
        // synchronously, and start a new list for alloc_thr's tasks
        void *old_ptr;
        void *tail = head;
        void *next = *((void **)head);
        while (next != NULL) {
          KMP_DEBUG_ASSERT(
              // queue size should decrease by 1 each step through the list
              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
                      ->size_allocated +
                  1 ==
              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
                  ->size_allocated);
          tail = next; // remember tail node
          next = *((void **)next);
        }
        KMP_DEBUG_ASSERT(q_th != NULL);
        // push block to owner's sync free list
        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
        /* the next pointer must be set before setting free_list to ptr to
           avoid exposing a broken list to other threads, even for an
           instant. */
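        // Lock-free LIFO push of the whole chain [head .. tail] onto the
        // owner's sync free list: re-link tail->next to the current head and
        // retry the CAS until no other thread has modified the list meanwhile.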
        *((void **)tail) = old_ptr;

        while (!KMP_COMPARE_AND_STORE_PTR(
            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
          KMP_CPU_PAUSE();
          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
          *((void **)tail) = old_ptr;
        }

        // start new list of not-self tasks
        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
        *((void **)ptr) = NULL;
        descr->size_allocated = (size_t)1; // head of queue keeps its length
      }
    }
  }
  goto end;

free_call:
  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
                __kmp_gtid_from_thread(this_thr), size));
  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
  brel(this_thr, descr->ptr_allocated);

end:
  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));

} // func __kmp_fast_free

// Initialize the thread free lists related to fast memory
// Only do this when a thread is initially created.
void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));

  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
}

// Free the memory in the thread free lists related to fast memory
// Only do this when a thread is being reaped (destroyed).
void __kmp_free_fast_memory(kmp_info_t *th) {
  // Suppose we use BGET underlying allocator, walk through its structures...
  int bin;
  thr_data_t *thr = get_thr_data(th);
  void **lst = NULL;

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));

  __kmp_bget_dequeue(th); // Release any queued buffers

  // Dig through free lists and extract all allocated blocks
  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b = thr->freelist[bin].ql.flink;
    while (b != &thr->freelist[bin]) {
      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address
        *((void **)b) =
            lst; // link the list (override bthr, but keep flink yet)
        lst = (void **)b; // push b into lst
      }
      b = b->ql.flink; // get next buffer
    }
  }
  while (lst != NULL) {
    void *next = *lst;
    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
                  lst, next, th, __kmp_gtid_from_thread(th)));
    (*thr->relfcn)(lst);
#if BufStats
    // count blocks to prevent problems in __kmp_finalize_bget()
    thr->numprel++; /* Nr of expansion block releases */
    thr->numpblk--; /* Total number of blocks */
#endif
    lst = (void **)next;
  }

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
}

#endif // USE_FAST_MEMORY