Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h
/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() ||
            (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
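// A minimal usage sketch for kmp_hw_attr_t, assuming the hybrid core-type
// enumerators defined in kmp.h (e.g. KMP_HW_CORE_TYPE_CORE on x86); the
// snippet is illustrative only:
//
//   kmp_hw_attr_t request; // "any performance core, any efficiency"
//   request.set_core_type(KMP_HW_CORE_TYPE_CORE);
//
//   kmp_hw_attr_t hw; // attributes recorded for one hardware core
//   hw.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   hw.set_core_eff(1);
//
//   bool ok = hw.contains(request); // true: core type matches and the
//                                   // request leaves core_eff unspecified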
class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemA's per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. If they are, return true;
  // otherwise return false.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf(" num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;
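// A minimal sketch of how a parsed KMP_HW_SUBSET string such as "2c,1t" flows
// through kmp_hw_subset_t, assuming __kmp_topology has already been built with
// socket, core, and thread levels (canonicalize() requires them):
//
//   kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
//   subset->push_back(2, KMP_HW_CORE, 0, kmp_hw_attr_t{});   // "2c"
//   subset->push_back(1, KMP_HW_THREAD, 0, kmp_hw_attr_t{}); // "1t"
//   subset->canonicalize(__kmp_topology); // now *s,2c,1t and marked absolute
//   // ... the topology is then filtered against the canonicalized subset ...
//   kmp_hw_subset_t::deallocate(subset);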
/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when a non-OpenMP thread repeatedly launches/joins a thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
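  // Worked example for the numPerLevel comment above: with numPerLevel =
  // {2, 4, 4, 1, 1, 1, 1} (4 packages, 4 cores/package, 2 threads/core),
  // init() computes depth = 4 and skipPerLevel = {1, 2, 8, 32, 64, 128, 256}.
  // For i < depth, skipPerLevel[i] is the product of numPerLevel[0..i-1], the
  // number of leaf threads under one node at level i; the remaining entries
  // are successively doubled to leave room for oversubscription.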
  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H