Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch_hier.h
/*
 * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H
#include "kmp.h"
#include "kmp_dispatch.h"

// Layer type for scheduling hierarchy
enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1,
  LAYER_L2,
  LAYER_L3,
  LAYER_NUMA,
  LAYER_LOOP,
  LAYER_LAST
};

// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD:
    return "THREAD";
  case kmp_hier_layer_e::LAYER_L1:
    return "L1";
  case kmp_hier_layer_e::LAYER_L2:
    return "L2";
  case kmp_hier_layer_e::LAYER_L3:
    return "L3";
  case kmp_hier_layer_e::LAYER_NUMA:
    return "NUMA";
  case kmp_hier_layer_e::LAYER_LOOP:
    return "WHOLE_LOOP";
  case kmp_hier_layer_e::LAYER_LAST:
    return "LAST";
  }
  KMP_ASSERT(0);
  // Appease compilers, should never get here
  return "ERROR";
}

// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
typedef struct kmp_hier_sched_env_t {
  int size;
  int capacity;
  enum sched_type *scheds;
  kmp_int32 *small_chunks;
  kmp_int64 *large_chunks;
  kmp_hier_layer_e *layers;
  // Append a level of the hierarchy
  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
    if (capacity == 0) {
      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                                  kmp_hier_layer_e::LAYER_LAST);
      capacity = kmp_hier_layer_e::LAYER_LAST;
    }
    int current_size = size;
    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
    scheds[current_size] = sched;
    layers[current_size] = layer;
    small_chunks[current_size] = chunk;
    large_chunks[current_size] = (kmp_int64)chunk;
    size++;
  }
  // Sort the hierarchy using selection sort, size will always be small
  // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
  void sort() {
    if (size <= 1)
      return;
    for (int i = 0; i < size; ++i) {
      int switch_index = i;
      for (int j = i + 1; j < size; ++j) {
        if (layers[j] < layers[switch_index])
          switch_index = j;
      }
      if (switch_index != i) {
        kmp_hier_layer_e temp1 = layers[i];
        enum sched_type temp2 = scheds[i];
        kmp_int32 temp3 = small_chunks[i];
        kmp_int64 temp4 = large_chunks[i];
        layers[i] = layers[switch_index];
        scheds[i] = scheds[switch_index];
        small_chunks[i] = small_chunks[switch_index];
        large_chunks[i] = large_chunks[switch_index];
        layers[switch_index] = temp1;
        scheds[switch_index] = temp2;
        small_chunks[switch_index] = temp3;
        large_chunks[switch_index] = temp4;
      }
    }
  }
  // Free all memory
  void deallocate() {
    if (capacity > 0) {
      __kmp_free(scheds);
      __kmp_free(layers);
      __kmp_free(small_chunks);
      __kmp_free(large_chunks);
      scheds = NULL;
      layers = NULL;
      small_chunks = NULL;
      large_chunks = NULL;
    }
    size = 0;
    capacity = 0;
  }
} kmp_hier_sched_env_t;
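
// Illustrative use only (the schedule string below is a hypothetical example,
// not a documented format): while a hierarchical schedule such as
// "L1 dynamic,4 : NUMA static" is parsed, each layer is appended and the
// result is sorted from lowest to highest layer before it is consumed:
//   __kmp_hier_scheds.append(kmp_sch_dynamic_chunked, 4,
//                            kmp_hier_layer_e::LAYER_L1);
//   __kmp_hier_scheds.append(kmp_sch_static, 0, kmp_hier_layer_e::LAYER_NUMA);
//   __kmp_hier_scheds.sort();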

extern int __kmp_dispatch_hand_threading;
extern kmp_hier_sched_env_t __kmp_hier_scheds;

// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];

extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
                                        kmp_hier_layer_e t2);
extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);

template <typename T> struct kmp_hier_shared_bdata_t {
  typedef typename traits_t<T>::signed_t ST;
  volatile kmp_uint64 val[2];
  kmp_int32 status[2];
  T lb[2];
  T ub[2];
  ST st[2];
  dispatch_shared_info_template<T> sh[2];
  void zero() {
    val[0] = val[1] = 0;
    status[0] = status[1] = 0;
    lb[0] = lb[1] = 0;
    ub[0] = ub[1] = 0;
    st[0] = st[1] = 0;
    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
  }
  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
                            kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
  }
  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
    sh[1 - index].u.s.iteration = 0;
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return status[1 - index];
  }
  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return &(sh[1 - index]);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return &(sh[index]);
  }
};
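
// Note on the size-2 arrays above: they form a double buffer. Readers use the
// slot selected by their thread-private index (get_curr_*), while the unit's
// primary thread publishes the next iteration range into the opposite slot
// (set_next*/get_next_*, i.e. slot 1 - index). Every thread flips its private
// index when it passes the unit's barrier, so one slot can be refilled while
// the other is still being consumed.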

/*
 * In the barrier implementations, num_active is the number of threads that are
 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
 * structure. tdata is the thread private data that resides on the thread
 * data structure.
 *
 * The reset_shared() method is used to initialize the barrier data on the
 * kmp_hier_top_unit_t hierarchy structure
 *
 * The reset_private() method is used to initialize the barrier data on the
 * thread's private dispatch buffer structure
 *
 * The barrier() method takes an id, which is that thread's id for the
 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait
 * inside barrier() until all fellow threads who are attached to that
 * kmp_hier_top_unit_t structure have arrived.
 */

// Core barrier implementation
// Can be used in a unit with between 2 to 8 threads
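// Each of the (at most eight) participating threads owns one byte of a shared
// 64-bit word and stores 0x01 into it on arrival; the barrier completes once
// the word equals the wait value for the active thread count (e.g., three
// threads -> 0x010101). On alternate generations the threads store 0x00 and
// wait for the word to return to zero, so no separate reset step is needed.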
template <typename T> class core_barrier_impl {
  static inline kmp_uint64 get_wait_val(int num_active) {
    kmp_uint64 wait_val = 0LL;
    switch (num_active) {
    case 2:
      wait_val = 0x0101LL;
      break;
    case 3:
      wait_val = 0x010101LL;
      break;
    case 4:
      wait_val = 0x01010101LL;
      break;
    case 5:
      wait_val = 0x0101010101LL;
      break;
    case 6:
      wait_val = 0x010101010101LL;
      break;
    case 7:
      wait_val = 0x01010101010101LL;
      break;
    case 8:
      wait_val = 0x0101010101010101LL;
      break;
    default:
      // don't use the core_barrier_impl for more than 8 threads
      KMP_ASSERT(0);
    }
    return wait_val;
  }

public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
template <typename T>
void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                        kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}

// Counter barrier implementation
// Can be used in a unit with arbitrary number of active threads
template <typename T> class counter_barrier_impl {
public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
template <typename T>
void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                           kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  volatile kmp_int64 *val;
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;

  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
  KMP_TEST_THEN_INC64(val);
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}

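// In the counter variant each arriving thread atomically increments the shared
// counter and waits until it reaches (or passes) the expected total; rather
// than being reset, the expected value simply grows by num_active for the next
// use of the same buffer slot.
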
// Data associated with topology unit within a layer
// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
template <typename T> struct kmp_hier_top_unit_t {
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_int32 active; // number of topology units that communicate with this unit
  // chunk information (lower/upper bound, stride, etc.)
  dispatch_private_info_template<T> hier_pr;
  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit

  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
  void reset_shared_barrier() {
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    hier_barrier.zero();
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
    } else {
      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
    }
  }
  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_private(active, tdata);
    } else {
      counter_barrier_impl<T>::reset_private(active, tdata);
    }
  }
  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    KMP_DEBUG_ASSERT(id >= 0 && id < active);
    if (active == 1) {
      tdata->index = 1 - tdata->index;
      return;
    }
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    } else {
      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    }
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return hier_barrier.get_next_status(index);
  }
  T get_next_lb(kmp_uint64 index) const {
    return hier_barrier.get_next_lb(index);
  }
  T get_next_ub(kmp_uint64 index) const {
    return hier_barrier.get_next_ub(index);
  }
  ST get_next_st(kmp_uint64 index) const {
    return hier_barrier.get_next_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return hier_barrier.get_next_sh(index);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const {
    return hier_barrier.get_curr_status(index);
  }
  T get_curr_lb(kmp_uint64 index) const {
    return hier_barrier.get_curr_lb(index);
  }
  T get_curr_ub(kmp_uint64 index) const {
    return hier_barrier.get_curr_ub(index);
  }
  ST get_curr_st(kmp_uint64 index) const {
    return hier_barrier.get_curr_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return hier_barrier.get_curr_sh(index);
  }

  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
                            kmp_uint64 index) {
    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
  }
  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
    hier_barrier.set_next(lb, ub, st, status, index);
  }
  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
  dispatch_private_info_template<T> *get_parent_pr() {
    return &(hier_parent->hier_pr);
  }

  kmp_int32 is_active() const { return active; }
  kmp_int32 get_num_active() const { return active; }
#ifdef KMP_DEBUG
  void print() {
    KD_TRACE(
        10,
        ("  kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
         hier_pr.u.p.tc));
  }
#endif
};

// Information regarding a single layer within the scheduling hierarchy
template <typename T> struct kmp_hier_layer_info_t {
  int num_active; // number of threads active in this level
  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
  enum sched_type sched; // static, dynamic, guided, etc.
  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
  int length; // length of the kmp_hier_top_unit_t array

#ifdef KMP_DEBUG
  // Print this layer's information
  void print() {
    const char *t = __kmp_get_hier_str(type);
    KD_TRACE(
        10,
        ("  kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
         "length:%d\n",
         num_active, t, sched, chunk, length));
  }
#endif
};

/*
 * Structure to implement entire hierarchy
 *
 * The hierarchy is kept as an array of arrays to represent the different
 * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the
 * highest layer.
 * Example:
 * [ 2 ] -> [ L3 | L3 ]
 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
 * There is also an array of layer_info_t which has information regarding
 * each layer
 */
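// How chunks flow through the hierarchy (summary of next()/next_recurse()
// below): a thread first takes chunks from its lowest-layer unit. When that
// unit runs out, the unit's primary thread recurses into the parent layer for
// a fresh range, reinitializes the unit's private buffer with it, and all
// threads attached to the unit synchronize on the unit's barrier before
// reading the published range from the shared double buffer.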
template <typename T> struct kmp_hier_t {
public:
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

private:
  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
                   kmp_int32 previous_id, int hier_level) {
    int status;
    kmp_info_t *th = __kmp_threads[gtid];
    auto parent = current->get_parent();
    bool last_layer = (hier_level == get_num_layers() - 1);
    KMP_DEBUG_ASSERT(th);
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
    KMP_DEBUG_ASSERT(current);
    KMP_DEBUG_ASSERT(hier_level >= 0);
    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent || last_layer);

    KD_TRACE(
        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));

    T hier_id = (T)current->get_hier_id();
    // Attempt to grab next iteration range for this level
    if (previous_id == 0) {
      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
                   gtid, hier_level));
      kmp_int32 contains_last;
      T my_lb, my_ub;
      ST my_st;
      T nproc;
      dispatch_shared_info_template<T> volatile *my_sh;
      dispatch_private_info_template<T> *my_pr;
      if (last_layer) {
        // last layer below the very top uses the single shared buffer
        // from the team struct.
        KD_TRACE(10,
                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
        nproc = (T)get_top_level_nproc();
      } else {
        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
        // structure
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                      gtid, hier_level));
        my_sh =
            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
        nproc = (T)parent->get_num_active();
      }
      my_pr = current->get_my_pr();
      KMP_DEBUG_ASSERT(my_sh);
      KMP_DEBUG_ASSERT(my_pr);
      enum sched_type schedule = get_sched(hier_level);
      ST chunk = (ST)get_chunk(hier_level);
      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                                &contains_last, &my_lb, &my_ub,
                                                &my_st, nproc, hier_id);
      KD_TRACE(
          10,
          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
           gtid, hier_level, status));
      // When no iterations are found (status == 0) and this is not the last
      // layer, attempt to go up the hierarchy for more iterations
      if (status == 0 && !last_layer) {
        kmp_int32 hid;
        __kmp_type_convert(hier_id, &hid);
        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                              &my_st, hid, hier_level + 1);
        KD_TRACE(
            10,
            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
             gtid, hier_level, status));
        if (status == 1) {
          kmp_hier_private_bdata_t *upper_tdata =
              &(th->th.th_hier_bar_data[hier_level + 1]);
          my_sh = parent->get_curr_sh(upper_tdata->index);
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                        gtid, hier_level));
          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                        parent->get_curr_lb(upper_tdata->index),
                                        parent->get_curr_ub(upper_tdata->index),
                                        parent->get_curr_st(upper_tdata->index),
#if USE_ITT_BUILD
                                        NULL,
#endif
                                        chunk, nproc, hier_id);
          status = __kmp_dispatch_next_algorithm<T>(
              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
              hier_id);
          if (!status) {
            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                          "setting to 2!\n",
                          gtid, hier_level));
            status = 2;
          }
        }
      }
      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
      // Propagate whether a unit holds the actual global last iteration
      // The contains_last attribute is sent downwards from the top to the
      // bottom of the hierarchy via the contains_last flag inside the
      // private dispatch buffers in the hierarchy's middle layers
      if (contains_last) {
        // If the next_algorithm() method returns 1 for p_last and it is the
        // last layer or our parent contains the last serial chunk, then the
        // chunk must contain the last serial iteration.
        if (last_layer || parent->hier_pr.flags.contains_last) {
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                        "to contain last.\n",
                        gtid, hier_level));
          current->hier_pr.flags.contains_last = contains_last;
        }
        if (!current->hier_pr.flags.contains_last)
          contains_last = FALSE;
      }
      if (p_last)
        *p_last = contains_last;
    } // if primary thread of this unit
    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
                gtid, hier_level));
      current->barrier(previous_id, tdata);
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
                gtid, hier_level, current->get_curr_status(tdata->index)));
    } else {
      KMP_DEBUG_ASSERT(previous_id == 0);
      return status;
    }
    return current->get_curr_status(tdata->index);
  }

public:
  int top_level_nproc;
  int num_layers;
  bool valid;
  int type_size;
  kmp_hier_layer_info_t<T> *info;
  kmp_hier_top_unit_t<T> **layers;
  // Deallocate all memory from this hierarchy
  void deallocate() {
    for (int i = 0; i < num_layers; ++i)
      if (layers[i] != NULL) {
        __kmp_free(layers[i]);
      }
    if (layers != NULL) {
      __kmp_free(layers);
      layers = NULL;
    }
    if (info != NULL) {
      __kmp_free(info);
      info = NULL;
    }
    num_layers = 0;
    valid = false;
  }
  // Returns true if reallocation is needed else false
  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                          const enum sched_type *new_scheds,
                          const ST *new_chunks) const {
    if (!valid || layers == NULL || info == NULL ||
        traits_t<T>::type_size != type_size || n != num_layers)
      return true;
    for (int i = 0; i < n; ++i) {
      if (info[i].type != new_layers[i])
        return true;
      if (info[i].sched != new_scheds[i])
        return true;
      if (info[i].chunk != new_chunks[i])
        return true;
    }
    return false;
  }
  // A single thread should call this function while the other threads wait.
  // It creates a new scheduling hierarchy consisting of new_layers, new_scheds
  // and new_chunks. These should come pre-sorted according to
  // kmp_hier_layer_e value. This function will try to avoid reallocation
  // if it can.
  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                     const enum sched_type *new_scheds, const ST *new_chunks) {
    top_level_nproc = 0;
    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
      KD_TRACE(
          10,
          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
      for (int i = 0; i < n; ++i) {
        info[i].num_active = 0;
        for (int j = 0; j < get_length(i); ++j)
          layers[i][j].active = 0;
      }
      return;
    }
    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
    deallocate();
    type_size = traits_t<T>::type_size;
    num_layers = n;
    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
        sizeof(kmp_hier_layer_info_t<T>) * n);
    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
        sizeof(kmp_hier_top_unit_t<T> *) * n);
    for (int i = 0; i < n; ++i) {
      int max = 0;
      kmp_hier_layer_e layer = new_layers[i];
      info[i].num_active = 0;
      info[i].type = layer;
      info[i].sched = new_scheds[i];
      info[i].chunk = new_chunks[i];
      max = __kmp_hier_max_units[layer + 1];
      if (max == 0) {
        valid = false;
        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
        deallocate();
        return;
      }
      info[i].length = max;
      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
          sizeof(kmp_hier_top_unit_t<T>) * max);
      for (int j = 0; j < max; ++j) {
        layers[i][j].active = 0;
        layers[i][j].hier_pr.flags.use_hier = TRUE;
      }
    }
    valid = true;
  }
  // loc - source file location
  // gtid - global thread identifier
  // pr - this thread's private dispatch buffer (corresponding with gtid)
  // p_last (return value) - pointer to flag indicating this set of iterations
  //                         contains last
  //                         iteration
  // p_lb (return value) - lower bound for this chunk of iterations
  // p_ub (return value) - upper bound for this chunk of iterations
  // p_st (return value) - stride for this chunk of iterations
  //
  // Returns 1 if there are more iterations to perform, 0 otherwise
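  //
  // Both branches below (hand threading and normal) follow the same shape:
  // take a chunk from the lowest-layer unit first, and only when that unit is
  // exhausted climb the hierarchy through next_recurse() to refill it.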
  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
    int status;
    kmp_int32 contains_last = 0;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
    auto parent = pr->get_parent();
    KMP_DEBUG_ASSERT(parent);
    KMP_DEBUG_ASSERT(th);
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent);
    T nproc = (T)parent->get_num_active();
    T unit_id = (T)pr->get_hier_id();
    KD_TRACE(
        10,
        ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
         gtid, nproc, unit_id));
    // Handthreading implementation
    // Each iteration is performed by all threads on last unit (typically
    // cores/tiles)
    // e.g., threads 0,1,2,3 all execute iteration 0
    //       threads 0,1,2,3 all execute iteration 1
    //       threads 4,5,6,7 all execute iteration 2
    //       threads 4,5,6,7 all execute iteration 3
    //       ... etc.
    if (__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
                gtid));
      if (unit_id == 0) {
        // For hand threading, the sh buffer on the lowest level is only ever
        // modified and read by the primary thread on that level. Because of
        // this, we can always use the first sh buffer.
        auto sh = &(parent->hier_barrier.sh[0]);
        KMP_DEBUG_ASSERT(sh);
        status = __kmp_dispatch_next_algorithm<T>(
            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
        if (!status) {
          bool done = false;
          while (!done) {
            done = true;
            kmp_int32 uid;
            __kmp_type_convert(unit_id, &uid);
            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                  p_st, uid, 0);
            if (status == 1) {
              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                            parent->get_next_lb(tdata->index),
                                            parent->get_next_ub(tdata->index),
                                            parent->get_next_st(tdata->index),
#if USE_ITT_BUILD
                                            NULL,
#endif
                                            pr->u.p.parm1, nproc, unit_id);
              sh->u.s.iteration = 0;
              status = __kmp_dispatch_next_algorithm<T>(
                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
                  unit_id);
              if (!status) {
                KD_TRACE(10,
                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                          "after next_pr_sh()"
                          "trying again.\n",
                          gtid));
                done = false;
              }
            } else if (status == 2) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                            "trying again.\n",
                            gtid));
              done = false;
            }
          }
        }
        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
      } // if primary thread of lowest unit level
      parent->barrier(pr->get_hier_id(), tdata);
      if (unit_id != 0) {
        *p_lb = parent->get_curr_lb(tdata->index);
        *p_ub = parent->get_curr_ub(tdata->index);
        *p_st = parent->get_curr_st(tdata->index);
        status = parent->get_curr_status(tdata->index);
      }
    } else {
      // Normal implementation
      // Each thread grabs an iteration chunk and executes it (no cooperation)
      auto sh = parent->get_curr_sh(tdata->index);
      KMP_DEBUG_ASSERT(sh);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
      if (!status) {
        bool done = false;
        while (!done) {
          done = true;
          kmp_int32 uid;
          __kmp_type_convert(unit_id, &uid);
          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                p_st, uid, 0);
          if (status == 1) {
            sh = parent->get_curr_sh(tdata->index);
            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                          parent->get_curr_lb(tdata->index),
                                          parent->get_curr_ub(tdata->index),
                                          parent->get_curr_st(tdata->index),
#if USE_ITT_BUILD
                                          NULL,
#endif
                                          pr->u.p.parm1, nproc, unit_id);
            status = __kmp_dispatch_next_algorithm<T>(
                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
            if (!status) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                            "after next_pr_sh()"
                            "trying again.\n",
                            gtid));
              done = false;
            }
          } else if (status == 2) {
            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                          "trying again.\n",
                          gtid));
            done = false;
          }
        }
      }
    }
    if (contains_last && !parent->hier_pr.flags.contains_last) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                    "contains_last to FALSE\n",
                    gtid));
      contains_last = FALSE;
    }
    if (p_last)
      *p_last = contains_last;
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
                  status));
    return status;
  }
  // These functions probe the layer info structure
  // Returns the type of topology unit given level
  kmp_hier_layer_e get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].type;
  }
  // Returns the schedule type at given level
  enum sched_type get_sched(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].sched;
  }
  // Returns the chunk size at given level
  ST get_chunk(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].chunk;
  }
  // Returns the number of active threads at given level
  int get_num_active(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].num_active;
  }
  // Returns the length of topology unit array at given level
  int get_length(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].length;
  }
  // Returns the topology unit given the level and index
  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    KMP_DEBUG_ASSERT(index >= 0);
    KMP_DEBUG_ASSERT(index < get_length(level));
    return &(layers[level][index]);
  }
  // Returns the number of layers in the hierarchy
  int get_num_layers() const { return num_layers; }
  // Returns the number of threads in the top layer
  // This is necessary because we don't store a topology unit as
  // the very top level and the scheduling algorithms need this information
  int get_top_level_nproc() const { return top_level_nproc; }
  // Return whether this hierarchy is valid or not
  bool is_valid() const { return valid; }
#ifdef KMP_DEBUG
  // Print the hierarchy
  void print() {
    KD_TRACE(10, ("kmp_hier_t:\n"));
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Info[%d] = ", i));
      info[i].print();
    }
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Layer[%d] =\n", i));
      for (int j = 0; j < info[i].length; ++j) {
        layers[i][j].print();
      }
    }
  }
#endif
};

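// Set up hierarchical scheduling for the current team's loop: the team's
// primary thread allocates (or reuses) the kmp_hier_t stored on the shared
// dispatch buffer, each thread then registers with the units it belongs to
// (walking up the layers until it reaches a unit another thread has already
// activated), barrier data is initialized, and the top layer's private buffer
// is seeded with the full [lb, ub] iteration range.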
template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  unsigned int my_buffer_index;
  kmp_info_t *th;
  kmp_team_t *team;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;
  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
#endif // KMP_DEBUG
  KMP_DEBUG_ASSERT(n > 0);
  KMP_DEBUG_ASSERT(new_layers);
  KMP_DEBUG_ASSERT(new_scheds);
  KMP_DEBUG_ASSERT(new_chunks);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    KMP_DEBUG_ASSERT(pr);
    pr->flags.use_hier = FALSE;
    pr->flags.contains_last = FALSE;
    return;
  }
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  pr->flags.use_hier = TRUE;
  pr->u.p.tc = 0;
  // Have primary thread allocate the hierarchy
  if (__kmp_tid_from_gtid(gtid) == 0) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                  "hierarchy\n",
                  gtid, pr, sh));
    if (sh->hier == NULL) {
      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
    }
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    sh->u.s.iteration = 0;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  // Check to make sure the hierarchy is valid
  kmp_hier_t<T> *hier = sh->hier;
  if (!sh->hier->is_valid()) {
    pr->flags.use_hier = FALSE;
    return;
  }
  // Have threads allocate their thread-private barrier data if it hasn't
  // already been allocated
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Have threads "register" themselves by modifying the active count for each
  // level they are involved in. The active count will act as nthreads for that
  // level regarding the scheduling algorithms
  for (int i = 0; i < n; ++i) {
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Setup the thread's private dispatch buffer's hierarchy pointers
    if (i == 0)
      pr->hier_parent = my_unit;
    // If this unit is already active, then increment active count and wait
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
    // Flag that this unit is active
    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
      // Do not setup parent pointer for top level unit since it has no parent
      if (i < n - 1) {
        // Setup middle layer pointers to parents
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 hier->get_type(i + 1));
        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
      } else {
        // Setup top layer information (no parent pointers are set)
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 kmp_hier_layer_e::LAYER_LOOP);
        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
        my_unit->hier_parent = nullptr;
      }
      // Set trip count to 0 so that next() operation will initially climb up
      // the hierarchy to get more iterations (early exit in next() for tc == 0)
      my_unit->get_my_pr()->u.p.tc = 0;
      // Increment this layer's number of active units
      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "incrementing num_active\n",
                    gtid, my_unit));
    } else {
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
  }
  // Set this thread's id
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // For oversubscribed threads, increment their index within the lowest unit
  // This is done to prevent having two or more threads with id 0, id 1, etc.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));

  pr->flags.contains_last = FALSE;
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Now that the number of active threads at each level is determined,
  // the barrier data for each unit can be initialized and the last layer's
  // loop information can be initialized.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Only primary threads of this unit within the hierarchy do initialization
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    my_unit->reset_shared_barrier();
    my_unit->hier_pr.flags.contains_last = FALSE;
    // Last layer, initialize the private buffers with entire loop information
    // Now the next next_algorithm() call will get the first chunk of
    // iterations properly
    if (i == n - 1) {
      __kmp_dispatch_init_algorithm<T>(
          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
#if USE_ITT_BUILD
          NULL,
#endif
          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    }
    prev_id = my_unit->get_hier_id();
  }
  // Initialize each layer of the thread's private barrier data
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

#ifdef KMP_DEBUG
  if (__kmp_tid_from_gtid(gtid) == 0) {
    for (int i = 0; i < n; ++i) {
      KD_TRACE(10,
               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
                gtid, i, hier->get_num_active(i)));
    }
    hier->print();
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#endif // KMP_DEBUG
}
#endif