Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp
/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
// Distributed barrier

// Compute how many threads to have polling each cache-line.
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
void distributedBarrier::computeVarsForN(size_t n) {
  int nsockets = 1;
  if (__kmp_topology) {
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);

    if (nsockets <= 0)
      nsockets = 1;
    if (ncores_per_socket <= 0)
      ncores_per_socket = 1;

    threads_per_go = ncores_per_socket >> 1;
    if (!fix_threads_per_go) {
      // Minimize num_gos
      if (threads_per_go > 4) {
        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
          threads_per_go = threads_per_go >> 1;
        }
        if (threads_per_go > 4 && nsockets == 1)
          threads_per_go = threads_per_go >> 1;
      }
    }
    if (threads_per_go == 0)
      threads_per_go = 1;
    fix_threads_per_go = true;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (nsockets == 1 || num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / nsockets;
      if (num_gos % nsockets)
        num_groups++;
    }
    if (num_groups <= 0)
      num_groups = 1;
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  } else {
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / 2;
      if (num_gos % 2)
        num_groups++;
    }
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  }
}
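
// Illustrative sizing example (not exhaustive; assumes __kmp_topology is
// available and KMP_OPTIMIZE_FOR_REDUCTIONS evaluates to 0): with n = 64
// threads on 2 sockets x 16 cores, threads_per_go = 16 >> 1 = 8,
// num_gos = 64 / 8 = 8, num_groups = 8 / 2 = 4, gos_per_group = 8 / 4 = 2,
// and threads_per_group = 8 * 2 = 16, so each group spans 16 consecutive tids.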

void distributedBarrier::computeGo(size_t n) {
  // Minimize num_gos
  for (num_gos = 1;; num_gos++)
    if (IDEAL_CONTENTION * num_gos >= n)
      break;
  threads_per_go = n / num_gos;
  if (n % num_gos)
    threads_per_go++;
  while (num_gos > MAX_GOS) {
    threads_per_go++;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
  }
  computeVarsForN(n);
}

// This function is to resize the barrier arrays when the new number of threads
// exceeds max_threads, which is the current size of all the arrays
void distributedBarrier::resize(size_t nthr) {
  KMP_DEBUG_ASSERT(nthr > max_threads);

  // expand to requested size * 2
  max_threads = nthr * 2;

  // allocate arrays to new max threads
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }

  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));

  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));

  if (sleep)
    sleep =
        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
}

// This function is to set all the go flags that threads might be waiting
// on, and when blocktime is not infinite, it should be followed by a wake-up
// call to each thread
kmp_uint64 distributedBarrier::go_release() {
  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
    go[j].go.store(next_go);
  }
  return next_go;
}

void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
    }
    go[j].go.store(0);
    iter[j].iter = 0;
  }
}

// This function inits/re-inits the distributed barrier for a particular number
// of threads. If a resize of arrays is needed, it calls the resize function.
void distributedBarrier::init(size_t nthr) {
  size_t old_max = max_threads;
  if (nthr > max_threads) { // need more space in arrays
    resize(nthr);
  }

  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    if (i >= old_max)
      sleep[i].sleep = false;
  }

  // Recalculate num_gos, etc. based on new nthr
  computeVarsForN(nthr);

  num_threads = nthr;

  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
}
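
// Note on the go/iter scheme used by the distributed barrier below: each
// barrier episode publishes go = iter + MAX_ITERS for the current per-thread
// iter value, and iter[tid] then advances modulo MAX_ITERS, so consecutive
// episodes store distinct go values and a waiter is not released early by a
// stale value from the previous episode. go_reset() returns everything to the
// initial state (go == 0, iter == 0, stillNeed == 1).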

// This function is used only when KMP_BLOCKTIME is not infinite.
// static
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    return;

  kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
    KMP_DEBUG_ASSERT(other_threads[thr]);
    int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
    // Wake up worker regardless of whether it appears to be sleeping or not
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
  }
}

static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_info_t **other_threads;
  kmp_uint64 my_current_iter, my_next_iter;
  kmp_uint32 nproc;
  bool group_leader;

  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
  b = team->t.b;
  my_current_iter = b->iter[tid].iter;
  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  group_leader = ((tid % b->threads_per_group) == 0);

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif

  if (group_leader) {
    // Start from the thread after the group leader
    size_t group_start = tid + 1;
    size_t group_end = tid + b->threads_per_group;
    size_t threads_pending = 0;

    if (group_end > nproc)
      group_end = nproc;
    do { // wait for threads in my group
      threads_pending = 0;
      // Check all the flags every time to avoid branch mispredict
      for (size_t thr = group_start; thr < group_end; thr++) {
        // Each thread uses a different cache line
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      OMPT_REDUCTION_DECL(this_thr, gtid);
      OMPT_REDUCTION_BEGIN;
      // Group leader reduces all threads in group
      for (size_t thr = group_start; thr < group_end; thr++) {
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[thr]->th.th_local.reduce_data);
      }
      OMPT_REDUCTION_END;
    }

    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;

    do { // wait for all group leaders
      threads_pending = 0;
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
          (*reduce)(this_thr->th.th_local.reduce_data,
                    other_threads[thr]->th.th_local.reduce_data);
        }
        OMPT_REDUCTION_END;
      }
    }
  } else {
    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;
  }

  KMP_MFENCE();

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
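
// The release path below mirrors the two-level gather above: the primary
// thread stores the new go value on each group leader's go flag, each group
// leader then fans the value out to the go flags of its own group, and, when
// blocktime is finite, the corresponding threads are explicitly woken
// afterwards via __kmp_dist_barrier_wakeup().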

static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is spinning
        // or sleeping until this location becomes 3 again; 3 is the transition
        // state to get to 1 which is waiting on go and being in the team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this barrier
      // before it was marked unused by the team. Unused threads are awoken and
      // shifted to wait on local thread struct elsewhere. It also might reach
      // this point by being picked up for use by a different team. Either way,
      // we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      if (this_thr->th.th_used_in_team.load() == 3) {
        KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the primary
      // thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { // Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
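
// The *_template functions above take `cancellable` as a compile-time
// template parameter, so the cancellation checks can be folded away in the
// common (non-cancellable) case; the thin wrappers below instantiate both
// variants behind plain, non-template function signatures.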

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
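
// Tree barrier indexing (branch_factor = 1 << branch_bits): thread `tid` has
// children tid * branch_factor + 1 .. tid * branch_factor + branch_factor
// (clamped to nproc) and parent (tid - 1) >> branch_bits. For example, with
// branch_bits == 2, thread 1's children are threads 5..8 and the parent of
// thread 7 is thread 1.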

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
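
// Hyper (hypercube-embedded tree) barrier: at each level the team is split
// into groups of branch_factor threads spaced 1 << level tids apart; group
// members whose (tid >> level) bits are non-zero check in with the group's
// lowest tid, which then moves up to the next level. With branch_bits == 1
// this is effectively the classic pairwise/binomial pattern: odd tids signal
// even tids at level 0, tid 2 signals tid 0 at level 1, and so on.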

// Hyper Barrier
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}
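
// The hierarchical barrier below organizes the team according to the machine
// hierarchy filled in by __kmp_get_hierarchy(): at level d a node's children
// sit at tid strides of skip_per_level[d] within a window of
// skip_per_level[d + 1], my_level is the thread's height in that tree
// (0 for leaves), and a parent's leaf children occupy the tids immediately
// after it. With infinite blocktime the leaves check in and are released
// through per-leaf bytes embedded in the parent's b_arrived/b_go words
// (kmp_flag_oncore); otherwise the ordinary per-thread 64-bit flags are used.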

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
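
// Note: leaf_state above marks one byte per leaf child inside the parent's
// 64-bit b_arrived word (byte 7 - i for the i-th leaf), and thr_bar->offset is
// computed so that offset + 1 names the matching byte from the child's side;
// kmp_flag_oncore uses that byte index for the leaf check-in and release in
// the gather/release functions below.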

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
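
// In the release below a worker either waits on its own b_go
// (KMP_BARRIER_OWN_FLAG) or, for leaves with infinite blocktime, on its byte
// of the parent's b_go (KMP_BARRIER_PARENT_FLAG); a waiter parked on the
// parent's flag may be switched back to its own b_go, which is reported via
// KMP_BARRIER_SWITCHING and handled once the wait returns.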

static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset + 1, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // primary already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if
(propagate_icvs && !KMP_MASTER_TID(tid))1726// non-leaves copy ICVs from fixed ICVs to local dest1727copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,1728&thr_bar->th_fixed_icvs);1729#endif // KMP_BARRIER_ICV_PUSH1730}1731KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "1732"barrier type %d\n",1733gtid, team->t.t_id, tid, bt));1734}17351736// End of Barrier Algorithms17371738// type traits for cancellable value1739// if cancellable is true, then is_cancellable is a normal boolean variable1740// if cancellable is false, then is_cancellable is a compile time constant1741template <bool cancellable> struct is_cancellable {};1742template <> struct is_cancellable<true> {1743bool value;1744is_cancellable() : value(false) {}1745is_cancellable(bool b) : value(b) {}1746is_cancellable &operator=(bool b) {1747value = b;1748return *this;1749}1750operator bool() const { return value; }1751};1752template <> struct is_cancellable<false> {1753is_cancellable &operator=(bool b) { return *this; }1754constexpr operator bool() const { return false; }1755};17561757// Internal function to do a barrier.1758/* If is_split is true, do a split barrier, otherwise, do a plain barrier1759If reduce is non-NULL, do a split reduction barrier, otherwise, do a split1760barrier1761When cancellable = false,1762Returns 0 if primary thread, 1 if worker thread.1763When cancellable = true1764Returns 0 if not cancelled, 1 if cancelled. */1765template <bool cancellable = false>1766static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,1767size_t reduce_size, void *reduce_data,1768void (*reduce)(void *, void *)) {1769KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);1770KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);1771int tid = __kmp_tid_from_gtid(gtid);1772kmp_info_t *this_thr = __kmp_threads[gtid];1773kmp_team_t *team = this_thr->th.th_team;1774int status = 0;1775is_cancellable<cancellable> cancelled;1776#if OMPT_SUPPORT && OMPT_OPTIONAL1777ompt_data_t *my_task_data;1778ompt_data_t *my_parallel_data;1779void *return_address;1780ompt_sync_region_t barrier_kind;1781#endif17821783KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,1784__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));17851786#if OMPT_SUPPORT1787if (ompt_enabled.enabled) {1788#if OMPT_OPTIONAL1789my_task_data = OMPT_CUR_TASK_DATA(this_thr);1790my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);1791return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);1792barrier_kind = __ompt_get_barrier_kind(bt, this_thr);1793if (ompt_enabled.ompt_callback_sync_region) {1794ompt_callbacks.ompt_callback(ompt_callback_sync_region)(1795barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,1796return_address);1797}1798if (ompt_enabled.ompt_callback_sync_region_wait) {1799ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(1800barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,1801return_address);1802}1803#endif1804// It is OK to report the barrier state after the barrier begin callback.1805// According to the OMPT specification, a compliant implementation may1806// even delay reporting this state until the barrier begins to wait.1807auto *ompt_thr_info = &this_thr->th.ompt_thread_info;1808switch (barrier_kind) {1809case ompt_sync_region_barrier_explicit:1810ompt_thr_info->state = ompt_state_wait_barrier_explicit;1811break;1812case ompt_sync_region_barrier_implicit_workshare:1813ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;1814break;1815case 
ompt_sync_region_barrier_implicit_parallel:1816ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;1817break;1818case ompt_sync_region_barrier_teams:1819ompt_thr_info->state = ompt_state_wait_barrier_teams;1820break;1821case ompt_sync_region_barrier_implementation:1822[[fallthrough]];1823default:1824ompt_thr_info->state = ompt_state_wait_barrier_implementation;1825}1826}1827#endif18281829if (!team->t.t_serialized) {1830#if USE_ITT_BUILD1831// This value will be used in itt notify events below.1832void *itt_sync_obj = NULL;1833#if USE_ITT_NOTIFY1834if (__itt_sync_create_ptr || KMP_ITT_DEBUG)1835itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);1836#endif1837#endif /* USE_ITT_BUILD */1838if (__kmp_tasking_mode == tskm_extra_barrier) {1839__kmp_tasking_barrier(team, this_thr, gtid);1840KA_TRACE(15,1841("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,1842__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));1843}18441845/* Copy the blocktime info to the thread, where __kmp_wait_template() can1846access it when the team struct is not guaranteed to exist. */1847// See note about the corresponding code in __kmp_join_barrier() being1848// performance-critical.1849if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {1850#if KMP_USE_MONITOR1851this_thr->th.th_team_bt_intervals =1852team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;1853this_thr->th.th_team_bt_set =1854team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;1855#else1856this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);1857#endif1858}18591860#if USE_ITT_BUILD1861if (__itt_sync_create_ptr || KMP_ITT_DEBUG)1862__kmp_itt_barrier_starting(gtid, itt_sync_obj);1863#endif /* USE_ITT_BUILD */1864#if USE_DEBUGGER1865// Let the debugger know: the thread arrived to the barrier and waiting.1866if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct1867team->t.t_bar[bt].b_master_arrived += 1;1868} else {1869this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;1870} // if1871#endif /* USE_DEBUGGER */1872if (reduce != NULL) {1873// KMP_DEBUG_ASSERT( is_split == TRUE ); // #C699561874this_thr->th.th_local.reduce_data = reduce_data;1875}18761877if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)1878__kmp_task_team_setup(this_thr, team);18791880if (cancellable) {1881cancelled = __kmp_linear_barrier_gather_cancellable(1882bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));1883} else {1884switch (__kmp_barrier_gather_pattern[bt]) {1885case bp_dist_bar: {1886__kmp_dist_barrier_gather(bt, this_thr, gtid, tid,1887reduce USE_ITT_BUILD_ARG(itt_sync_obj));1888break;1889}1890case bp_hyper_bar: {1891// don't set branch bits to 0; use linear1892KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);1893__kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,1894reduce USE_ITT_BUILD_ARG(itt_sync_obj));1895break;1896}1897case bp_hierarchical_bar: {1898__kmp_hierarchical_barrier_gather(1899bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));1900break;1901}1902case bp_tree_bar: {1903// don't set branch bits to 0; use linear1904KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);1905__kmp_tree_barrier_gather(bt, this_thr, gtid, tid,1906reduce USE_ITT_BUILD_ARG(itt_sync_obj));1907break;1908}1909default: {1910__kmp_linear_barrier_gather(bt, this_thr, gtid, tid,1911reduce USE_ITT_BUILD_ARG(itt_sync_obj));1912}1913}1914}19151916KMP_MB();19171918if (KMP_MASTER_TID(tid)) {1919status = 0;1920if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) 
{1921__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));1922}1923#if USE_DEBUGGER1924// Let the debugger know: All threads are arrived and starting leaving the1925// barrier.1926team->t.t_bar[bt].b_team_arrived += 1;1927#endif19281929if (__kmp_omp_cancellation) {1930kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);1931// Reset cancellation flag for worksharing constructs1932if (cancel_request == cancel_loop ||1933cancel_request == cancel_sections) {1934KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);1935}1936}1937#if USE_ITT_BUILD1938/* TODO: In case of split reduction barrier, primary thread may send1939acquired event early, before the final summation into the shared1940variable is done (final summation can be a long operation for array1941reductions). */1942if (__itt_sync_create_ptr || KMP_ITT_DEBUG)1943__kmp_itt_barrier_middle(gtid, itt_sync_obj);1944#endif /* USE_ITT_BUILD */1945#if USE_ITT_BUILD && USE_ITT_NOTIFY1946// Barrier - report frame end (only if active_level == 1)1947if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&1948__kmp_forkjoin_frames_mode &&1949(this_thr->th.th_teams_microtask == NULL || // either not in teams1950this_thr->th.th_teams_size.nteams == 1) && // or inside single team1951team->t.t_active_level == 1) {1952ident_t *loc = __kmp_threads[gtid]->th.th_ident;1953kmp_uint64 cur_time = __itt_get_timestamp();1954kmp_info_t **other_threads = team->t.t_threads;1955int nproc = this_thr->th.th_team_nproc;1956int i;1957switch (__kmp_forkjoin_frames_mode) {1958case 1:1959__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,1960loc, nproc);1961this_thr->th.th_frame_time = cur_time;1962break;1963case 2: // AC 2015-01-19: currently does not work for hierarchical (to1964// be fixed)1965__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,19661, loc, nproc);1967break;1968case 3:1969if (__itt_metadata_add_ptr) {1970// Initialize with primary thread's wait time1971kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;1972// Set arrive time to zero to be able to check it in1973// __kmp_invoke_task(); the same is done inside the loop below1974this_thr->th.th_bar_arrive_time = 0;1975for (i = 1; i < nproc; ++i) {1976delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);1977other_threads[i]->th.th_bar_arrive_time = 0;1978}1979__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,1980cur_time, delta,1981(kmp_uint64)(reduce != NULL));1982}1983__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,1984loc, nproc);1985this_thr->th.th_frame_time = cur_time;1986break;1987}1988}1989#endif /* USE_ITT_BUILD */1990} else {1991status = 1;1992#if USE_ITT_BUILD1993if (__itt_sync_create_ptr || KMP_ITT_DEBUG)1994__kmp_itt_barrier_middle(gtid, itt_sync_obj);1995#endif /* USE_ITT_BUILD */1996}1997if ((status == 1 || !is_split) && !cancelled) {1998if (cancellable) {1999cancelled = __kmp_linear_barrier_release_cancellable(2000bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2001} else {2002switch (__kmp_barrier_release_pattern[bt]) {2003case bp_dist_bar: {2004KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);2005__kmp_dist_barrier_release(bt, this_thr, gtid, tid,2006FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2007break;2008}2009case bp_hyper_bar: {2010KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);2011__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,2012FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2013break;2014}2015case bp_hierarchical_bar: 
{2016__kmp_hierarchical_barrier_release(2017bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2018break;2019}2020case bp_tree_bar: {2021KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);2022__kmp_tree_barrier_release(bt, this_thr, gtid, tid,2023FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2024break;2025}2026default: {2027__kmp_linear_barrier_release(bt, this_thr, gtid, tid,2028FALSE USE_ITT_BUILD_ARG(itt_sync_obj));2029}2030}2031}2032if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {2033__kmp_task_team_sync(this_thr, team);2034}2035}20362037#if USE_ITT_BUILD2038/* GEH: TODO: Move this under if-condition above and also include in2039__kmp_end_split_barrier(). This will more accurately represent the actual2040release time of the threads for split barriers. */2041if (__itt_sync_create_ptr || KMP_ITT_DEBUG)2042__kmp_itt_barrier_finished(gtid, itt_sync_obj);2043#endif /* USE_ITT_BUILD */2044} else { // Team is serialized.2045status = 0;2046if (__kmp_tasking_mode != tskm_immediate_exec) {2047if (this_thr->th.th_task_team != NULL) {2048#if USE_ITT_NOTIFY2049void *itt_sync_obj = NULL;2050if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {2051itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);2052__kmp_itt_barrier_starting(gtid, itt_sync_obj);2053}2054#endif20552056KMP_DEBUG_ASSERT(2057this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||2058this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==2059TRUE);2060__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));2061__kmp_task_team_setup(this_thr, team);20622063#if USE_ITT_BUILD2064if (__itt_sync_create_ptr || KMP_ITT_DEBUG)2065__kmp_itt_barrier_finished(gtid, itt_sync_obj);2066#endif /* USE_ITT_BUILD */2067}2068}2069}2070KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",2071gtid, __kmp_team_from_gtid(gtid)->t.t_id,2072__kmp_tid_from_gtid(gtid), status));20732074#if OMPT_SUPPORT2075if (ompt_enabled.enabled) {2076#if OMPT_OPTIONAL2077if (ompt_enabled.ompt_callback_sync_region_wait) {2078ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(2079barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,2080return_address);2081}2082if (ompt_enabled.ompt_callback_sync_region) {2083ompt_callbacks.ompt_callback(ompt_callback_sync_region)(2084barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,2085return_address);2086}2087#endif2088this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;2089}2090#endif20912092if (cancellable)2093return (int)cancelled;2094return status;2095}20962097// Returns 0 if primary thread, 1 if worker thread.2098int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,2099size_t reduce_size, void *reduce_data,2100void (*reduce)(void *, void *)) {2101return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,2102reduce);2103}21042105#if defined(KMP_GOMP_COMPAT)2106// Returns 1 if cancelled, 0 otherwise2107int __kmp_barrier_gomp_cancel(int gtid) {2108if (__kmp_omp_cancellation) {2109int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,21100, NULL, NULL);2111if (cancelled) {2112int tid = __kmp_tid_from_gtid(gtid);2113kmp_info_t *this_thr = __kmp_threads[gtid];2114if (KMP_MASTER_TID(tid)) {2115// Primary thread does not need to revert anything2116} else {2117// Workers need to revert their private b_arrived flag2118this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=2119KMP_BARRIER_STATE_BUMP;2120}2121}2122return cancelled;2123}2124__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, 
NULL);2125return FALSE;2126}2127#endif21282129void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {2130KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);2131KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);2132KMP_DEBUG_ASSERT(bt < bs_last_barrier);2133int tid = __kmp_tid_from_gtid(gtid);2134kmp_info_t *this_thr = __kmp_threads[gtid];2135kmp_team_t *team = this_thr->th.th_team;21362137if (!team->t.t_serialized) {2138if (KMP_MASTER_GTID(gtid)) {2139switch (__kmp_barrier_release_pattern[bt]) {2140case bp_dist_bar: {2141__kmp_dist_barrier_release(bt, this_thr, gtid, tid,2142FALSE USE_ITT_BUILD_ARG(NULL));2143break;2144}2145case bp_hyper_bar: {2146KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);2147__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,2148FALSE USE_ITT_BUILD_ARG(NULL));2149break;2150}2151case bp_hierarchical_bar: {2152__kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,2153FALSE USE_ITT_BUILD_ARG(NULL));2154break;2155}2156case bp_tree_bar: {2157KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);2158__kmp_tree_barrier_release(bt, this_thr, gtid, tid,2159FALSE USE_ITT_BUILD_ARG(NULL));2160break;2161}2162default: {2163__kmp_linear_barrier_release(bt, this_thr, gtid, tid,2164FALSE USE_ITT_BUILD_ARG(NULL));2165}2166}2167if (__kmp_tasking_mode != tskm_immediate_exec) {2168__kmp_task_team_sync(this_thr, team);2169} // if2170}2171}2172}21732174void __kmp_join_barrier(int gtid) {2175KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);2176KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);21772178KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);21792180kmp_info_t *this_thr = __kmp_threads[gtid];2181kmp_team_t *team;2182int tid;2183#ifdef KMP_DEBUG2184int team_id;2185#endif /* KMP_DEBUG */2186#if USE_ITT_BUILD2187void *itt_sync_obj = NULL;2188#if USE_ITT_NOTIFY2189if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need2190// Get object created at fork_barrier2191itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);2192#endif2193#endif /* USE_ITT_BUILD */2194#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)2195int nproc = this_thr->th.th_team_nproc;2196#endif2197KMP_MB();21982199// Get current info2200team = this_thr->th.th_team;2201KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);2202tid = __kmp_tid_from_gtid(gtid);2203#ifdef KMP_DEBUG2204team_id = team->t.t_id;2205kmp_info_t *master_thread = this_thr->th.th_team_master;2206if (master_thread != team->t.t_threads[0]) {2207__kmp_print_structure();2208}2209#endif /* KMP_DEBUG */2210KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);2211KMP_MB();22122213// Verify state2214KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));2215KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));2216KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);2217KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",2218gtid, team_id, tid));22192220#if OMPT_SUPPORT2221if (ompt_enabled.enabled) {2222#if OMPT_OPTIONAL2223ompt_data_t *my_task_data;2224ompt_data_t *my_parallel_data;2225void *codeptr = NULL;2226int ds_tid = this_thr->th.th_info.ds.ds_tid;2227if (KMP_MASTER_TID(ds_tid) &&2228(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||2229ompt_callbacks.ompt_callback(ompt_callback_sync_region)))2230codeptr = team->t.ompt_team_info.master_return_address;2231my_task_data = OMPT_CUR_TASK_DATA(this_thr);2232my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);2233ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;2234ompt_state_t ompt_state = 
        ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.
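
     As a hedged illustration only (mark_arrival() and use() are hypothetical
     stand-ins, not runtime code), the rule amounts to:

       int nproc_copy = this_thr->th.th_team_nproc; // copy while team is live
       mark_arrival();       // the gather above signals the primary thread
       use(nproc_copy);      // OK: thread-local copy
       use(team->t.t_nproc); // NOT OK: team may already be deallocated

     i.e. anything a worker needs after signalling arrival must be copied to
     thread-local or kmp_task_team_t storage beforehand.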
*/2324if (KMP_MASTER_TID(tid)) {2325if (__kmp_tasking_mode != tskm_immediate_exec) {2326__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));2327}2328if (__kmp_display_affinity) {2329KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);2330}2331#if KMP_STATS_ENABLED2332// Have primary thread flag the workers to indicate they are now waiting for2333// next parallel region, Also wake them up so they switch their timers to2334// idle.2335for (int i = 0; i < team->t.t_nproc; ++i) {2336kmp_info_t *team_thread = team->t.t_threads[i];2337if (team_thread == this_thr)2338continue;2339team_thread->th.th_stats->setIdleFlag();2340if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&2341team_thread->th.th_sleep_loc != NULL)2342__kmp_null_resume_wrapper(team_thread);2343}2344#endif2345#if USE_ITT_BUILD2346if (__itt_sync_create_ptr || KMP_ITT_DEBUG)2347__kmp_itt_barrier_middle(gtid, itt_sync_obj);2348#endif /* USE_ITT_BUILD */23492350#if USE_ITT_BUILD && USE_ITT_NOTIFY2351// Join barrier - report frame end2352if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&2353__kmp_forkjoin_frames_mode &&2354(this_thr->th.th_teams_microtask == NULL || // either not in teams2355this_thr->th.th_teams_size.nteams == 1) && // or inside single team2356team->t.t_active_level == 1) {2357kmp_uint64 cur_time = __itt_get_timestamp();2358ident_t *loc = team->t.t_ident;2359kmp_info_t **other_threads = team->t.t_threads;2360switch (__kmp_forkjoin_frames_mode) {2361case 1:2362__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,2363loc, nproc);2364break;2365case 2:2366__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,2367loc, nproc);2368break;2369case 3:2370if (__itt_metadata_add_ptr) {2371// Initialize with primary thread's wait time2372kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;2373// Set arrive time to zero to be able to check it in2374// __kmp_invoke_task(); the same is done inside the loop below2375this_thr->th.th_bar_arrive_time = 0;2376for (int i = 1; i < nproc; ++i) {2377delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);2378other_threads[i]->th.th_bar_arrive_time = 0;2379}2380__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,2381cur_time, delta, 0);2382}2383__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,2384loc, nproc);2385this_thr->th.th_frame_time = cur_time;2386break;2387}2388}2389#endif /* USE_ITT_BUILD */2390}2391#if USE_ITT_BUILD2392else {2393if (__itt_sync_create_ptr || KMP_ITT_DEBUG)2394__kmp_itt_barrier_middle(gtid, itt_sync_obj);2395}2396#endif /* USE_ITT_BUILD */23972398#if KMP_DEBUG2399if (KMP_MASTER_TID(tid)) {2400KA_TRACE(240115,2402("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",2403gtid, team_id, tid, nproc));2404}2405#endif /* KMP_DEBUG */24062407// TODO now, mark worker threads as done so they may be disbanded2408KMP_MB(); // Flush all pending memory write invalidates.2409KA_TRACE(10,2410("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));24112412}24132414// TODO release worker threads' fork barriers as we are ready instead of all at2415// once2416void __kmp_fork_barrier(int gtid, int tid) {2417KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);2418KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);2419kmp_info_t *this_thr = __kmp_threads[gtid];2420kmp_team_t *team = (tid == 0) ? 
this_thr->th.th_team : NULL;2421#if USE_ITT_BUILD2422void *itt_sync_obj = NULL;2423#endif /* USE_ITT_BUILD */2424#ifdef KMP_DEBUG2425if (team)2426KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,2427(team != NULL) ? team->t.t_id : -1, tid));2428#endif2429// th_team pointer only valid for primary thread here2430if (KMP_MASTER_TID(tid)) {2431#if USE_ITT_BUILD && USE_ITT_NOTIFY2432if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {2433// Create itt barrier object2434itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);2435__kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing2436}2437#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */24382439#ifdef KMP_DEBUG2440KMP_DEBUG_ASSERT(team);2441kmp_info_t **other_threads = team->t.t_threads;2442int i;24432444// Verify state2445KMP_MB();24462447for (i = 1; i < team->t.t_nproc; ++i) {2448KA_TRACE(500,2449("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "2450"== %u.\n",2451gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,2452team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,2453other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));2454KMP_DEBUG_ASSERT(2455(TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &2456~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);2457KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);2458}2459#endif24602461if (__kmp_tasking_mode != tskm_immediate_exec)2462__kmp_task_team_setup(this_thr, team);24632464/* The primary thread may have changed its blocktime between join barrier2465and fork barrier. Copy the blocktime info to the thread, where2466__kmp_wait_template() can access it when the team struct is not2467guaranteed to exist. */2468// See note about the corresponding code in __kmp_join_barrier() being2469// performance-critical2470if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {2471#if KMP_USE_MONITOR2472this_thr->th.th_team_bt_intervals =2473team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;2474this_thr->th.th_team_bt_set =2475team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;2476#else2477this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);2478#endif2479}2480} // primary thread24812482switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {2483case bp_dist_bar: {2484__kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,2485TRUE USE_ITT_BUILD_ARG(NULL));2486break;2487}2488case bp_hyper_bar: {2489KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);2490__kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,2491TRUE USE_ITT_BUILD_ARG(itt_sync_obj));2492break;2493}2494case bp_hierarchical_bar: {2495__kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,2496TRUE USE_ITT_BUILD_ARG(itt_sync_obj));2497break;2498}2499case bp_tree_bar: {2500KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);2501__kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,2502TRUE USE_ITT_BUILD_ARG(itt_sync_obj));2503break;2504}2505default: {2506__kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,2507TRUE USE_ITT_BUILD_ARG(itt_sync_obj));2508}2509}25102511#if OMPT_SUPPORT2512ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;2513if (ompt_enabled.enabled &&2514(ompt_state == ompt_state_wait_barrier_teams ||2515ompt_state == ompt_state_wait_barrier_implicit_parallel)) {2516int ds_tid = this_thr->th.th_info.ds.ds_tid;2517ompt_data_t *task_data = (team)2518? 
OMPT_CUR_TASK_DATA(this_thr)2519: &(this_thr->th.ompt_thread_info.task_data);2520this_thr->th.ompt_thread_info.state = ompt_state_overhead;2521#if OMPT_OPTIONAL2522void *codeptr = NULL;2523if (KMP_MASTER_TID(ds_tid) &&2524(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||2525ompt_callbacks.ompt_callback(ompt_callback_sync_region)))2526codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;2527ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;2528if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)2529sync_kind = ompt_sync_region_barrier_teams;2530if (ompt_enabled.ompt_callback_sync_region_wait) {2531ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(2532sync_kind, ompt_scope_end, NULL, task_data, codeptr);2533}2534if (ompt_enabled.ompt_callback_sync_region) {2535ompt_callbacks.ompt_callback(ompt_callback_sync_region)(2536sync_kind, ompt_scope_end, NULL, task_data, codeptr);2537}2538#endif2539if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {2540ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(2541ompt_scope_end, NULL, task_data, 0, ds_tid,2542ompt_task_implicit); // TODO: Can this be ompt_task_initial?2543}2544}2545#endif25462547// Early exit for reaping threads releasing forkjoin barrier2548if (TCR_4(__kmp_global.g.g_done)) {2549this_thr->th.th_task_team = NULL;25502551#if USE_ITT_BUILD && USE_ITT_NOTIFY2552if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {2553if (!KMP_MASTER_TID(tid)) {2554itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);2555if (itt_sync_obj)2556__kmp_itt_barrier_finished(gtid, itt_sync_obj);2557}2558}2559#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */2560KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));2561return;2562}25632564/* We can now assume that a valid team structure has been allocated by the2565primary thread and propagated to all worker threads. The current thread,2566however, may not be part of the team, so we can't blindly assume that the2567team pointer is non-null. */2568team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);2569KMP_DEBUG_ASSERT(team != NULL);2570tid = __kmp_tid_from_gtid(gtid);25712572#if KMP_BARRIER_ICV_PULL2573/* Primary thread's copy of the ICVs was set up on the implicit taskdata in2574__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's2575implicit task has this data before this function is called. We cannot2576modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's2577thread struct, because it is not always the case that the threads arrays2578have been allocated when __kmp_fork_call() is executed. 
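
     To summarize the PULL data flow (a restatement of the surrounding code,
     not additional behavior):

       primary, in __kmp_setup_icv_copy:
         new_icvs -> t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs
       each worker, below in this function:
         thread 0's th_fixed_icvs -> its own t_implicit_task_taskdata[tid].td_icvs

     so the primary does O(1) setup work and every worker copies the ICVs for
     itself once it has been released.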
*/2579{2580KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);2581if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs2582// Copy the initial ICVs from the primary thread's thread struct to the2583// implicit task for this tid.2584KA_TRACE(10,2585("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));2586__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,2587tid, FALSE);2588copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,2589&team->t.t_threads[0]2590->th.th_bar[bs_forkjoin_barrier]2591.bb.th_fixed_icvs);2592}2593}2594#endif // KMP_BARRIER_ICV_PULL25952596if (__kmp_tasking_mode != tskm_immediate_exec) {2597__kmp_task_team_sync(this_thr, team);2598}25992600#if KMP_AFFINITY_SUPPORTED2601kmp_proc_bind_t proc_bind = team->t.t_proc_bind;2602if (proc_bind == proc_bind_intel) {2603// Call dynamic affinity settings2604if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {2605__kmp_balanced_affinity(this_thr, team->t.t_nproc);2606}2607} else if (proc_bind != proc_bind_false) {2608if (this_thr->th.th_new_place == this_thr->th.th_current_place) {2609KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",2610__kmp_gtid_from_thread(this_thr),2611this_thr->th.th_current_place));2612} else {2613__kmp_affinity_bind_place(gtid);2614}2615}2616#endif // KMP_AFFINITY_SUPPORTED2617// Perform the display affinity functionality2618if (__kmp_display_affinity) {2619if (team->t.t_display_affinity2620#if KMP_AFFINITY_SUPPORTED2621|| (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)2622#endif2623) {2624// NULL means use the affinity-format-var ICV2625__kmp_aux_display_affinity(gtid, NULL);2626this_thr->th.th_prev_num_threads = team->t.t_nproc;2627this_thr->th.th_prev_level = team->t.t_level;2628}2629}2630if (!KMP_MASTER_TID(tid))2631KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);26322633#if USE_ITT_BUILD && USE_ITT_NOTIFY2634if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {2635if (!KMP_MASTER_TID(tid)) {2636// Get correct barrier object2637itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);2638__kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired2639} // (prepare called inside barrier_release)2640}2641#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */2642KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,2643team->t.t_id, tid));2644}26452646void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,2647kmp_internal_control_t *new_icvs, ident_t *loc) {2648KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);26492650KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);2651KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);26522653/* Primary thread's copy of the ICVs was set up on the implicit taskdata in2654__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's2655implicit task has this data before this function is called. */2656#if KMP_BARRIER_ICV_PULL2657/* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which2658remains untouched), where all of the worker threads can access them and2659make their own copies after the barrier. 
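
   (This keeps the setup cost here at O(1); the O(nthreads) copying is deferred
   to the workers themselves in __kmp_fork_barrier, in contrast to the linear
   #else branch below, which writes the ICVs into every implicit task at this
   point.)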
   */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
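
// Usage sketch (illustrative only; the actual entry points live elsewhere in
// the runtime, e.g. __kmpc_barrier in kmp_csupport.cpp). A plain, non-split
// barrier with no reduction is requested as
//
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE /* is_split */, 0, NULL, NULL);
//
// while a split reduction barrier also passes reduce_size/reduce_data and a
// combiner callback, which the gather algorithms invoke through the `reduce`
// pointer seen above. The gather/release algorithm used is selected per
// barrier type via __kmp_barrier_gather_pattern[bt] and
// __kmp_barrier_release_pattern[bt].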