Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp
/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // Nonmonotonic as default for dynamic schedule when no modifier is specified
  int monotonicity = SCHEDULE_NONMONOTONIC;

  // Let default be monotonic for executables
  // compiled with OpenMP* 4.5 or less compilers
  if (loc != NULL && loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal points
static inline float __kmp_round_2decimal_val(float num) {
  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
}
static inline int __kmp_get_round_val(float num) {
  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
}
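// Illustrative example of the two helpers above (values chosen arbitrarily):
// __kmp_round_2decimal_val(1.666f) computes (int)(166.6f + 0.5f) / 100.0f,
// i.e. 1.67f, and __kmp_get_round_val(2.6f) == 3 while
// __kmp_get_round_val(-2.6f) == -3, so both round half away from zero.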
#endif

template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk =
        pchunks / num_procs_with_pcore; // chunks per thread with p-core
    small_chunk =
        echunks / num_procs_with_ecore; // chunks per thread with e-core

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
}
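// Illustrative example of the even split above (numbers are hypothetical):
// with nchunks = 10 and nproc = 4, small_chunk = 2 and extras = 2, so thread
// ids 0..3 get init values 0, 3, 6 and 8 -- the first two threads own three
// chunks each and the remaining two own two chunks each.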
#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
};
#endif

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule,chunk. The loop description is found in lb (lower bound),
// ub (upper bound), and st (stride). nproc is the number of threads relevant
// to the scheduling (often the number of threads in a team, but not always if
// hierarchical scheduling is used). tid is the id of the thread calling
// the function within the group of nproc threads. It will have a value
// between 0 and nproc - 1. This is often just the thread id within a team, but
// is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }
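  // For example (illustrative): schedule(nonmonotonic : dynamic, 4) reaches
  // this point as kmp_sch_dynamic_chunked with the nonmonotonic modifier bit
  // set, so monotonicity is SCHEDULE_NONMONOTONIC and the modifier has been
  // stripped above; an ordered clause arrives as the corresponding kmp_ord_*
  // value, is mapped back to its kmp_sch_* counterpart, and forces monotonic
  // dispatch regardless of any modifier.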

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
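  // Worked example for the trip count computed below (illustrative values):
  // lb = 0, ub = 9, st = 2 gives tc = (9 - 0) / 2 + 1 = 5 iterations
  // (0, 2, 4, 6, 8); lb = 10, ub = 1, st = -3 gives tc = (10 - 1) / 3 + 1 = 4
  // iterations (10, 7, 4, 1).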
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier
=493(float)1 /494(((multiplier * num_procs_with_pcore) / nproc) + e_ratio);495float p_multiplier = multiplier * e_multiplier;496p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);497if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))498e_ntc =499(int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));500else501e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);502KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);503504// Use regular static steal if not enough chunks for skewed505// distribution506use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&507e_ntc >= num_procs_with_ecore)508? true509: false);510} else {511use_hybrid = false;512}513}514pr->flags.use_hybrid = use_hybrid;515pr->u.p.pchunks = p_ntc;516pr->u.p.num_procs_with_pcore = num_procs_with_pcore;517pr->u.p.first_thread_with_ecore = first_thread_with_ecore;518519if (use_hybrid) {520KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);521T big_chunk = p_ntc / num_procs_with_pcore;522small_chunk = e_ntc / num_procs_with_ecore;523524extras =525(p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);526527p_extra = (big_chunk - small_chunk);528529if (core_type == KMP_HW_CORE_TYPE_CORE) {530if (id < first_thread_with_ecore) {531init =532id * small_chunk + id * p_extra + (id < extras ? id : extras);533} else {534init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +535(id < extras ? id : extras);536}537} else {538if (id == first_thread_with_ecore) {539init =540id * small_chunk + id * p_extra + (id < extras ? id : extras);541} else {542init = id * small_chunk + first_thread_with_ecore * p_extra +543(id < extras ? id : extras);544}545}546p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;547} else548#endif549{550small_chunk = ntc / nproc;551extras = ntc % nproc;552init = id * small_chunk + (id < extras ? id : extras);553p_extra = 0;554}555pr->u.p.count = init;556if (claimed) { // are we succeeded in claiming own buffer?557pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);558// Other threads will inspect steal_flag when searching for a victim.559// READY means other threads may steal from this thread from now on.560KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);561} else {562// other thread has stolen whole our range563KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);564pr->u.p.ub = init; // mark there is no iterations to work on565}566pr->u.p.parm2 = ntc; // save number of chunks567// parm3 is the number of times to attempt stealing which is568// nproc (just a heuristics, could be optimized later on).569pr->u.p.parm3 = nproc;570pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid571break;572} else {573/* too few chunks: switching to kmp_sch_dynamic_chunked */574schedule = kmp_sch_dynamic_chunked;575KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "576"kmp_sch_dynamic_chunked\n",577gtid));578goto dynamic_init;579break;580} // if581} // case582#endif583case kmp_sch_static_balanced: {584T init, limit;585586KD_TRACE(587100,588("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",589gtid));590591if (nproc > 1) {592T id = tid;593594if (tc < nproc) {595if (id < tc) {596init = id;597limit = id;598pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */599} else {600pr->u.p.count = 1; /* means no more chunks to execute */601pr->u.p.parm1 = FALSE;602break;603}604} else {605T small_chunk = tc / nproc;606T extras = tc % nproc;607init = id * small_chunk + (id < extras ? id : extras);608limit = init + small_chunk - (id < extras ? 
0 : 1);609pr->u.p.parm1 = (id == nproc - 1);610}611} else {612if (tc > 0) {613init = 0;614limit = tc - 1;615pr->u.p.parm1 = TRUE;616} else {617// zero trip count618pr->u.p.count = 1; /* means no more chunks to execute */619pr->u.p.parm1 = FALSE;620break;621}622}623#if USE_ITT_BUILD624// Calculate chunk for metadata report625if (itt_need_metadata_reporting)626if (cur_chunk)627*cur_chunk = limit - init + 1;628#endif629if (st == 1) {630pr->u.p.lb = lb + init;631pr->u.p.ub = lb + limit;632} else {633// calculated upper bound, "ub" is user-defined upper bound634T ub_tmp = lb + limit * st;635pr->u.p.lb = lb + init * st;636// adjust upper bound to "ub" if needed, so that MS lastprivate will match637// it exactly638if (st > 0) {639pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);640} else {641pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);642}643}644if (pr->flags.ordered) {645pr->u.p.ordered_lower = init;646pr->u.p.ordered_upper = limit;647}648break;649} // case650case kmp_sch_static_balanced_chunked: {651// similar to balanced, but chunk adjusted to multiple of simd width652T nth = nproc;653KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"654" -> falling-through to static_greedy\n",655gtid));656schedule = kmp_sch_static_greedy;657if (nth > 1)658pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);659else660pr->u.p.parm1 = tc;661break;662} // case663case kmp_sch_guided_simd:664case kmp_sch_guided_iterative_chunked: {665KD_TRACE(666100,667("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"668" case\n",669gtid));670671if (nproc > 1) {672if ((2L * chunk + 1) * nproc >= tc) {673/* chunk size too large, switch to dynamic */674schedule = kmp_sch_dynamic_chunked;675goto dynamic_init;676} else {677// when remaining iters become less than parm2 - switch to dynamic678pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);679*(double *)&pr->u.p.parm3 =680guided_flt_param / (double)nproc; // may occupy parm3 and parm4681}682} else {683KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "684"kmp_sch_static_greedy\n",685gtid));686schedule = kmp_sch_static_greedy;687/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */688KD_TRACE(689100,690("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",691gtid));692pr->u.p.parm1 = tc;693} // if694} // case695break;696case kmp_sch_guided_analytical_chunked: {697KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "698"kmp_sch_guided_analytical_chunked case\n",699gtid));700701if (nproc > 1) {702if ((2L * chunk + 1) * nproc >= tc) {703/* chunk size too large, switch to dynamic */704schedule = kmp_sch_dynamic_chunked;705goto dynamic_init;706} else {707/* commonly used term: (2 nproc - 1)/(2 nproc) */708DBL x;709710#if KMP_USE_X87CONTROL711/* Linux* OS already has 64-bit computation by default for long double,712and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On713Windows* OS on IA-32 architecture, we need to set precision to 64-bit714instead of the default 53-bit. Even though long double doesn't work715on Windows* OS on Intel(R) 64, the resulting lack of precision is not716expected to impact the correctness of the algorithm, but this has not717been mathematically proven. 
*/718// save original FPCW and set precision to 64-bit, as719// Windows* OS on IA-32 architecture defaults to 53-bit720unsigned int oldFpcw = _control87(0, 0);721_control87(_PC_64, _MCW_PC); // 0,0x30000722#endif723/* value used for comparison in solver for cross-over point */724KMP_ASSERT(tc > 0);725long double target = ((long double)chunk * 2 + 1) * nproc / tc;726727/* crossover point--chunk indexes equal to or greater than728this point switch to dynamic-style scheduling */729UT cross;730731/* commonly used term: (2 nproc - 1)/(2 nproc) */732x = 1.0 - 0.5 / (double)nproc;733734#ifdef KMP_DEBUG735{ // test natural alignment736struct _test_a {737char a;738union {739char b;740DBL d;741};742} t;743ptrdiff_t natural_alignment =744(ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;745//__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long746// long)natural_alignment );747KMP_DEBUG_ASSERT(748(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);749}750#endif // KMP_DEBUG751752/* save the term in thread private dispatch structure */753*(DBL *)&pr->u.p.parm3 = x;754755/* solve for the crossover point to the nearest integer i for which C_i756<= chunk */757{758UT left, right, mid;759long double p;760761/* estimate initial upper and lower bound */762763/* doesn't matter what value right is as long as it is positive, but764it affects performance of the solver */765right = 229;766p = __kmp_pow<UT>(x, right);767if (p > target) {768do {769p *= p;770right <<= 1;771} while (p > target && right < (1 << 27));772/* lower bound is previous (failed) estimate of upper bound */773left = right >> 1;774} else {775left = 0;776}777778/* bisection root-finding method */779while (left + 1 < right) {780mid = (left + right) / 2;781if (__kmp_pow<UT>(x, mid) > target) {782left = mid;783} else {784right = mid;785}786} // while787cross = right;788}789/* assert sanity of computed crossover point */790KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&791__kmp_pow<UT>(x, cross) <= target);792793/* save the crossover point in thread private dispatch structure */794pr->u.p.parm2 = cross;795796// C75803797#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))798#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)799#else800#define GUIDED_ANALYTICAL_WORKAROUND (x)801#endif802/* dynamic-style scheduling offset */803pr->u.p.count = tc -804__kmp_dispatch_guided_remaining(805tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -806cross * chunk;807#if KMP_USE_X87CONTROL808// restore FPCW809_control87(oldFpcw, _MCW_PC);810#endif811} // if812} else {813KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "814"kmp_sch_static_greedy\n",815gtid));816schedule = kmp_sch_static_greedy;817/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */818pr->u.p.parm1 = tc;819} // if820} // case821break;822case kmp_sch_static_greedy:823KD_TRACE(824100,825("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",826gtid));827pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;828break;829case kmp_sch_static_chunked:830case kmp_sch_dynamic_chunked:831dynamic_init:832if (tc == 0)833break;834if (pr->u.p.parm1 <= 0)835pr->u.p.parm1 = KMP_DEFAULT_CHUNK;836else if (pr->u.p.parm1 > tc)837pr->u.p.parm1 = tc;838// Store the total number of chunks to prevent integer overflow during839// bounds calculations in the get next chunk routine.840pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 
1 : 0);841KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "842"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",843gtid));844break;845case kmp_sch_trapezoidal: {846/* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */847848T parm1, parm2, parm3, parm4;849KD_TRACE(100,850("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",851gtid));852853parm1 = chunk;854855/* F : size of the first cycle */856parm2 = (tc / (2 * nproc));857858if (parm2 < 1) {859parm2 = 1;860}861862/* L : size of the last cycle. Make sure the last cycle is not larger863than the first cycle. */864if (parm1 < 1) {865parm1 = 1;866} else if (parm1 > parm2) {867parm1 = parm2;868}869870/* N : number of cycles */871parm3 = (parm2 + parm1);872parm3 = (2 * tc + parm3 - 1) / parm3;873874if (parm3 < 2) {875parm3 = 2;876}877878/* sigma : decreasing incr of the trapezoid */879parm4 = (parm3 - 1);880parm4 = (parm2 - parm1) / parm4;881882// pointless check, because parm4 >= 0 always883// if ( parm4 < 0 ) {884// parm4 = 0;885//}886887pr->u.p.parm1 = parm1;888pr->u.p.parm2 = parm2;889pr->u.p.parm3 = parm3;890pr->u.p.parm4 = parm4;891} // case892break;893894default: {895__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message896KMP_HNT(GetNewerLibrary), // Hint897__kmp_msg_null // Variadic argument list terminator898);899} break;900} // switch901pr->schedule = schedule;902}903904#if KMP_USE_HIER_SCHED905template <typename T>906inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,907typename traits_t<T>::signed_t st);908template <>909inline void910__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,911kmp_int32 ub, kmp_int32 st) {912__kmp_dispatch_init_hierarchy<kmp_int32>(913loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,914__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);915}916template <>917inline void918__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,919kmp_uint32 ub, kmp_int32 st) {920__kmp_dispatch_init_hierarchy<kmp_uint32>(921loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,922__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);923}924template <>925inline void926__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,927kmp_int64 ub, kmp_int64 st) {928__kmp_dispatch_init_hierarchy<kmp_int64>(929loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,930__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);931}932template <>933inline void934__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,935kmp_uint64 ub, kmp_int64 st) {936__kmp_dispatch_init_hierarchy<kmp_uint64>(937loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,938__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);939}940941// free all the hierarchy scheduling memory associated with the team942void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {943int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2;944for (int i = 0; i < num_disp_buff; ++i) {945// type does not matter here so use kmp_int32946auto sh =947reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(948&team->t.t_disp_buffer[i]);949if (sh->hier) {950sh->hier->deallocate();951__kmp_free(sh->hier);952}953}954}955#endif956957// UT - unsigned flavor of T, ST - signed flavor of T,958// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8959template <typename T>960static void961__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,962T ub, typename traits_t<T>::signed_t st,963typename traits_t<T>::signed_t chunk, int push_ws) {964typedef typename traits_t<T>::unsigned_t UT;965966int active;967kmp_info_t *th;968kmp_team_t *team;969kmp_uint32 my_buffer_index;970dispatch_private_info_template<T> *pr;971dispatch_shared_info_template<T> volatile *sh;972973KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==974sizeof(dispatch_private_info));975KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==976sizeof(dispatch_shared_info));977__kmp_assert_valid_gtid(gtid);978979if (!TCR_4(__kmp_init_parallel))980__kmp_parallel_initialize();981982__kmp_resume_if_soft_paused();983984#if INCLUDE_SSC_MARKS985SSC_MARK_DISPATCH_INIT();986#endif987#ifdef KMP_DEBUG988typedef typename traits_t<T>::signed_t ST;989{990char *buff;991// create format specifiers before the debug output992buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "993"chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",994traits_t<ST>::spec, traits_t<T>::spec,995traits_t<T>::spec, traits_t<ST>::spec);996KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));997__kmp_str_free(&buff);998}999#endif1000/* setup data */1001th = __kmp_threads[gtid];1002team = th->th.th_team;1003active = !team->t.t_serialized;1004th->th.th_ident = loc;10051006// Any half-decent optimizer will remove this test when the blocks are empty1007// since the macros expand to nothing1008// when statistics are disabled.1009if (schedule == __kmp_static) {1010KMP_COUNT_BLOCK(OMP_LOOP_STATIC);1011} else {1012KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);1013}10141015#if KMP_USE_HIER_SCHED1016// Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable1017// Hierarchical scheduling does not work with ordered, so if ordered is1018// detected, then revert back to threaded scheduling.1019bool ordered;1020enum sched_type my_sched = schedule;1021my_buffer_index = th->th.th_dispatch->th_disp_index;1022pr = reinterpret_cast<dispatch_private_info_template<T> *>(1023&th->th.th_dispatch1024->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);1025my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);1026if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))1027my_sched =1028(enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));1029ordered = (kmp_ord_lower & my_sched);1030if (pr->flags.use_hier) {1031if (ordered) {1032KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. 
"1033"Disabling hierarchical scheduling.\n",1034gtid));1035pr->flags.use_hier = FALSE;1036}1037}1038if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {1039// Don't use hierarchical for ordered parallel loops and don't1040// use the runtime hierarchy if one was specified in the program1041if (!ordered && !pr->flags.use_hier)1042__kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);1043}1044#endif // KMP_USE_HIER_SCHED10451046#if USE_ITT_BUILD1047kmp_uint64 cur_chunk = chunk;1048int itt_need_metadata_reporting =1049__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&1050KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&1051team->t.t_active_level == 1;1052#endif1053if (!active) {1054pr = reinterpret_cast<dispatch_private_info_template<T> *>(1055th->th.th_dispatch->th_disp_buffer); /* top of the stack */1056} else {1057KMP_DEBUG_ASSERT(th->th.th_dispatch ==1058&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);10591060my_buffer_index = th->th.th_dispatch->th_disp_index++;10611062/* What happens when number of threads changes, need to resize buffer? */1063pr = reinterpret_cast<dispatch_private_info_template<T> *>(1064&th->th.th_dispatch1065->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);1066sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(1067&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);1068KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,1069my_buffer_index));1070if (sh->buffer_index != my_buffer_index) { // too many loops in progress?1071KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"1072" sh->buffer_index:%d\n",1073gtid, my_buffer_index, sh->buffer_index));1074__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,1075__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));1076// Note: KMP_WAIT() cannot be used there: buffer index and1077// my_buffer_index are *always* 32-bit integers.1078KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "1079"sh->buffer_index:%d\n",1080gtid, my_buffer_index, sh->buffer_index));1081}1082}10831084__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,1085#if USE_ITT_BUILD1086&cur_chunk,1087#endif1088chunk, (T)th->th.th_team_nproc,1089(T)th->th.th_info.ds.ds_tid);1090if (active) {1091if (pr->flags.ordered == 0) {1092th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;1093th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;1094} else {1095th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;1096th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;1097}1098th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;1099th->th.th_dispatch->th_dispatch_sh_current =1100CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);1101#if USE_ITT_BUILD1102if (pr->flags.ordered) {1103__kmp_itt_ordered_init(gtid);1104}1105// Report loop metadata1106if (itt_need_metadata_reporting) {1107// Only report metadata by primary thread of active team at level 11108kmp_uint64 schedtype = 0;1109switch (schedule) {1110case kmp_sch_static_chunked:1111case kmp_sch_static_balanced: // Chunk is calculated in the switch above1112break;1113case kmp_sch_static_greedy:1114cur_chunk = pr->u.p.parm1;1115break;1116case kmp_sch_dynamic_chunked:1117schedtype = 1;1118break;1119case kmp_sch_guided_iterative_chunked:1120case kmp_sch_guided_analytical_chunked:1121case kmp_sch_guided_simd:1122schedtype = 2;1123break;1124default:1125// Should we put this case under "static"?1126// case 
kmp_sch_static_steal:1127schedtype = 3;1128break;1129}1130__kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);1131}1132#if KMP_USE_HIER_SCHED1133if (pr->flags.use_hier) {1134pr->u.p.count = 0;1135pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;1136}1137#endif // KMP_USER_HIER_SCHED1138#endif /* USE_ITT_BUILD */1139}11401141#ifdef KMP_DEBUG1142{1143char *buff;1144// create format specifiers before the debug output1145buff = __kmp_str_format(1146"__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "1147"lb:%%%s ub:%%%s"1148" st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"1149" parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",1150traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,1151traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,1152traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,1153traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);1154KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,1155pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,1156pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,1157pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));1158__kmp_str_free(&buff);1159}1160#endif1161#if OMPT_SUPPORT && OMPT_OPTIONAL1162if (ompt_enabled.ompt_callback_work) {1163ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);1164ompt_task_info_t *task_info = __ompt_get_task_info_object(0);1165ompt_callbacks.ompt_callback(ompt_callback_work)(1166ompt_get_work_schedule(pr->schedule), ompt_scope_begin,1167&(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,1168OMPT_LOAD_RETURN_ADDRESS(gtid));1169}1170#endif1171KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);1172}11731174/* For ordered loops, either __kmp_dispatch_finish() should be called after1175* every iteration, or __kmp_dispatch_finish_chunk() should be called after1176* every chunk of iterations. If the ordered section(s) were not executed1177* for this iteration (or every iteration in this chunk), we need to set the1178* ordered iteration counters so that the next thread can proceed. 
*/1179template <typename UT>1180static void __kmp_dispatch_finish(int gtid, ident_t *loc) {1181typedef typename traits_t<UT>::signed_t ST;1182__kmp_assert_valid_gtid(gtid);1183kmp_info_t *th = __kmp_threads[gtid];11841185KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));1186if (!th->th.th_team->t.t_serialized) {11871188dispatch_private_info_template<UT> *pr =1189reinterpret_cast<dispatch_private_info_template<UT> *>(1190th->th.th_dispatch->th_dispatch_pr_current);1191dispatch_shared_info_template<UT> volatile *sh =1192reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(1193th->th.th_dispatch->th_dispatch_sh_current);1194KMP_DEBUG_ASSERT(pr);1195KMP_DEBUG_ASSERT(sh);1196KMP_DEBUG_ASSERT(th->th.th_dispatch ==1197&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);11981199if (pr->ordered_bumped) {1200KD_TRACE(12011000,1202("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",1203gtid));1204pr->ordered_bumped = 0;1205} else {1206UT lower = pr->u.p.ordered_lower;12071208#ifdef KMP_DEBUG1209{1210char *buff;1211// create format specifiers before the debug output1212buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "1213"ordered_iteration:%%%s lower:%%%s\n",1214traits_t<UT>::spec, traits_t<UT>::spec);1215KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));1216__kmp_str_free(&buff);1217}1218#endif12191220__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,1221__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));1222KMP_MB(); /* is this necessary? */1223#ifdef KMP_DEBUG1224{1225char *buff;1226// create format specifiers before the debug output1227buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "1228"ordered_iteration:%%%s lower:%%%s\n",1229traits_t<UT>::spec, traits_t<UT>::spec);1230KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));1231__kmp_str_free(&buff);1232}1233#endif12341235test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);1236} // if1237} // if1238KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));1239}12401241#ifdef KMP_GOMP_COMPAT12421243template <typename UT>1244static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {1245typedef typename traits_t<UT>::signed_t ST;1246__kmp_assert_valid_gtid(gtid);1247kmp_info_t *th = __kmp_threads[gtid];12481249KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));1250if (!th->th.th_team->t.t_serialized) {1251dispatch_private_info_template<UT> *pr =1252reinterpret_cast<dispatch_private_info_template<UT> *>(1253th->th.th_dispatch->th_dispatch_pr_current);1254dispatch_shared_info_template<UT> volatile *sh =1255reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(1256th->th.th_dispatch->th_dispatch_sh_current);1257KMP_DEBUG_ASSERT(pr);1258KMP_DEBUG_ASSERT(sh);1259KMP_DEBUG_ASSERT(th->th.th_dispatch ==1260&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);12611262UT lower = pr->u.p.ordered_lower;1263UT upper = pr->u.p.ordered_upper;1264UT inc = upper - lower + 1;12651266if (pr->ordered_bumped == inc) {1267KD_TRACE(12681000,1269("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",1270gtid));1271pr->ordered_bumped = 0;1272} else {1273inc -= pr->ordered_bumped;12741275#ifdef KMP_DEBUG1276{1277char *buff;1278// create format specifiers before the debug output1279buff = __kmp_str_format(1280"__kmp_dispatch_finish_chunk: T#%%d before wait: "1281"ordered_iteration:%%%s lower:%%%s upper:%%%s\n",1282traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);1283KD_TRACE(1000, (buff, gtid, 
sh->u.s.ordered_iteration, lower, upper));1284__kmp_str_free(&buff);1285}1286#endif12871288__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,1289__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));12901291KMP_MB(); /* is this necessary? */1292KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "1293"ordered_bumped to zero\n",1294gtid));1295pr->ordered_bumped = 0;1296//!!!!! TODO check if the inc should be unsigned, or signed???1297#ifdef KMP_DEBUG1298{1299char *buff;1300// create format specifiers before the debug output1301buff = __kmp_str_format(1302"__kmp_dispatch_finish_chunk: T#%%d after wait: "1303"ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",1304traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,1305traits_t<UT>::spec);1306KD_TRACE(1000,1307(buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));1308__kmp_str_free(&buff);1309}1310#endif13111312test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);1313}1314// }1315}1316KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));1317}13181319#endif /* KMP_GOMP_COMPAT */13201321template <typename T>1322int __kmp_dispatch_next_algorithm(int gtid,1323dispatch_private_info_template<T> *pr,1324dispatch_shared_info_template<T> volatile *sh,1325kmp_int32 *p_last, T *p_lb, T *p_ub,1326typename traits_t<T>::signed_t *p_st, T nproc,1327T tid) {1328typedef typename traits_t<T>::unsigned_t UT;1329typedef typename traits_t<T>::signed_t ST;1330typedef typename traits_t<T>::floating_t DBL;1331int status = 0;1332bool last = false;1333T start;1334ST incr;1335UT limit, trip, init;1336kmp_info_t *th = __kmp_threads[gtid];1337kmp_team_t *team = th->th.th_team;13381339KMP_DEBUG_ASSERT(th->th.th_dispatch ==1340&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);1341KMP_DEBUG_ASSERT(pr);1342KMP_DEBUG_ASSERT(sh);1343KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);1344#ifdef KMP_DEBUG1345{1346char *buff;1347// create format specifiers before the debug output1348buff =1349__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "1350"sh:%%p nproc:%%%s tid:%%%s\n",1351traits_t<T>::spec, traits_t<T>::spec);1352KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));1353__kmp_str_free(&buff);1354}1355#endif13561357// zero trip count1358if (pr->u.p.tc == 0) {1359KD_TRACE(10,1360("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "1361"zero status:%d\n",1362gtid, status));1363return 0;1364}13651366switch (pr->schedule) {1367#if KMP_STATIC_STEAL_ENABLED1368case kmp_sch_static_steal: {1369T chunk = pr->u.p.parm1;1370UT nchunks = pr->u.p.parm2;1371KD_TRACE(100,1372("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",1373gtid));13741375trip = pr->u.p.tc - 1;13761377if (traits_t<T>::type_size > 4) {1378// use lock for 8-byte induction variable.1379// TODO (optional): check presence and use 16-byte CAS1380kmp_lock_t *lck = pr->u.p.steal_lock;1381KMP_DEBUG_ASSERT(lck != NULL);1382if (pr->u.p.count < (UT)pr->u.p.ub) {1383KMP_DEBUG_ASSERT(pr->steal_flag == READY);1384__kmp_acquire_lock(lck, gtid);1385// try to get own chunk of iterations1386init = (pr->u.p.count)++;1387status = (init < (UT)pr->u.p.ub);1388__kmp_release_lock(lck, gtid);1389} else {1390status = 0; // no own chunks1391}1392if (!status) { // try to steal1393kmp_lock_t *lckv; // victim buffer's lock1394T while_limit = pr->u.p.parm3;1395T while_index = 0;1396int idx = (th->th.th_dispatch->th_disp_index - 1) %1397__kmp_dispatch_num_buffers; // current loop index1398// note: victim thread can potentially execute another 
loop1399KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive1400while ((!status) && (while_limit != ++while_index)) {1401dispatch_private_info_template<T> *v;1402T remaining;1403T victimId = pr->u.p.parm4;1404T oldVictimId = victimId ? victimId - 1 : nproc - 1;1405v = reinterpret_cast<dispatch_private_info_template<T> *>(1406&team->t.t_dispatch[victimId].th_disp_buffer[idx]);1407KMP_DEBUG_ASSERT(v);1408while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&1409oldVictimId != victimId) {1410victimId = (victimId + 1) % nproc;1411v = reinterpret_cast<dispatch_private_info_template<T> *>(1412&team->t.t_dispatch[victimId].th_disp_buffer[idx]);1413KMP_DEBUG_ASSERT(v);1414}1415if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {1416continue; // try once more (nproc attempts in total)1417}1418if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {1419kmp_uint32 old = UNUSED;1420// try to steal whole range from inactive victim1421status = v->steal_flag.compare_exchange_strong(old, THIEF);1422if (status) {1423// initialize self buffer with victim's whole range of chunks1424T id = victimId;1425T small_chunk = 0, extras = 0, p_extra = 0;1426__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,1427init, small_chunk, extras,1428p_extra);1429__kmp_acquire_lock(lck, gtid);1430pr->u.p.count = init + 1; // exclude one we execute immediately1431pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);1432__kmp_release_lock(lck, gtid);1433pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid1434// no need to reinitialize other thread invariants: lb, st, etc.1435#ifdef KMP_DEBUG1436{1437char *buff;1438// create format specifiers before the debug output1439buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "1440"stolen chunks from T#%%d, "1441"count:%%%s ub:%%%s\n",1442traits_t<UT>::spec, traits_t<T>::spec);1443KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));1444__kmp_str_free(&buff);1445}1446#endif1447// activate non-empty buffer and let others steal from us1448if (pr->u.p.count < (UT)pr->u.p.ub)1449KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);1450break;1451}1452}1453if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||1454v->u.p.count >= (UT)v->u.p.ub) {1455pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid1456continue; // no chunks to steal, try next victim1457}1458lckv = v->u.p.steal_lock;1459KMP_ASSERT(lckv != NULL);1460__kmp_acquire_lock(lckv, gtid);1461limit = v->u.p.ub; // keep initial ub1462if (v->u.p.count >= limit) {1463__kmp_release_lock(lckv, gtid);1464pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid1465continue; // no chunks to steal, try next victim1466}14671468// stealing succeded, reduce victim's ub by 1/4 of undone chunks1469// TODO: is this heuristics good enough??1470remaining = limit - v->u.p.count;1471if (remaining > 7) {1472// steal 1/4 of remaining1473KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);1474init = (v->u.p.ub -= (remaining >> 2));1475} else {1476// steal 1 chunk of 1..7 remaining1477KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);1478init = (v->u.p.ub -= 1);1479}1480__kmp_release_lock(lckv, gtid);1481#ifdef KMP_DEBUG1482{1483char *buff;1484// create format specifiers before the debug output1485buff = __kmp_str_format(1486"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "1487"count:%%%s ub:%%%s\n",1488traits_t<UT>::spec, traits_t<UT>::spec);1489KD_TRACE(10, (buff, gtid, victimId, init, 
limit));1490__kmp_str_free(&buff);1491}1492#endif1493KMP_DEBUG_ASSERT(init + 1 <= limit);1494pr->u.p.parm4 = victimId; // remember victim to steal from1495status = 1;1496// now update own count and ub with stolen range excluding init chunk1497__kmp_acquire_lock(lck, gtid);1498pr->u.p.count = init + 1;1499pr->u.p.ub = limit;1500__kmp_release_lock(lck, gtid);1501// activate non-empty buffer and let others steal from us1502if (init + 1 < limit)1503KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);1504} // while (search for victim)1505} // if (try to find victim and steal)1506} else {1507// 4-byte induction variable, use 8-byte CAS for pair (count, ub)1508// as all operations on pair (count, ub) must be done atomically1509typedef union {1510struct {1511UT count;1512T ub;1513} p;1514kmp_int64 b;1515} union_i4;1516union_i4 vold, vnew;1517if (pr->u.p.count < (UT)pr->u.p.ub) {1518KMP_DEBUG_ASSERT(pr->steal_flag == READY);1519vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);1520vnew.b = vold.b;1521vnew.p.count++; // get chunk from head of self range1522while (!KMP_COMPARE_AND_STORE_REL64(1523(volatile kmp_int64 *)&pr->u.p.count,1524*VOLATILE_CAST(kmp_int64 *) & vold.b,1525*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {1526KMP_CPU_PAUSE();1527vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);1528vnew.b = vold.b;1529vnew.p.count++;1530}1531init = vold.p.count;1532status = (init < (UT)vold.p.ub);1533} else {1534status = 0; // no own chunks1535}1536if (!status) { // try to steal1537T while_limit = pr->u.p.parm3;1538T while_index = 0;1539int idx = (th->th.th_dispatch->th_disp_index - 1) %1540__kmp_dispatch_num_buffers; // current loop index1541// note: victim thread can potentially execute another loop1542KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive1543while ((!status) && (while_limit != ++while_index)) {1544dispatch_private_info_template<T> *v;1545T remaining;1546T victimId = pr->u.p.parm4;1547T oldVictimId = victimId ? victimId - 1 : nproc - 1;1548v = reinterpret_cast<dispatch_private_info_template<T> *>(1549&team->t.t_dispatch[victimId].th_disp_buffer[idx]);1550KMP_DEBUG_ASSERT(v);1551while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&1552oldVictimId != victimId) {1553victimId = (victimId + 1) % nproc;1554v = reinterpret_cast<dispatch_private_info_template<T> *>(1555&team->t.t_dispatch[victimId].th_disp_buffer[idx]);1556KMP_DEBUG_ASSERT(v);1557}1558if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {1559continue; // try once more (nproc attempts in total)1560}1561if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {1562kmp_uint32 old = UNUSED;1563// try to steal whole range from inactive victim1564status = v->steal_flag.compare_exchange_strong(old, THIEF);1565if (status) {1566// initialize self buffer with victim's whole range of chunks1567T id = victimId;1568T small_chunk = 0, extras = 0, p_extra = 0;1569__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,1570init, small_chunk, extras,1571p_extra);1572vnew.p.count = init + 1;1573vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 
1 : 0);1574// write pair (count, ub) at once atomically1575#if KMP_ARCH_X861576KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);1577#else1578*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;1579#endif1580pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid1581// no need to initialize other thread invariants: lb, st, etc.1582#ifdef KMP_DEBUG1583{1584char *buff;1585// create format specifiers before the debug output1586buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "1587"stolen chunks from T#%%d, "1588"count:%%%s ub:%%%s\n",1589traits_t<UT>::spec, traits_t<T>::spec);1590KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));1591__kmp_str_free(&buff);1592}1593#endif1594// activate non-empty buffer and let others steal from us1595if (pr->u.p.count < (UT)pr->u.p.ub)1596KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);1597break;1598}1599}1600while (1) { // CAS loop with check if victim still has enough chunks1601// many threads may be stealing concurrently from same victim1602vold.b = *(volatile kmp_int64 *)(&v->u.p.count);1603if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||1604vold.p.count >= (UT)vold.p.ub) {1605pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id1606break; // no chunks to steal, try next victim1607}1608vnew.b = vold.b;1609remaining = vold.p.ub - vold.p.count;1610// try to steal 1/4 of remaining1611// TODO: is this heuristics good enough??1612if (remaining > 7) {1613vnew.p.ub -= remaining >> 2; // steal from tail of victim's range1614} else {1615vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining1616}1617KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);1618if (KMP_COMPARE_AND_STORE_REL64(1619(volatile kmp_int64 *)&v->u.p.count,1620*VOLATILE_CAST(kmp_int64 *) & vold.b,1621*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {1622// stealing succedded1623#ifdef KMP_DEBUG1624{1625char *buff;1626// create format specifiers before the debug output1627buff = __kmp_str_format(1628"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "1629"count:%%%s ub:%%%s\n",1630traits_t<T>::spec, traits_t<T>::spec);1631KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));1632__kmp_str_free(&buff);1633}1634#endif1635KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,1636vold.p.ub - vnew.p.ub);1637status = 1;1638pr->u.p.parm4 = victimId; // keep victim id1639// now update own count and ub1640init = vnew.p.ub;1641vold.p.count = init + 1;1642#if KMP_ARCH_X861643KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);1644#else1645*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;1646#endif1647// activate non-empty buffer and let others steal from us1648if (vold.p.count < (UT)vold.p.ub)1649KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);1650break;1651} // if (check CAS result)1652KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt1653} // while (try to steal from particular victim)1654} // while (search for victim)1655} // if (try to find victim and steal)1656} // if (4-byte induction variable)1657if (!status) {1658*p_lb = 0;1659*p_ub = 0;1660if (p_st != NULL)1661*p_st = 0;1662} else {1663start = pr->u.p.lb;1664init *= chunk;1665limit = chunk + init - 1;1666incr = pr->u.p.st;1667KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);16681669KMP_DEBUG_ASSERT(init <= trip);1670// keep track of done chunks for possible early exit from stealing1671// TODO: count executed chunks locally with rare update of shared location1672// test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);1673if ((last = (limit >= trip)) != 0)1674limit = trip;1675if (p_st != NULL)1676*p_st = 
incr;16771678if (incr == 1) {1679*p_lb = start + init;1680*p_ub = start + limit;1681} else {1682*p_lb = start + init * incr;1683*p_ub = start + limit * incr;1684}1685} // if1686break;1687} // case1688#endif // KMP_STATIC_STEAL_ENABLED1689case kmp_sch_static_balanced: {1690KD_TRACE(169110,1692("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",1693gtid));1694/* check if thread has any iteration to do */1695if ((status = !pr->u.p.count) != 0) {1696pr->u.p.count = 1;1697*p_lb = pr->u.p.lb;1698*p_ub = pr->u.p.ub;1699last = (pr->u.p.parm1 != 0);1700if (p_st != NULL)1701*p_st = pr->u.p.st;1702} else { /* no iterations to do */1703pr->u.p.lb = pr->u.p.ub + pr->u.p.st;1704}1705} // case1706break;1707case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was1708merged here */1709case kmp_sch_static_chunked: {1710T parm1;17111712KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "1713"kmp_sch_static_[affinity|chunked] case\n",1714gtid));1715parm1 = pr->u.p.parm1;17161717trip = pr->u.p.tc - 1;1718init = parm1 * (pr->u.p.count + tid);17191720if ((status = (init <= trip)) != 0) {1721start = pr->u.p.lb;1722incr = pr->u.p.st;1723limit = parm1 + init - 1;17241725if ((last = (limit >= trip)) != 0)1726limit = trip;17271728if (p_st != NULL)1729*p_st = incr;17301731pr->u.p.count += nproc;17321733if (incr == 1) {1734*p_lb = start + init;1735*p_ub = start + limit;1736} else {1737*p_lb = start + init * incr;1738*p_ub = start + limit * incr;1739}17401741if (pr->flags.ordered) {1742pr->u.p.ordered_lower = init;1743pr->u.p.ordered_upper = limit;1744} // if1745} // if1746} // case1747break;17481749case kmp_sch_dynamic_chunked: {1750UT chunk_number;1751UT chunk_size = pr->u.p.parm1;1752UT nchunks = pr->u.p.parm2;17531754KD_TRACE(1755100,1756("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",1757gtid));17581759chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);1760status = (chunk_number < nchunks);1761if (!status) {1762*p_lb = 0;1763*p_ub = 0;1764if (p_st != NULL)1765*p_st = 0;1766} else {1767init = chunk_size * chunk_number;1768trip = pr->u.p.tc - 1;1769start = pr->u.p.lb;1770incr = pr->u.p.st;17711772if ((last = (trip - init < (UT)chunk_size)))1773limit = trip;1774else1775limit = chunk_size + init - 1;17761777if (p_st != NULL)1778*p_st = incr;17791780if (incr == 1) {1781*p_lb = start + init;1782*p_ub = start + limit;1783} else {1784*p_lb = start + init * incr;1785*p_ub = start + limit * incr;1786}17871788if (pr->flags.ordered) {1789pr->u.p.ordered_lower = init;1790pr->u.p.ordered_upper = limit;1791} // if1792} // if1793} // case1794break;17951796case kmp_sch_guided_iterative_chunked: {1797T chunkspec = pr->u.p.parm1;1798KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "1799"iterative case\n",1800gtid));1801trip = pr->u.p.tc;1802// Start atomic part of calculations1803while (1) {1804ST remaining; // signed, because can be < 01805init = sh->u.s.iteration; // shared value1806remaining = trip - init;1807if (remaining <= 0) { // AC: need to compare with 0 first1808// nothing to do, don't try atomic op1809status = 0;1810break;1811}1812if ((T)remaining <1813pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default1814// use dynamic-style schedule1815// atomically increment iterations, get old value1816init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),1817(ST)chunkspec);1818remaining = trip - init;1819if (remaining <= 0) {1820status = 0; // all iterations got by other threads1821} else {1822// got 
some iterations to work on1823status = 1;1824if ((T)remaining > chunkspec) {1825limit = init + chunkspec - 1;1826} else {1827last = true; // the last chunk1828limit = init + remaining - 1;1829} // if1830} // if1831break;1832} // if1833limit = init + (UT)((double)remaining *1834*(double *)&pr->u.p.parm3); // divide by K*nproc1835if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),1836(ST)init, (ST)limit)) {1837// CAS was successful, chunk obtained1838status = 1;1839--limit;1840break;1841} // if1842} // while1843if (status != 0) {1844start = pr->u.p.lb;1845incr = pr->u.p.st;1846if (p_st != NULL)1847*p_st = incr;1848*p_lb = start + init * incr;1849*p_ub = start + limit * incr;1850if (pr->flags.ordered) {1851pr->u.p.ordered_lower = init;1852pr->u.p.ordered_upper = limit;1853} // if1854} else {1855*p_lb = 0;1856*p_ub = 0;1857if (p_st != NULL)1858*p_st = 0;1859} // if1860} // case1861break;18621863case kmp_sch_guided_simd: {1864// same as iterative but curr-chunk adjusted to be multiple of given1865// chunk1866T chunk = pr->u.p.parm1;1867KD_TRACE(100,1868("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",1869gtid));1870trip = pr->u.p.tc;1871// Start atomic part of calculations1872while (1) {1873ST remaining; // signed, because can be < 01874init = sh->u.s.iteration; // shared value1875remaining = trip - init;1876if (remaining <= 0) { // AC: need to compare with 0 first1877status = 0; // nothing to do, don't try atomic op1878break;1879}1880KMP_DEBUG_ASSERT(chunk && init % chunk == 0);1881// compare with K*nproc*(chunk+1), K=2 by default1882if ((T)remaining < pr->u.p.parm2) {1883// use dynamic-style schedule1884// atomically increment iterations, get old value1885init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),1886(ST)chunk);1887remaining = trip - init;1888if (remaining <= 0) {1889status = 0; // all iterations got by other threads1890} else {1891// got some iterations to work on1892status = 1;1893if ((T)remaining > chunk) {1894limit = init + chunk - 1;1895} else {1896last = true; // the last chunk1897limit = init + remaining - 1;1898} // if1899} // if1900break;1901} // if1902// divide by K*nproc1903UT span;1904__kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),1905&span);1906UT rem = span % chunk;1907if (rem) // adjust so that span%chunk == 01908span += chunk - rem;1909limit = init + span;1910if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),1911(ST)init, (ST)limit)) {1912// CAS was successful, chunk obtained1913status = 1;1914--limit;1915break;1916} // if1917} // while1918if (status != 0) {1919start = pr->u.p.lb;1920incr = pr->u.p.st;1921if (p_st != NULL)1922*p_st = incr;1923*p_lb = start + init * incr;1924*p_ub = start + limit * incr;1925if (pr->flags.ordered) {1926pr->u.p.ordered_lower = init;1927pr->u.p.ordered_upper = limit;1928} // if1929} else {1930*p_lb = 0;1931*p_ub = 0;1932if (p_st != NULL)1933*p_st = 0;1934} // if1935} // case1936break;19371938case kmp_sch_guided_analytical_chunked: {1939T chunkspec = pr->u.p.parm1;1940UT chunkIdx;1941#if KMP_USE_X87CONTROL1942/* for storing original FPCW value for Windows* OS on1943IA-32 architecture 8-byte version */1944unsigned int oldFpcw;1945unsigned int fpcwSet = 0;1946#endif1947KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "1948"kmp_sch_guided_analytical_chunked case\n",1949gtid));19501951trip = pr->u.p.tc;19521953KMP_DEBUG_ASSERT(nproc > 1);1954KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);19551956while (1) { /* this while loop is a safeguard 
against unexpected zero1957chunk sizes */1958chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);1959if (chunkIdx >= (UT)pr->u.p.parm2) {1960--trip;1961/* use dynamic-style scheduling */1962init = chunkIdx * chunkspec + pr->u.p.count;1963/* need to verify init > 0 in case of overflow in the above1964* calculation */1965if ((status = (init > 0 && init <= trip)) != 0) {1966limit = init + chunkspec - 1;19671968if ((last = (limit >= trip)) != 0)1969limit = trip;1970}1971break;1972} else {1973/* use exponential-style scheduling */1974/* The following check is to workaround the lack of long double precision on1975Windows* OS.1976This check works around the possible effect that init != 0 for chunkIdx == 0.1977*/1978#if KMP_USE_X87CONTROL1979/* If we haven't already done so, save original1980FPCW and set precision to 64-bit, as Windows* OS1981on IA-32 architecture defaults to 53-bit */1982if (!fpcwSet) {1983oldFpcw = _control87(0, 0);1984_control87(_PC_64, _MCW_PC);1985fpcwSet = 0x30000;1986}1987#endif1988if (chunkIdx) {1989init = __kmp_dispatch_guided_remaining<T>(1990trip, *(DBL *)&pr->u.p.parm3, chunkIdx);1991KMP_DEBUG_ASSERT(init);1992init = trip - init;1993} else1994init = 0;1995limit = trip - __kmp_dispatch_guided_remaining<T>(1996trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);1997KMP_ASSERT(init <= limit);1998if (init < limit) {1999KMP_DEBUG_ASSERT(limit <= trip);2000--limit;2001status = 1;2002break;2003} // if2004} // if2005} // while (1)2006#if KMP_USE_X87CONTROL2007/* restore FPCW if necessary2008AC: check fpcwSet flag first because oldFpcw can be uninitialized here2009*/2010if (fpcwSet && (oldFpcw & fpcwSet))2011_control87(oldFpcw, _MCW_PC);2012#endif2013if (status != 0) {2014start = pr->u.p.lb;2015incr = pr->u.p.st;2016if (p_st != NULL)2017*p_st = incr;2018*p_lb = start + init * incr;2019*p_ub = start + limit * incr;2020if (pr->flags.ordered) {2021pr->u.p.ordered_lower = init;2022pr->u.p.ordered_upper = limit;2023}2024} else {2025*p_lb = 0;2026*p_ub = 0;2027if (p_st != NULL)2028*p_st = 0;2029}2030} // case2031break;20322033case kmp_sch_trapezoidal: {2034UT index;2035T parm2 = pr->u.p.parm2;2036T parm3 = pr->u.p.parm3;2037T parm4 = pr->u.p.parm4;2038KD_TRACE(100,2039("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",2040gtid));20412042index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);20432044init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;2045trip = pr->u.p.tc - 1;20462047if ((status = ((T)index < parm3 && init <= trip)) == 0) {2048*p_lb = 0;2049*p_ub = 0;2050if (p_st != NULL)2051*p_st = 0;2052} else {2053start = pr->u.p.lb;2054limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;2055incr = pr->u.p.st;20562057if ((last = (limit >= trip)) != 0)2058limit = trip;20592060if (p_st != NULL)2061*p_st = incr;20622063if (incr == 1) {2064*p_lb = start + init;2065*p_ub = start + limit;2066} else {2067*p_lb = start + init * incr;2068*p_ub = start + limit * incr;2069}20702071if (pr->flags.ordered) {2072pr->u.p.ordered_lower = init;2073pr->u.p.ordered_upper = limit;2074} // if2075} // if2076} // case2077break;2078default: {2079status = 0; // to avoid complaints on uninitialized variable use2080__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message2081KMP_HNT(GetNewerLibrary), // Hint2082__kmp_msg_null // Variadic argument list terminator2083);2084} break;2085} // switch2086if (p_last)2087*p_last = last;2088#ifdef KMP_DEBUG2089if (pr->flags.ordered) {2090char *buff;2091// create format specifiers before the debug output2092buff = 
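// Illustrative sketch (not part of the runtime): the closed-form chunk bounds
// used by the kmp_sch_trapezoidal case above. Chunk sizes form a decreasing
// arithmetic sequence, so the first iteration of chunk idx is a prefix sum.
// 'first_chunk' and 'shrink' are hypothetical names for the roles parm2 and
// parm4 appear to play in the code above.
//
//   #include <cstdint>
//   static void trapezoid_chunk(uint64_t idx, uint64_t first_chunk,
//                               uint64_t shrink, uint64_t *init,
//                               uint64_t *limit) {
//     // Prefix sums of first_chunk, first_chunk - shrink, first_chunk - 2*shrink, ...
//     // (for idx == 0 the whole first product is multiplied by zero, so *init == 0,
//     // matching the unsigned arithmetic in the case above)
//     *init = (idx * (2 * first_chunk - (idx - 1) * shrink)) / 2;
//     *limit = ((idx + 1) * (2 * first_chunk - idx * shrink)) / 2 - 1;
//   }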
__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "2093"ordered_lower:%%%s ordered_upper:%%%s\n",2094traits_t<UT>::spec, traits_t<UT>::spec);2095KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));2096__kmp_str_free(&buff);2097}2098{2099char *buff;2100// create format specifiers before the debug output2101buff = __kmp_str_format(2102"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "2103"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",2104traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);2105KMP_DEBUG_ASSERT(p_last);2106KMP_DEBUG_ASSERT(p_st);2107KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));2108__kmp_str_free(&buff);2109}2110#endif2111return status;2112}21132114/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more2115work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()2116is not called. */2117#if OMPT_SUPPORT && OMPT_OPTIONAL2118#define OMPT_LOOP_END \2119if (status == 0) { \2120if (ompt_enabled.ompt_callback_work) { \2121ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \2122ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \2123ompt_callbacks.ompt_callback(ompt_callback_work)( \2124ompt_get_work_schedule(pr->schedule), ompt_scope_end, \2125&(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \2126} \2127}2128#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \2129if (ompt_enabled.ompt_callback_dispatch && status) { \2130ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \2131ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \2132ompt_dispatch_chunk_t chunk; \2133ompt_data_t instance = ompt_data_none; \2134OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \2135instance.ptr = &chunk; \2136ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \2137&(team_info->parallel_data), &(task_info->task_data), \2138ompt_dispatch_ws_loop_chunk, instance); \2139}2140// TODO: implement count2141#else2142#define OMPT_LOOP_END // no-op2143#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op2144#endif21452146#if KMP_STATS_ENABLED2147#define KMP_STATS_LOOP_END \2148{ \2149kmp_int64 u, l, t, i; \2150l = (kmp_int64)(*p_lb); \2151u = (kmp_int64)(*p_ub); \2152i = (kmp_int64)(pr->u.p.st); \2153if (status == 0) { \2154t = 0; \2155KMP_POP_PARTITIONED_TIMER(); \2156} else if (i == 1) { \2157if (u >= l) \2158t = u - l + 1; \2159else \2160t = 0; \2161} else if (i < 0) { \2162if (l >= u) \2163t = (l - u) / (-i) + 1; \2164else \2165t = 0; \2166} else { \2167if (u >= l) \2168t = (u - l) / i + 1; \2169else \2170t = 0; \2171} \2172KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \2173}2174#else2175#define KMP_STATS_LOOP_END /* Nothing */2176#endif21772178template <typename T>2179static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,2180T *p_lb, T *p_ub,2181typename traits_t<T>::signed_t *p_st2182#if OMPT_SUPPORT && OMPT_OPTIONAL2183,2184void *codeptr2185#endif2186) {21872188typedef typename traits_t<T>::unsigned_t UT;2189typedef typename traits_t<T>::signed_t ST;2190// This is potentially slightly misleading, schedule(runtime) will appear here2191// even if the actual runtime schedule is static. 
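// Illustrative sketch (not part of the runtime): the per-chunk iteration count
// computed by KMP_STATS_LOOP_END above, written as a plain function. The
// function name and parameter names are hypothetical.
//
//   #include <cstdint>
//   static int64_t chunk_iterations(int64_t l, int64_t u, int64_t i) {
//     if (i == 1)
//       return u >= l ? u - l + 1 : 0;          // unit stride
//     if (i < 0)
//       return l >= u ? (l - u) / (-i) + 1 : 0; // negative stride
//     return u >= l ? (u - l) / i + 1 : 0;      // positive stride
//   }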
(Which points out a2192// disadvantage of schedule(runtime): even when static scheduling is used it2193// costs more than a compile time choice to use static scheduling would.)2194KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);21952196int status;2197dispatch_private_info_template<T> *pr;2198__kmp_assert_valid_gtid(gtid);2199kmp_info_t *th = __kmp_threads[gtid];2200kmp_team_t *team = th->th.th_team;22012202KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL2203KD_TRACE(22041000,2205("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",2206gtid, p_lb, p_ub, p_st, p_last));22072208if (team->t.t_serialized) {2209/* NOTE: serialize this dispatch because we are not at the active level */2210pr = reinterpret_cast<dispatch_private_info_template<T> *>(2211th->th.th_dispatch->th_disp_buffer); /* top of the stack */2212KMP_DEBUG_ASSERT(pr);22132214if ((status = (pr->u.p.tc != 0)) == 0) {2215*p_lb = 0;2216*p_ub = 0;2217// if ( p_last != NULL )2218// *p_last = 0;2219if (p_st != NULL)2220*p_st = 0;2221if (__kmp_env_consistency_check) {2222if (pr->pushed_ws != ct_none) {2223pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);2224}2225}2226} else if (pr->flags.nomerge) {2227kmp_int32 last;2228T start;2229UT limit, trip, init;2230ST incr;2231T chunk = pr->u.p.parm1;22322233KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",2234gtid));22352236init = chunk * pr->u.p.count++;2237trip = pr->u.p.tc - 1;22382239if ((status = (init <= trip)) == 0) {2240*p_lb = 0;2241*p_ub = 0;2242// if ( p_last != NULL )2243// *p_last = 0;2244if (p_st != NULL)2245*p_st = 0;2246if (__kmp_env_consistency_check) {2247if (pr->pushed_ws != ct_none) {2248pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);2249}2250}2251} else {2252start = pr->u.p.lb;2253limit = chunk + init - 1;2254incr = pr->u.p.st;22552256if ((last = (limit >= trip)) != 0) {2257limit = trip;2258#if KMP_OS_WINDOWS2259pr->u.p.last_upper = pr->u.p.ub;2260#endif /* KMP_OS_WINDOWS */2261}2262if (p_last != NULL)2263*p_last = last;2264if (p_st != NULL)2265*p_st = incr;2266if (incr == 1) {2267*p_lb = start + init;2268*p_ub = start + limit;2269} else {2270*p_lb = start + init * incr;2271*p_ub = start + limit * incr;2272}22732274if (pr->flags.ordered) {2275pr->u.p.ordered_lower = init;2276pr->u.p.ordered_upper = limit;2277#ifdef KMP_DEBUG2278{2279char *buff;2280// create format specifiers before the debug output2281buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "2282"ordered_lower:%%%s ordered_upper:%%%s\n",2283traits_t<UT>::spec, traits_t<UT>::spec);2284KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,2285pr->u.p.ordered_upper));2286__kmp_str_free(&buff);2287}2288#endif2289} // if2290} // if2291} else {2292pr->u.p.tc = 0;2293*p_lb = pr->u.p.lb;2294*p_ub = pr->u.p.ub;2295#if KMP_OS_WINDOWS2296pr->u.p.last_upper = *p_ub;2297#endif /* KMP_OS_WINDOWS */2298if (p_last != NULL)2299*p_last = TRUE;2300if (p_st != NULL)2301*p_st = pr->u.p.st;2302} // if2303#ifdef KMP_DEBUG2304{2305char *buff;2306// create format specifiers before the debug output2307buff = __kmp_str_format(2308"__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "2309"p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",2310traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);2311KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,2312(p_last ? 
*p_last : 0), status));2313__kmp_str_free(&buff);2314}2315#endif2316#if INCLUDE_SSC_MARKS2317SSC_MARK_DISPATCH_NEXT();2318#endif2319OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);2320OMPT_LOOP_END;2321KMP_STATS_LOOP_END;2322return status;2323} else {2324kmp_int32 last = 0;2325dispatch_shared_info_template<T> volatile *sh;23262327KMP_DEBUG_ASSERT(th->th.th_dispatch ==2328&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);23292330pr = reinterpret_cast<dispatch_private_info_template<T> *>(2331th->th.th_dispatch->th_dispatch_pr_current);2332KMP_DEBUG_ASSERT(pr);2333sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(2334th->th.th_dispatch->th_dispatch_sh_current);2335KMP_DEBUG_ASSERT(sh);23362337#if KMP_USE_HIER_SCHED2338if (pr->flags.use_hier)2339status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);2340else2341#endif // KMP_USE_HIER_SCHED2342status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,2343p_st, th->th.th_team_nproc,2344th->th.th_info.ds.ds_tid);2345// status == 0: no more iterations to execute2346if (status == 0) {2347ST num_done;2348num_done = test_then_inc<ST>(&sh->u.s.num_done);2349#ifdef KMP_DEBUG2350{2351char *buff;2352// create format specifiers before the debug output2353buff = __kmp_str_format(2354"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",2355traits_t<ST>::spec);2356KD_TRACE(10, (buff, gtid, sh->u.s.num_done));2357__kmp_str_free(&buff);2358}2359#endif23602361#if KMP_USE_HIER_SCHED2362pr->flags.use_hier = FALSE;2363#endif2364if (num_done == th->th.th_team_nproc - 1) {2365#if KMP_STATIC_STEAL_ENABLED2366if (pr->schedule == kmp_sch_static_steal) {2367int i;2368int idx = (th->th.th_dispatch->th_disp_index - 1) %2369__kmp_dispatch_num_buffers; // current loop index2370// loop complete, safe to destroy locks used for stealing2371for (i = 0; i < th->th.th_team_nproc; ++i) {2372dispatch_private_info_template<T> *buf =2373reinterpret_cast<dispatch_private_info_template<T> *>(2374&team->t.t_dispatch[i].th_disp_buffer[idx]);2375KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive2376KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);2377if (traits_t<T>::type_size > 4) {2378// destroy locks used for stealing2379kmp_lock_t *lck = buf->u.p.steal_lock;2380KMP_ASSERT(lck != NULL);2381__kmp_destroy_lock(lck);2382__kmp_free(lck);2383buf->u.p.steal_lock = NULL;2384}2385}2386}2387#endif2388/* NOTE: release shared buffer to be reused */23892390KMP_MB(); /* Flush all pending memory write invalidates. */23912392sh->u.s.num_done = 0;2393sh->u.s.iteration = 0;23942395/* TODO replace with general release procedure? */2396if (pr->flags.ordered) {2397sh->u.s.ordered_iteration = 0;2398}23992400KMP_MB(); /* Flush all pending memory write invalidates. */24012402sh->buffer_index += __kmp_dispatch_num_buffers;2403KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",2404gtid, sh->buffer_index));24052406KMP_MB(); /* Flush all pending memory write invalidates. 
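// Illustrative sketch (not part of the runtime): the "last thread releases the
// shared buffer" pattern used above, with std::atomic standing in for
// test_then_inc and the dispatch_shared_info fields; all names are hypothetical.
//
//   #include <atomic>
//   struct shared_loop {                 // stand-in for the shared loop state
//     std::atomic<int> num_done{0};
//     std::atomic<unsigned> buffer_index{0};
//   };
//   static void finish_loop(shared_loop &sh, int nproc, unsigned num_buffers) {
//     // fetch_add returns the old value, so exactly one thread sees nproc - 1
//     if (sh.num_done.fetch_add(1) == nproc - 1) {
//       sh.num_done.store(0);                   // reset for the next loop
//       sh.buffer_index.fetch_add(num_buffers); // publish the slot for reuse
//     }
//   }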
*/24072408} // if2409if (__kmp_env_consistency_check) {2410if (pr->pushed_ws != ct_none) {2411pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);2412}2413}24142415th->th.th_dispatch->th_deo_fcn = NULL;2416th->th.th_dispatch->th_dxo_fcn = NULL;2417th->th.th_dispatch->th_dispatch_sh_current = NULL;2418th->th.th_dispatch->th_dispatch_pr_current = NULL;2419} // if (status == 0)2420#if KMP_OS_WINDOWS2421else if (last) {2422pr->u.p.last_upper = pr->u.p.ub;2423}2424#endif /* KMP_OS_WINDOWS */2425if (p_last != NULL && status != 0)2426*p_last = last;2427} // if24282429#ifdef KMP_DEBUG2430{2431char *buff;2432// create format specifiers before the debug output2433buff = __kmp_str_format(2434"__kmp_dispatch_next: T#%%d normal case: "2435"p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",2436traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);2437KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,2438(p_last ? *p_last : 0), status));2439__kmp_str_free(&buff);2440}2441#endif2442#if INCLUDE_SSC_MARKS2443SSC_MARK_DISPATCH_NEXT();2444#endif2445OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);2446OMPT_LOOP_END;2447KMP_STATS_LOOP_END;2448return status;2449}24502451/*!2452@ingroup WORK_SHARING2453@param loc source location information2454@param global_tid global thread number2455@return Zero if the parallel region is not active and this thread should execute2456all sections, non-zero otherwise.24572458Beginning of sections construct.2459There are no implicit barriers in the "sections" calls, rather the compiler2460should introduce an explicit barrier if it is required.24612462This implementation is based on __kmp_dispatch_init, using same constructs for2463shared data (we can't have sections nested directly in omp for loop, there2464should be a parallel region in between)2465*/2466kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {24672468int active;2469kmp_info_t *th;2470kmp_team_t *team;2471kmp_uint32 my_buffer_index;2472dispatch_shared_info_template<kmp_int32> volatile *sh;24732474KMP_DEBUG_ASSERT(__kmp_init_serial);24752476if (!TCR_4(__kmp_init_parallel))2477__kmp_parallel_initialize();2478__kmp_resume_if_soft_paused();24792480/* setup data */2481th = __kmp_threads[gtid];2482team = th->th.th_team;2483active = !team->t.t_serialized;2484th->th.th_ident = loc;24852486KMP_COUNT_BLOCK(OMP_SECTIONS);2487KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));24882489if (active) {2490// Setup sections in the same way as dynamic scheduled loops.2491// We need one shared data: which section is to execute next.2492// (in case parallel is not active, all sections will be executed on the2493// same thread)2494KMP_DEBUG_ASSERT(th->th.th_dispatch ==2495&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);24962497my_buffer_index = th->th.th_dispatch->th_disp_index++;24982499// reuse shared data structures from dynamic sched loops:2500sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(2501&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);2502KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,2503my_buffer_index));25042505th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;2506th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;25072508KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "2509"sh->buffer_index:%d\n",2510gtid, my_buffer_index, sh->buffer_index));2511__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,2512__kmp_eq<kmp_uint32> 
USE_ITT_BUILD_ARG(NULL));2513// Note: KMP_WAIT() cannot be used there: buffer index and2514// my_buffer_index are *always* 32-bit integers.2515KMP_MB();2516KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "2517"sh->buffer_index:%d\n",2518gtid, my_buffer_index, sh->buffer_index));25192520th->th.th_dispatch->th_dispatch_pr_current =2521nullptr; // sections construct doesn't need private data2522th->th.th_dispatch->th_dispatch_sh_current =2523CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);2524}25252526#if OMPT_SUPPORT && OMPT_OPTIONAL2527if (ompt_enabled.ompt_callback_work) {2528ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);2529ompt_task_info_t *task_info = __ompt_get_task_info_object(0);2530ompt_callbacks.ompt_callback(ompt_callback_work)(2531ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),2532&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));2533}2534#endif2535KMP_PUSH_PARTITIONED_TIMER(OMP_sections);25362537return active;2538}25392540/*!2541@ingroup WORK_SHARING2542@param loc source location information2543@param global_tid global thread number2544@param numberOfSections number of sections in the 'sections' construct2545@return unsigned [from 0 to n) - number (id) of the section to execute next on2546this thread. n (or any other number not in range) - nothing to execute on this2547thread2548*/25492550kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,2551kmp_int32 numberOfSections) {25522553KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);25542555kmp_info_t *th = __kmp_threads[gtid];2556#ifdef KMP_DEBUG2557kmp_team_t *team = th->th.th_team;2558#endif25592560KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,2561numberOfSections));25622563// For serialized case we should not call this function:2564KMP_DEBUG_ASSERT(!team->t.t_serialized);25652566dispatch_shared_info_template<kmp_int32> volatile *sh;25672568KMP_DEBUG_ASSERT(th->th.th_dispatch ==2569&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);25702571KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));2572sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(2573th->th.th_dispatch->th_dispatch_sh_current);2574KMP_DEBUG_ASSERT(sh);25752576kmp_int32 sectionIndex = 0;2577bool moreSectionsToExecute = true;25782579// Find section to execute:2580sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);2581if (sectionIndex >= numberOfSections) {2582moreSectionsToExecute = false;2583}25842585// status == 0: no more sections to execute;2586// OMPTODO: __kmpc_end_sections could be bypassed?2587if (!moreSectionsToExecute) {2588kmp_int32 num_done;25892590num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));25912592if (num_done == th->th.th_team_nproc - 1) {2593/* NOTE: release this buffer to be reused */25942595KMP_MB(); /* Flush all pending memory write invalidates. */25962597sh->u.s.num_done = 0;2598sh->u.s.iteration = 0;25992600KMP_MB(); /* Flush all pending memory write invalidates. */26012602sh->buffer_index += __kmp_dispatch_num_buffers;2603KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,2604sh->buffer_index));26052606KMP_MB(); /* Flush all pending memory write invalidates. 
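// Illustrative sketch (not part of the runtime, and not the only possible
// lowering): how a compiler might drive the sections entry points above for a
// two-section '#pragma omp sections' region; 'loc' and 'gtid' are assumed to
// be available in the generated code.
//
//   if (__kmpc_sections_init(loc, gtid)) {      // parallel region is active
//     for (kmp_int32 s = __kmpc_next_section(loc, gtid, 2); s >= 0 && s < 2;
//          s = __kmpc_next_section(loc, gtid, 2)) {
//       if (s == 0) { /* ...section 0... */ } else { /* ...section 1... */ }
//     }
//   } else {
//     /* serialized: this thread executes all sections itself */
//   }
//   __kmpc_end_sections(loc, gtid);
//   // Any required barrier is emitted separately by the compiler.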
*/26072608} // if26092610th->th.th_dispatch->th_deo_fcn = NULL;2611th->th.th_dispatch->th_dxo_fcn = NULL;2612th->th.th_dispatch->th_dispatch_sh_current = NULL;2613th->th.th_dispatch->th_dispatch_pr_current = NULL;26142615#if OMPT_SUPPORT && OMPT_OPTIONAL2616if (ompt_enabled.ompt_callback_dispatch) {2617ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);2618ompt_task_info_t *task_info = __ompt_get_task_info_object(0);2619ompt_data_t instance = ompt_data_none;2620instance.ptr = OMPT_GET_RETURN_ADDRESS(0);2621ompt_callbacks.ompt_callback(ompt_callback_dispatch)(2622&(team_info->parallel_data), &(task_info->task_data),2623ompt_dispatch_section, instance);2624}2625#endif2626}26272628return sectionIndex;2629}26302631/*!2632@ingroup WORK_SHARING2633@param loc source location information2634@param global_tid global thread number26352636End of "sections" construct.2637Don't need to wait here: barrier is added separately when needed.2638*/2639void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {26402641kmp_info_t *th = __kmp_threads[gtid];2642int active = !th->th.th_team->t.t_serialized;26432644KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));26452646if (!active) {2647// In active case call finalization is done in __kmpc_next_section2648#if OMPT_SUPPORT && OMPT_OPTIONAL2649if (ompt_enabled.ompt_callback_work) {2650ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);2651ompt_task_info_t *task_info = __ompt_get_task_info_object(0);2652ompt_callbacks.ompt_callback(ompt_callback_work)(2653ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),2654&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));2655}2656#endif2657}26582659KMP_POP_PARTITIONED_TIMER();2660KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));2661}26622663template <typename T>2664static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,2665kmp_int32 *plastiter, T *plower, T *pupper,2666typename traits_t<T>::signed_t incr) {2667typedef typename traits_t<T>::unsigned_t UT;2668kmp_uint32 team_id;2669kmp_uint32 nteams;2670UT trip_count;2671kmp_team_t *team;2672kmp_info_t *th;26732674KMP_DEBUG_ASSERT(plastiter && plower && pupper);2675KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));2676#ifdef KMP_DEBUG2677typedef typename traits_t<T>::signed_t ST;2678{2679char *buff;2680// create format specifiers before the debug output2681buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "2682"iter=(%%%s, %%%s, %%%s) signed?<%s>\n",2683traits_t<T>::spec, traits_t<T>::spec,2684traits_t<ST>::spec, traits_t<T>::spec);2685KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));2686__kmp_str_free(&buff);2687}2688#endif26892690if (__kmp_env_consistency_check) {2691if (incr == 0) {2692__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,2693loc);2694}2695if (incr > 0 ? 
(*pupper < *plower) : (*plower < *pupper)) {2696// The loop is illegal.2697// Some zero-trip loops maintained by compiler, e.g.:2698// for(i=10;i<0;++i) // lower >= upper - run-time check2699// for(i=0;i>10;--i) // lower <= upper - run-time check2700// for(i=0;i>10;++i) // incr > 0 - compile-time check2701// for(i=10;i<0;--i) // incr < 0 - compile-time check2702// Compiler does not check the following illegal loops:2703// for(i=0;i<10;i+=incr) // where incr<02704// for(i=10;i>0;i-=incr) // where incr<02705__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);2706}2707}2708__kmp_assert_valid_gtid(gtid);2709th = __kmp_threads[gtid];2710team = th->th.th_team;2711KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct2712nteams = th->th.th_teams_size.nteams;2713team_id = team->t.t_master_tid;2714KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);27152716// compute global trip count2717if (incr == 1) {2718trip_count = *pupper - *plower + 1;2719} else if (incr == -1) {2720trip_count = *plower - *pupper + 1;2721} else if (incr > 0) {2722// upper-lower can exceed the limit of signed type2723trip_count = (UT)(*pupper - *plower) / incr + 1;2724} else {2725trip_count = (UT)(*plower - *pupper) / (-incr) + 1;2726}27272728if (trip_count <= nteams) {2729KMP_DEBUG_ASSERT(2730__kmp_static == kmp_sch_static_greedy ||2731__kmp_static ==2732kmp_sch_static_balanced); // Unknown static scheduling type.2733// only some teams get single iteration, others get nothing2734if (team_id < trip_count) {2735*pupper = *plower = *plower + team_id * incr;2736} else {2737*plower = *pupper + incr; // zero-trip loop2738}2739if (plastiter != NULL)2740*plastiter = (team_id == trip_count - 1);2741} else {2742if (__kmp_static == kmp_sch_static_balanced) {2743UT chunk = trip_count / nteams;2744UT extras = trip_count % nteams;2745*plower +=2746incr * (team_id * chunk + (team_id < extras ? team_id : extras));2747*pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);2748if (plastiter != NULL)2749*plastiter = (team_id == nteams - 1);2750} else {2751T chunk_inc_count =2752(trip_count / nteams + ((trip_count % nteams) ? 
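// Illustrative sketch (not part of the runtime): the balanced per-team split
// computed by __kmp_dist_get_bounds above for the kmp_sch_static_balanced case
// with trip_count > nteams. The first trip_count % nteams teams receive one
// extra iteration; the function and parameter names are hypothetical.
//
//   #include <cstdint>
//   static void team_bounds(uint32_t team_id, uint32_t nteams, int64_t lower,
//                           int64_t incr, uint64_t trip_count, int64_t *lo,
//                           int64_t *hi) {
//     uint64_t chunk = trip_count / nteams;
//     uint64_t extras = trip_count % nteams;
//     *lo = lower + incr * (int64_t)(team_id * chunk +
//                                    (team_id < extras ? team_id : extras));
//     *hi = *lo + incr * (int64_t)chunk - (team_id < extras ? 0 : incr);
//   }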
1 : 0)) * incr;2753T upper = *pupper;2754KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);2755// Unknown static scheduling type.2756*plower += team_id * chunk_inc_count;2757*pupper = *plower + chunk_inc_count - incr;2758// Check/correct bounds if needed2759if (incr > 0) {2760if (*pupper < *plower)2761*pupper = traits_t<T>::max_value;2762if (plastiter != NULL)2763*plastiter = *plower <= upper && *pupper > upper - incr;2764if (*pupper > upper)2765*pupper = upper; // tracker C732582766} else {2767if (*pupper > *plower)2768*pupper = traits_t<T>::min_value;2769if (plastiter != NULL)2770*plastiter = *plower >= upper && *pupper < upper - incr;2771if (*pupper < upper)2772*pupper = upper; // tracker C732582773}2774}2775}2776}27772778//-----------------------------------------------------------------------------2779// Dispatch routines2780// Transfer call to template< type T >2781// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,2782// T lb, T ub, ST st, ST chunk )2783extern "C" {27842785/*!2786@ingroup WORK_SHARING2787@{2788@param loc Source location2789@param gtid Global thread id2790@param schedule Schedule type2791@param lb Lower bound2792@param ub Upper bound2793@param st Step (or increment if you prefer)2794@param chunk The chunk size to block with27952796This function prepares the runtime to start a dynamically scheduled for loop,2797saving the loop arguments.2798These functions are all identical apart from the types of the arguments.2799*/28002801void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,2802enum sched_type schedule, kmp_int32 lb,2803kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {2804KMP_DEBUG_ASSERT(__kmp_init_serial);2805#if OMPT_SUPPORT && OMPT_OPTIONAL2806OMPT_STORE_RETURN_ADDRESS(gtid);2807#endif2808__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);2809}2810/*!2811See @ref __kmpc_dispatch_init_42812*/2813void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,2814enum sched_type schedule, kmp_uint32 lb,2815kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {2816KMP_DEBUG_ASSERT(__kmp_init_serial);2817#if OMPT_SUPPORT && OMPT_OPTIONAL2818OMPT_STORE_RETURN_ADDRESS(gtid);2819#endif2820__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);2821}28222823/*!2824See @ref __kmpc_dispatch_init_42825*/2826void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,2827enum sched_type schedule, kmp_int64 lb,2828kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {2829KMP_DEBUG_ASSERT(__kmp_init_serial);2830#if OMPT_SUPPORT && OMPT_OPTIONAL2831OMPT_STORE_RETURN_ADDRESS(gtid);2832#endif2833__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);2834}28352836/*!2837See @ref __kmpc_dispatch_init_42838*/2839void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,2840enum sched_type schedule, kmp_uint64 lb,2841kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {2842KMP_DEBUG_ASSERT(__kmp_init_serial);2843#if OMPT_SUPPORT && OMPT_OPTIONAL2844OMPT_STORE_RETURN_ADDRESS(gtid);2845#endif2846__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);2847}28482849/*!2850See @ref __kmpc_dispatch_init_428512852Difference from __kmpc_dispatch_init set of functions is these functions2853are called for composite distribute parallel for construct. 
Thus before2854regular iterations dispatching we need to calc per-team iteration space.28552856These functions are all identical apart from the types of the arguments.2857*/2858void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,2859enum sched_type schedule, kmp_int32 *p_last,2860kmp_int32 lb, kmp_int32 ub, kmp_int32 st,2861kmp_int32 chunk) {2862KMP_DEBUG_ASSERT(__kmp_init_serial);2863#if OMPT_SUPPORT && OMPT_OPTIONAL2864OMPT_STORE_RETURN_ADDRESS(gtid);2865#endif2866__kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);2867__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);2868}28692870void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,2871enum sched_type schedule, kmp_int32 *p_last,2872kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,2873kmp_int32 chunk) {2874KMP_DEBUG_ASSERT(__kmp_init_serial);2875#if OMPT_SUPPORT && OMPT_OPTIONAL2876OMPT_STORE_RETURN_ADDRESS(gtid);2877#endif2878__kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);2879__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);2880}28812882void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,2883enum sched_type schedule, kmp_int32 *p_last,2884kmp_int64 lb, kmp_int64 ub, kmp_int64 st,2885kmp_int64 chunk) {2886KMP_DEBUG_ASSERT(__kmp_init_serial);2887#if OMPT_SUPPORT && OMPT_OPTIONAL2888OMPT_STORE_RETURN_ADDRESS(gtid);2889#endif2890__kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);2891__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);2892}28932894void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,2895enum sched_type schedule, kmp_int32 *p_last,2896kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,2897kmp_int64 chunk) {2898KMP_DEBUG_ASSERT(__kmp_init_serial);2899#if OMPT_SUPPORT && OMPT_OPTIONAL2900OMPT_STORE_RETURN_ADDRESS(gtid);2901#endif2902__kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);2903__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);2904}29052906/*!2907@param loc Source code location2908@param gtid Global thread id2909@param p_last Pointer to a flag set to one if this is the last chunk or zero2910otherwise2911@param p_lb Pointer to the lower bound for the next chunk of work2912@param p_ub Pointer to the upper bound for the next chunk of work2913@param p_st Pointer to the stride for the next chunk of work2914@return one if there is work to be done, zero otherwise29152916Get the next dynamically allocated chunk of work for this thread.2917If there is no more work, then the lb,ub and stride need not be modified.2918*/2919int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,2920kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {2921#if OMPT_SUPPORT && OMPT_OPTIONAL2922OMPT_STORE_RETURN_ADDRESS(gtid);2923#endif2924return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st2925#if OMPT_SUPPORT && OMPT_OPTIONAL2926,2927OMPT_LOAD_RETURN_ADDRESS(gtid)2928#endif2929);2930}29312932/*!2933See @ref __kmpc_dispatch_next_42934*/2935int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,2936kmp_uint32 *p_lb, kmp_uint32 *p_ub,2937kmp_int32 *p_st) {2938#if OMPT_SUPPORT && OMPT_OPTIONAL2939OMPT_STORE_RETURN_ADDRESS(gtid);2940#endif2941return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st2942#if OMPT_SUPPORT && OMPT_OPTIONAL2943,2944OMPT_LOAD_RETURN_ADDRESS(gtid)2945#endif2946);2947}29482949/*!2950See @ref __kmpc_dispatch_next_42951*/2952int 
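// Illustrative sketch (not part of the runtime, and compiler-generated in
// practice): the caller-side pattern for the __kmpc_dispatch_* entry points in
// this file, for a loop "for (i = 0; i < n; ++i)" passed to the runtime with
// inclusive bounds. 'loc', 'gtid', 'n' and 'chunk' are assumed to be available,
// and the schedule constant is normally chosen by the compiler.
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
//                          chunk);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st) {
//       /* ...loop body... */
//     }
//   }
//   // __kmpc_dispatch_fini_4 (below) marks the end of a dynamic loop;
//   // as noted earlier, in some cases it is not called.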
__kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,2953kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {2954#if OMPT_SUPPORT && OMPT_OPTIONAL2955OMPT_STORE_RETURN_ADDRESS(gtid);2956#endif2957return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st2958#if OMPT_SUPPORT && OMPT_OPTIONAL2959,2960OMPT_LOAD_RETURN_ADDRESS(gtid)2961#endif2962);2963}29642965/*!2966See @ref __kmpc_dispatch_next_42967*/2968int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,2969kmp_uint64 *p_lb, kmp_uint64 *p_ub,2970kmp_int64 *p_st) {2971#if OMPT_SUPPORT && OMPT_OPTIONAL2972OMPT_STORE_RETURN_ADDRESS(gtid);2973#endif2974return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st2975#if OMPT_SUPPORT && OMPT_OPTIONAL2976,2977OMPT_LOAD_RETURN_ADDRESS(gtid)2978#endif2979);2980}29812982/*!2983@param loc Source code location2984@param gtid Global thread id29852986Mark the end of a dynamic loop.2987*/2988void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {2989__kmp_dispatch_finish<kmp_uint32>(gtid, loc);2990}29912992/*!2993See @ref __kmpc_dispatch_fini_42994*/2995void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {2996__kmp_dispatch_finish<kmp_uint64>(gtid, loc);2997}29982999/*!3000See @ref __kmpc_dispatch_fini_43001*/3002void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {3003__kmp_dispatch_finish<kmp_uint32>(gtid, loc);3004}30053006/*!3007See @ref __kmpc_dispatch_fini_43008*/3009void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {3010__kmp_dispatch_finish<kmp_uint64>(gtid, loc);3011}30123013/*!3014See @ref __kmpc_dispatch_deinit3015*/3016void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}3017/*! @} */30183019//-----------------------------------------------------------------------------3020// Non-template routines from kmp_dispatch.cpp used in other sources30213022kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {3023return value == checker;3024}30253026kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {3027return value != checker;3028}30293030kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {3031return value < checker;3032}30333034kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {3035return value >= checker;3036}30373038kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {3039return value <= checker;3040}30413042kmp_uint323043__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,3044kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),3045void *obj // Higher-level synchronization object, or NULL.3046) {3047// note: we may not belong to a team at this point3048volatile kmp_uint32 *spin = spinner;3049kmp_uint32 check = checker;3050kmp_uint32 spins;3051kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;3052kmp_uint32 r;3053kmp_uint64 time;30543055KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));3056KMP_INIT_YIELD(spins);3057KMP_INIT_BACKOFF(time);3058// main wait spin loop3059while (!f(r = TCR_4(*spin), check)) {3060KMP_FSYNC_SPIN_PREPARE(obj);3061/* GEH - remove this since it was accidentally introduced when kmp_wait was3062split. 
It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
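// Illustrative sketch (not part of the runtime): using the exported
// __kmp_wait_4 helper with the __kmp_eq_4 predicate defined above to wait
// until a shared 32-bit counter reaches a given value, in the same way the
// dispatch code waits on sh->buffer_index. 'release_flag' and
// 'wait_for_generation' are hypothetical names.
//
//   static volatile kmp_uint32 release_flag = 0;
//   static void wait_for_generation(kmp_uint32 generation) {
//     (void)__kmp_wait_4(&release_flag, generation, __kmp_eq_4,
//                        NULL /* no higher-level synchronization object */);
//   }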