Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) that
  // a) parm3 is properly aligned and
  // b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  UT pchunks; // total number of chunks for processes with p-core
  UT num_procs_with_pcore; // number of threads with p-core
  T first_thread_with_ecore;
#endif
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

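// Illustrative arithmetic, not part of the upstream header (it assumes the
// common 64-byte cache line): for the widest T, kmp_int64, the parm group
// above is 4 * 8 = 32 bytes. Starting at a 32-byte-aligned offset 32k it
// occupies bytes [32k, 32k + 32), which always lies inside the single 64-byte
// line [64 * floor(k/2), 64 * floor(k/2) + 64), so parm1-4 can never straddle
// a cache-line boundary.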
#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // dummy to retain the structure size now that ordered_iteration is a scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that cache thrashing
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};
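// Illustrative note, not part of the upstream header: the unions in the
// templates above exist so that each templated structure is at least as large
// as its widest concrete counterpart, because the runtime reinterpret_casts
// the generic per-thread dispatch buffers to these templates (see
// __kmp_dispatch_deo() / __kmp_dispatch_dxo() below). A hypothetical
// compile-time statement of that intent could be:
//
//   static_assert(
//       sizeof(dispatch_shared_info_template<kmp_uint32>::shared_info_tmpl) >=
//           sizeof(dispatch_shared_info64_t),
//       "union keeps the templated structure at full size");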
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value which spinner is compared against (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj - higher-level synchronization object to report to ittnotify. It
        is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT(), the latter should
        report the same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of the exit
       lock. */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit, then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
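// Illustrative usage sketch, not part of the upstream header: a caller that
// needs to block until a shared counter reaches a threshold combines
// __kmp_wait() with one of the predicates above, e.g. for a hypothetical
// spinner `flag`:
//
//   volatile kmp_uint32 flag = 0; // hypothetical shared location
//   ...
//   (void)__kmp_wait<kmp_uint32>(&flag, 5U,
//                                __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
//
// i.e. spin (yielding when oversubscribed) until flag >= 5. This is the same
// pattern __kmp_dispatch_deo() below uses with sh->u.s.ordered_iteration and
// pr->u.p.ordered_lower.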
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}
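// Illustrative summary, not part of the upstream header: __kmp_dispatch_deo()
// above and __kmp_dispatch_dxo() below form the enter/exit pair for an
// ordered section under dynamic dispatch. Stripped of consistency checks and
// tracing, the handshake reduces to:
//
//   // enter ordered (deo): wait until the shared counter reaches this chunk
//   __kmp_wait<UT>(&sh->u.s.ordered_iteration, pr->u.p.ordered_lower,
//                  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
//   // exit ordered (dxo): record the bump and release the next chunk in line
//   pr->ordered_bumped += 1;
//   test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
//
// so ordered sections retire in chunk order.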
template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
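// Illustrative trace, not part of the upstream header: __kmp_pow() is plain
// binary exponentiation, so e.g. y = 5 (binary 101) costs three loop
// iterations instead of five multiplications by x:
//
//   y = 5: bit set   -> s = x         (then x -> x^2, y -> 2)
//   y = 2: bit clear ->               (then x -> x^4, y -> 1)
//   y = 1: bit set   -> s = x * x^4   (then y -> 0, loop exits)
//
// giving s = x^5 with O(log y) multiplications.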
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks
   with index greater than or equal to idx).
   __forceinline seems to be broken, so that if we __forceinline this function
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     work around that in the caller code by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, e.g.
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
#endif // KMP_DISPATCH_H
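// Illustrative arithmetic, not part of the upstream header: with the defaults
// above and, say, nproc = 8 threads and chunk = 1,
//
//   p2 = 2 * 8 * (1 + 1) = 32    // switch to plain dynamic once about 32
//                                // iterations remain
//   p3 = 1 / (2 * 8)     = 1/16  // each dispatched chunk is ~1/16 of the
//                                // iterations still remaining
//
// while guided_flt_param = 0.5 is simply 1.0 / guided_int_param.
// __kmp_dispatch_guided_remaining() above rounds tc * base^idx up to an
// integer to report how many iterations remain unassigned after idx chunks,
// for whatever base the caller derives from these parameters.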