Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) that
  // a) parm3 is properly aligned and
  // b) all of parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured, though).
  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  UT pchunks; // total number of chunks for processes with p-core
  UT num_procs_with_pcore; // number of threads with p-core
  T first_thread_with_ecore;
#endif
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

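// Illustrative arithmetic, not part of the upstream header (it assumes the
// common 64-byte cache line): for the widest T, kmp_int64, the parm group
// above is 4 * 8 = 32 bytes. Starting at a 32-byte-aligned offset 32k it
// occupies bytes [32k, 32k + 32), which always lies inside the single 64-byte
// line [64 * floor(k/2), 64 * floor(k/2) + 64), so parm1-4 can never straddle
// a cache-line boundary.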
#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // dummy to retain the structure size now that ordered_iteration is a scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that cache thrashing
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};
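// Illustrative note, not part of the upstream header: the unions in the
// templates above exist so that each templated structure is at least as large
// as its widest concrete counterpart, because the runtime reinterpret_casts
// the generic per-thread dispatch buffers to these templates (see
// __kmp_dispatch_deo() / __kmp_dispatch_dxo() below). A hypothetical
// compile-time statement of that intent could be:
//
//   static_assert(
//       sizeof(dispatch_shared_info_template<kmp_uint32>::shared_info_tmpl) >=
//           sizeof(dispatch_shared_info64_t),
//       "union keeps the templated structure at full size");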
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that pauses between checks.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is an unsigned 4- or 8-byte type
        spinner - memory location whose value is checked
        checker - value which spinner is compared against (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj - higher-level synchronization object to report to ittnotify. It
        is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT(), the latter should
        report the same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of the exit
       lock. */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit, then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
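// Illustrative usage sketch, not part of the upstream header: a caller that
// needs to block until a shared counter reaches a threshold combines
// __kmp_wait() with one of the predicates above, e.g. for a hypothetical
// spinner `flag`:
//
//   volatile kmp_uint32 flag = 0; // hypothetical shared location
//   ...
//   (void)__kmp_wait<kmp_uint32>(&flag, 5U,
//                                __kmp_ge<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
//
// i.e. spin (yielding when oversubscribed) until flag >= 5. This is the same
// pattern __kmp_dispatch_deo() below uses with sh->u.s.ordered_iteration and
// pr->u.p.ordered_lower.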
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}
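// Illustrative summary, not part of the upstream header: __kmp_dispatch_deo()
// above and __kmp_dispatch_dxo() below form the enter/exit pair for an
// ordered section under dynamic dispatch. Stripped of consistency checks and
// tracing, the handshake reduces to:
//
//   // enter ordered (deo): wait until the shared counter reaches this chunk
//   __kmp_wait<UT>(&sh->u.s.ordered_iteration, pr->u.p.ordered_lower,
//                  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
//   // exit ordered (dxo): record the bump and release the next chunk in line
//   pr->ordered_bumped += 1;
//   test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
//
// so ordered sections retire in chunk order.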
template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
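// Illustrative trace, not part of the upstream header: __kmp_pow() is plain
// binary exponentiation, so e.g. y = 5 (binary 101) costs three loop
// iterations instead of five multiplications by x:
//
//   y = 5: bit set   -> s = x         (then x -> x^2, y -> 2)
//   y = 2: bit clear ->               (then x -> x^4, y -> 1)
//   y = 1: bit set   -> s = x * x^4   (then y -> 0, loop exits)
//
// giving s = x^5 with O(log y) multiplications.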
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks
   with index greater than or equal to idx).
   __forceinline seems to be broken, so that if we __forceinline this function
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails).
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     work around that in the caller code by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, e.g.
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
#endif // KMP_DISPATCH_H
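// Illustrative arithmetic, not part of the upstream header: with the defaults
// above and, say, nproc = 8 threads and chunk = 1,
//
//   p2 = 2 * 8 * (1 + 1) = 32    // switch to plain dynamic once about 32
//                                // iterations remain
//   p3 = 1 / (2 * 8)     = 1/16  // each dispatched chunk is ~1/16 of the
//                                // iterations still remaining
//
// while guided_flt_param = 0.5 is simply 1.0 / guided_int_param.
// __kmp_dispatch_guided_remaining() above rounds tc * base^idx up to an
// integer to report how many iterations remain unassigned after idx chunks,
// for whatever base the caller derives from these parameters.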