GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch.cpp
1
/*
2
* kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3
*/
4
5
//===----------------------------------------------------------------------===//
6
//
7
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8
// See https://llvm.org/LICENSE.txt for license information.
9
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10
//
11
//===----------------------------------------------------------------------===//
12
13
/* Dynamic scheduling initialization and dispatch.
14
*
15
* NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16
* it may change values between parallel regions. __kmp_max_nth
17
* is the largest value __kmp_nth may take, 1 is the smallest.
18
*/
19
20
#include "kmp.h"
21
#include "kmp_error.h"
22
#include "kmp_i18n.h"
23
#include "kmp_itt.h"
24
#include "kmp_stats.h"
25
#include "kmp_str.h"
26
#if KMP_USE_X87CONTROL
27
#include <float.h>
28
#endif
29
#include "kmp_lock.h"
30
#include "kmp_dispatch.h"
31
#if KMP_USE_HIER_SCHED
32
#include "kmp_dispatch_hier.h"
33
#endif
34
35
#if OMPT_SUPPORT
36
#include "ompt-specific.h"
37
#endif
38
39
/* ------------------------------------------------------------------------ */
40
/* ------------------------------------------------------------------------ */
41
42
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43
kmp_info_t *th;
44
45
KMP_DEBUG_ASSERT(gtid_ref);
46
47
if (__kmp_env_consistency_check) {
48
th = __kmp_threads[*gtid_ref];
49
if (th->th.th_root->r.r_active &&
50
(th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51
#if KMP_USE_DYNAMIC_LOCK
52
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53
#else
54
__kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55
#endif
56
}
57
}
58
}
59
60
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61
kmp_info_t *th;
62
63
if (__kmp_env_consistency_check) {
64
th = __kmp_threads[*gtid_ref];
65
if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66
__kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67
}
68
}
69
}
70
71
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73
bool use_hier = false) {
74
// Pick up the nonmonotonic/monotonic bits from the scheduling type
75
// Nonmonotonic is the default for dynamic schedules when no modifier is specified
76
int monotonicity = SCHEDULE_NONMONOTONIC;
77
78
// Let default be monotonic for executables
79
// compiled with OpenMP* 4.5 or earlier compilers
80
if (loc != NULL && loc->get_openmp_version() < 50)
81
monotonicity = SCHEDULE_MONOTONIC;
82
83
if (use_hier || __kmp_force_monotonic)
84
monotonicity = SCHEDULE_MONOTONIC;
85
else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86
monotonicity = SCHEDULE_NONMONOTONIC;
87
else if (SCHEDULE_HAS_MONOTONIC(schedule))
88
monotonicity = SCHEDULE_MONOTONIC;
89
90
return monotonicity;
91
}
92
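// For example (illustrative, derived from the logic above): with OpenMP 5.0+
// code and no schedule modifier,
//   __kmp_get_monotonicity(loc, kmp_sch_dynamic_chunked)  -> SCHEDULE_NONMONOTONIC
// while an explicit monotonic modifier (kmp_sch_modifier_monotonic bit set),
// a hierarchical schedule (use_hier), __kmp_force_monotonic, or code compiled
// as OpenMP 4.5 or earlier yields SCHEDULE_MONOTONIC.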
93
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
94
// Return floating point number rounded to two decimal places
95
static inline float __kmp_round_2decimal_val(float num) {
96
return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
97
}
98
static inline int __kmp_get_round_val(float num) {
99
return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
100
}
101
#endif
102
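// Examples (illustrative): __kmp_get_round_val(2.5f) == 3,
// __kmp_get_round_val(-2.5f) == -3, and
// __kmp_round_2decimal_val(1.666f) == 1.67f (nearest hundredth).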
103
template <typename T>
104
inline void
105
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
106
dispatch_private_info_template<T> *pr,
107
typename traits_t<T>::unsigned_t nchunks, T nproc,
108
typename traits_t<T>::unsigned_t &init,
109
T &small_chunk, T &extras, T &p_extra) {
110
111
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112
if (pr->flags.use_hybrid) {
113
kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
114
kmp_hw_core_type_t type =
115
(kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116
T pchunks = pr->u.p.pchunks;
117
T echunks = nchunks - pchunks;
118
T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119
T num_procs_with_ecore = nproc - num_procs_with_pcore;
120
T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121
T big_chunk =
122
pchunks / num_procs_with_pcore; // chunks per thread with p-core
123
small_chunk =
124
echunks / num_procs_with_ecore; // chunks per thread with e-core
125
126
extras =
127
(pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128
129
p_extra = (big_chunk - small_chunk);
130
131
if (type == KMP_HW_CORE_TYPE_CORE) {
132
if (id < first_thread_with_ecore) {
133
init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134
} else {
135
init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136
(id < extras ? id : extras);
137
}
138
} else {
139
if (id == first_thread_with_ecore) {
140
init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141
} else {
142
init = id * small_chunk + first_thread_with_ecore * p_extra +
143
(id < extras ? id : extras);
144
}
145
}
146
p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147
return;
148
}
149
#endif
150
151
small_chunk = nchunks / nproc; // chunks per thread
152
extras = nchunks % nproc;
153
p_extra = 0;
154
init = id * small_chunk + (id < extras ? id : extras);
155
}
156
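// Worked example (illustrative) for the non-hybrid path: nchunks = 10 and
// nproc = 4 give small_chunk = 2 and extras = 2, so the per-thread starting
// chunk indices are
//   id 0: init = 0   (owns chunks 0-2, one extra)
//   id 1: init = 3   (owns chunks 3-5, one extra)
//   id 2: init = 6   (owns chunks 6-7)
//   id 3: init = 8   (owns chunks 8-9)
// matching init = id * small_chunk + min(id, extras).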
157
#if KMP_STATIC_STEAL_ENABLED
158
enum { // values for steal_flag (possible states of private per-loop buffer)
159
UNUSED = 0,
160
CLAIMED = 1, // owner thread started initialization
161
READY = 2, // available for stealing
162
THIEF = 3 // finished by owner, or claimed by thief
163
// possible state changes:
164
// 0 -> 1 owner only, sync
165
// 0 -> 3 thief only, sync
166
// 1 -> 2 owner only, async
167
// 2 -> 3 owner only, async
168
// 3 -> 2 owner only, async
169
// 3 -> 0 last thread finishing the loop, async
170
};
171
#endif
172
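// Sketch (illustrative, mirroring the uses below): the owner and any thief
// race for an UNUSED buffer with one compare-and-swap each, e.g.
//   kmp_uint32 old = UNUSED;
//   pr->steal_flag.compare_exchange_strong(old, CLAIMED); // owner claims buffer
//   // ... a thief instead tries compare_exchange_strong(old, THIEF) and, on
//   // success, takes the owner's whole initial range of chunks.
// The remaining (async) transitions are performed with plain release stores
// (KMP_ATOMIC_ST_REL) since each has a single writer.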
173
// Initialize a dispatch_private_info_template<T> buffer for a particular
174
// type of schedule,chunk. The loop description is found in lb (lower bound),
175
// ub (upper bound), and st (stride). nproc is the number of threads relevant
176
// to the scheduling (often the number of threads in a team, but not always if
177
// hierarchical scheduling is used). tid is the id of the thread calling
178
// the function within the group of nproc threads. It will have a value
179
// between 0 and nproc - 1. This is often just the thread id within a team, but
180
// is not necessarily the case when using hierarchical scheduling.
181
// loc is the source file location of the corresponding loop
182
// gtid is the global thread id
183
template <typename T>
184
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
185
dispatch_private_info_template<T> *pr,
186
enum sched_type schedule, T lb, T ub,
187
typename traits_t<T>::signed_t st,
188
#if USE_ITT_BUILD
189
kmp_uint64 *cur_chunk,
190
#endif
191
typename traits_t<T>::signed_t chunk,
192
T nproc, T tid) {
193
typedef typename traits_t<T>::unsigned_t UT;
194
typedef typename traits_t<T>::floating_t DBL;
195
196
int active;
197
T tc;
198
kmp_info_t *th;
199
kmp_team_t *team;
200
int monotonicity;
201
bool use_hier;
202
203
#ifdef KMP_DEBUG
204
typedef typename traits_t<T>::signed_t ST;
205
{
206
char *buff;
207
// create format specifiers before the debug output
208
buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209
"pr:%%p lb:%%%s ub:%%%s st:%%%s "
210
"schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211
traits_t<T>::spec, traits_t<T>::spec,
212
traits_t<ST>::spec, traits_t<ST>::spec,
213
traits_t<T>::spec, traits_t<T>::spec);
214
KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215
__kmp_str_free(&buff);
216
}
217
#endif
218
/* setup data */
219
th = __kmp_threads[gtid];
220
team = th->th.th_team;
221
active = !team->t.t_serialized;
222
223
#if USE_ITT_BUILD
224
int itt_need_metadata_reporting =
225
__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227
team->t.t_active_level == 1;
228
#endif
229
230
#if KMP_USE_HIER_SCHED
231
use_hier = pr->flags.use_hier;
232
#else
233
use_hier = false;
234
#endif
235
236
/* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239
240
/* Pick up the nomerge/ordered bits from the scheduling type */
241
if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242
pr->flags.nomerge = TRUE;
243
schedule =
244
(enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245
} else {
246
pr->flags.nomerge = FALSE;
247
}
248
pr->type_size = traits_t<T>::type_size; // remember the size of variables
249
if (kmp_ord_lower & schedule) {
250
pr->flags.ordered = TRUE;
251
schedule =
252
(enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253
} else {
254
pr->flags.ordered = FALSE;
255
}
256
// Ordered overrides nonmonotonic
257
if (pr->flags.ordered) {
258
monotonicity = SCHEDULE_MONOTONIC;
259
}
260
261
if (schedule == kmp_sch_static) {
262
schedule = __kmp_static;
263
} else {
264
if (schedule == kmp_sch_runtime) {
265
// Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266
// not specified)
267
schedule = team->t.t_sched.r_sched_type;
268
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270
if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271
monotonicity = SCHEDULE_MONOTONIC;
272
// Detail the schedule if needed (global controls are differentiated
273
// appropriately)
274
if (schedule == kmp_sch_guided_chunked) {
275
schedule = __kmp_guided;
276
} else if (schedule == kmp_sch_static) {
277
schedule = __kmp_static;
278
}
279
// Use the chunk size specified by OMP_SCHEDULE (or default if not
280
// specified)
281
chunk = team->t.t_sched.chunk;
282
#if USE_ITT_BUILD
283
if (cur_chunk)
284
*cur_chunk = chunk;
285
#endif
286
#ifdef KMP_DEBUG
287
{
288
char *buff;
289
// create format specifiers before the debug output
290
buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291
"schedule:%%d chunk:%%%s\n",
292
traits_t<ST>::spec);
293
KD_TRACE(10, (buff, gtid, schedule, chunk));
294
__kmp_str_free(&buff);
295
}
296
#endif
297
} else {
298
if (schedule == kmp_sch_guided_chunked) {
299
schedule = __kmp_guided;
300
}
301
if (chunk <= 0) {
302
chunk = KMP_DEFAULT_CHUNK;
303
}
304
}
305
306
if (schedule == kmp_sch_auto) {
307
// mapping and differentiation: in the __kmp_do_serial_initialize()
308
schedule = __kmp_auto;
309
#ifdef KMP_DEBUG
310
{
311
char *buff;
312
// create format specifiers before the debug output
313
buff = __kmp_str_format(
314
"__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315
"schedule:%%d chunk:%%%s\n",
316
traits_t<ST>::spec);
317
KD_TRACE(10, (buff, gtid, schedule, chunk));
318
__kmp_str_free(&buff);
319
}
320
#endif
321
}
322
#if KMP_STATIC_STEAL_ENABLED
323
// map nonmonotonic:dynamic to static steal
324
if (schedule == kmp_sch_dynamic_chunked) {
325
if (monotonicity == SCHEDULE_NONMONOTONIC)
326
schedule = kmp_sch_static_steal;
327
}
328
#endif
329
/* guided analytical not safe for too many threads */
330
if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331
schedule = kmp_sch_guided_iterative_chunked;
332
KMP_WARNING(DispatchManyThreads);
333
}
334
if (schedule == kmp_sch_runtime_simd) {
335
// compiler provides simd_width in the chunk parameter
336
schedule = team->t.t_sched.r_sched_type;
337
monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338
schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339
// Detail the schedule if needed (global controls are differentiated
340
// appropriately)
341
if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342
schedule == __kmp_static) {
343
schedule = kmp_sch_static_balanced_chunked;
344
} else {
345
if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346
schedule = kmp_sch_guided_simd;
347
}
348
chunk = team->t.t_sched.chunk * chunk;
349
}
350
#if USE_ITT_BUILD
351
if (cur_chunk)
352
*cur_chunk = chunk;
353
#endif
354
#ifdef KMP_DEBUG
355
{
356
char *buff;
357
// create format specifiers before the debug output
358
buff = __kmp_str_format(
359
"__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360
" chunk:%%%s\n",
361
traits_t<ST>::spec);
362
KD_TRACE(10, (buff, gtid, schedule, chunk));
363
__kmp_str_free(&buff);
364
}
365
#endif
366
}
367
pr->u.p.parm1 = chunk;
368
}
369
KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370
"unknown scheduling type");
371
372
pr->u.p.count = 0;
373
374
if (__kmp_env_consistency_check) {
375
if (st == 0) {
376
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377
(pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378
}
379
}
380
// compute trip count
381
if (st == 1) { // most common case
382
if (ub >= lb) {
383
tc = ub - lb + 1;
384
} else { // ub < lb
385
tc = 0; // zero-trip
386
}
387
} else if (st < 0) {
388
if (lb >= ub) {
389
// AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390
// where the division needs to be unsigned regardless of the result type
391
tc = (UT)(lb - ub) / (-st) + 1;
392
} else { // lb < ub
393
tc = 0; // zero-trip
394
}
395
} else { // st > 0
396
if (ub >= lb) {
397
// AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398
// where the division needs to be unsigned regardless of the result type
399
tc = (UT)(ub - lb) / st + 1;
400
} else { // ub < lb
401
tc = 0; // zero-trip
402
}
403
}
404
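// Worked examples (illustrative) of the trip count computed above:
//   st = 1,  lb = 0,  ub = 9  ->  tc = 9 - 0 + 1        = 10
//   st = 3,  lb = 0,  ub = 9  ->  tc = (9 - 0) / 3 + 1  = 4   (0,3,6,9)
//   st = -2, lb = 10, ub = 1  ->  tc = (10 - 1) / 2 + 1 = 5   (10,8,6,4,2)
// The unsigned cast keeps the division correct even when ub - lb overflows
// the signed type.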
405
#if KMP_STATS_ENABLED
406
if (KMP_MASTER_GTID(gtid)) {
407
KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408
}
409
#endif
410
411
pr->u.p.lb = lb;
412
pr->u.p.ub = ub;
413
pr->u.p.st = st;
414
pr->u.p.tc = tc;
415
416
#if KMP_OS_WINDOWS
417
pr->u.p.last_upper = ub + st;
418
#endif /* KMP_OS_WINDOWS */
419
420
/* NOTE: only active parallel regions have active ordered sections */
421
422
if (active) {
423
if (pr->flags.ordered) {
424
pr->ordered_bumped = 0;
425
pr->u.p.ordered_lower = 1;
426
pr->u.p.ordered_upper = 0;
427
}
428
}
429
430
switch (schedule) {
431
#if KMP_STATIC_STEAL_ENABLED
432
case kmp_sch_static_steal: {
433
T ntc, init = 0;
434
435
KD_TRACE(100,
436
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437
gtid));
438
439
ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440
if (nproc > 1 && ntc >= nproc) {
441
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442
T id = tid;
443
T small_chunk, extras, p_extra = 0;
444
kmp_uint32 old = UNUSED;
445
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446
if (traits_t<T>::type_size > 4) {
447
// AC: TODO: check if 16-byte CAS available and use it to
448
// improve performance (probably wait for explicit request
449
// before spending time on this).
450
// For now use dynamically allocated per-private-buffer lock,
451
// free memory in __kmp_dispatch_next when status==0.
452
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453
__kmp_init_lock(pr->u.p.steal_lock);
454
}
455
456
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457
// Iterations are divided in a 60/40 skewed distribution between CORE and
458
// ATOM processors for hybrid systems
459
bool use_hybrid = false;
460
kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461
T first_thread_with_ecore = 0;
462
T num_procs_with_pcore = 0;
463
T num_procs_with_ecore = 0;
464
T p_ntc = 0, e_ntc = 0;
465
if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466
__kmp_affinity.type != affinity_explicit) {
467
use_hybrid = true;
468
core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469
if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470
__kmp_first_osid_with_ecore > -1) {
471
for (int i = 0; i < team->t.t_nproc; ++i) {
472
kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473
->th.th_topology_attrs.core_type;
474
int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475
if (id == __kmp_first_osid_with_ecore) {
476
first_thread_with_ecore =
477
team->t.t_threads[i]->th.th_info.ds.ds_tid;
478
}
479
if (type == KMP_HW_CORE_TYPE_CORE) {
480
num_procs_with_pcore++;
481
} else if (type == KMP_HW_CORE_TYPE_ATOM) {
482
num_procs_with_ecore++;
483
} else {
484
use_hybrid = false;
485
break;
486
}
487
}
488
}
489
if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490
float multiplier = 60.0 / 40.0;
491
float p_ratio = (float)num_procs_with_pcore / nproc;
492
float e_ratio = (float)num_procs_with_ecore / nproc;
493
float e_multiplier =
494
(float)1 /
495
(((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496
float p_multiplier = multiplier * e_multiplier;
497
p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498
if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499
e_ntc =
500
(int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501
else
502
e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503
KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504
505
// Use regular static steal if not enough chunks for skewed
506
// distribution
507
use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508
e_ntc >= num_procs_with_ecore)
509
? true
510
: false);
511
} else {
512
use_hybrid = false;
513
}
514
}
515
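// Worked example (illustrative): ntc = 100 chunks on 4 P-cores + 4 E-cores
// (nproc = 8) gives multiplier = 1.5, p_ratio = e_ratio = 0.5,
// e_multiplier = 1 / (1.5 * 4 / 8 + 0.5) = 0.8, p_multiplier = 1.2, so
// p_ntc = round(100 * 0.5 * 1.2) = 60 and e_ntc = 40, i.e. the 60/40 split.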
pr->flags.use_hybrid = use_hybrid;
516
pr->u.p.pchunks = p_ntc;
517
pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518
pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519
520
if (use_hybrid) {
521
KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522
T big_chunk = p_ntc / num_procs_with_pcore;
523
small_chunk = e_ntc / num_procs_with_ecore;
524
525
extras =
526
(p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527
528
p_extra = (big_chunk - small_chunk);
529
530
if (core_type == KMP_HW_CORE_TYPE_CORE) {
531
if (id < first_thread_with_ecore) {
532
init =
533
id * small_chunk + id * p_extra + (id < extras ? id : extras);
534
} else {
535
init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536
(id < extras ? id : extras);
537
}
538
} else {
539
if (id == first_thread_with_ecore) {
540
init =
541
id * small_chunk + id * p_extra + (id < extras ? id : extras);
542
} else {
543
init = id * small_chunk + first_thread_with_ecore * p_extra +
544
(id < extras ? id : extras);
545
}
546
}
547
p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548
} else
549
#endif
550
{
551
small_chunk = ntc / nproc;
552
extras = ntc % nproc;
553
init = id * small_chunk + (id < extras ? id : extras);
554
p_extra = 0;
555
}
556
pr->u.p.count = init;
557
if (claimed) { // did we succeed in claiming our own buffer?
558
pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559
// Other threads will inspect steal_flag when searching for a victim.
560
// READY means other threads may steal from this thread from now on.
561
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562
} else {
563
// another thread has stolen our whole range
564
KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565
pr->u.p.ub = init; // mark that there are no iterations to work on
566
}
567
pr->u.p.parm2 = ntc; // save number of chunks
568
// parm3 is the number of times to attempt stealing which is
569
// nproc (just a heuristic; could be optimized later on).
570
pr->u.p.parm3 = nproc;
571
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572
break;
573
} else {
574
/* too few chunks: switching to kmp_sch_dynamic_chunked */
575
schedule = kmp_sch_dynamic_chunked;
576
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577
"kmp_sch_dynamic_chunked\n",
578
gtid));
579
goto dynamic_init;
580
break;
581
} // if
582
} // case
583
#endif
584
case kmp_sch_static_balanced: {
585
T init, limit;
586
587
KD_TRACE(
588
100,
589
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590
gtid));
591
592
if (nproc > 1) {
593
T id = tid;
594
595
if (tc < nproc) {
596
if (id < tc) {
597
init = id;
598
limit = id;
599
pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600
} else {
601
pr->u.p.count = 1; /* means no more chunks to execute */
602
pr->u.p.parm1 = FALSE;
603
break;
604
}
605
} else {
606
T small_chunk = tc / nproc;
607
T extras = tc % nproc;
608
init = id * small_chunk + (id < extras ? id : extras);
609
limit = init + small_chunk - (id < extras ? 0 : 1);
610
pr->u.p.parm1 = (id == nproc - 1);
611
}
612
} else {
613
if (tc > 0) {
614
init = 0;
615
limit = tc - 1;
616
pr->u.p.parm1 = TRUE;
617
} else {
618
// zero trip count
619
pr->u.p.count = 1; /* means no more chunks to execute */
620
pr->u.p.parm1 = FALSE;
621
break;
622
}
623
}
624
#if USE_ITT_BUILD
625
// Calculate chunk for metadata report
626
if (itt_need_metadata_reporting)
627
if (cur_chunk)
628
*cur_chunk = limit - init + 1;
629
#endif
630
if (st == 1) {
631
pr->u.p.lb = lb + init;
632
pr->u.p.ub = lb + limit;
633
} else {
634
// calculated upper bound, "ub" is user-defined upper bound
635
T ub_tmp = lb + limit * st;
636
pr->u.p.lb = lb + init * st;
637
// adjust upper bound to "ub" if needed, so that MS lastprivate will match
638
// it exactly
639
if (st > 0) {
640
pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641
} else {
642
pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643
}
644
}
645
if (pr->flags.ordered) {
646
pr->u.p.ordered_lower = init;
647
pr->u.p.ordered_upper = limit;
648
}
649
break;
650
} // case
651
case kmp_sch_static_balanced_chunked: {
652
// similar to balanced, but chunk adjusted to multiple of simd width
653
T nth = nproc;
654
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655
" -> falling-through to static_greedy\n",
656
gtid));
657
schedule = kmp_sch_static_greedy;
658
if (nth > 1)
659
pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660
else
661
pr->u.p.parm1 = tc;
662
break;
663
} // case
664
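// Worked example (illustrative) for the rounding in kmp_sch_static_balanced_chunked
// above: tc = 1000, nth = 8 and chunk (simd width) = 8 give (1000 + 7) / 8 = 125,
// rounded up to the next multiple of the power-of-two simd width:
// (125 + 7) & ~7 = 128 iterations per thread.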
case kmp_sch_guided_simd:
665
case kmp_sch_guided_iterative_chunked: {
666
KD_TRACE(
667
100,
668
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669
" case\n",
670
gtid));
671
672
if (nproc > 1) {
673
if ((2L * chunk + 1) * nproc >= tc) {
674
/* chunk size too large, switch to dynamic */
675
schedule = kmp_sch_dynamic_chunked;
676
goto dynamic_init;
677
} else {
678
// when remaining iterations drop below parm2, switch to dynamic
679
pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680
*(double *)&pr->u.p.parm3 =
681
guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682
}
683
} else {
684
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685
"kmp_sch_static_greedy\n",
686
gtid));
687
schedule = kmp_sch_static_greedy;
688
/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689
KD_TRACE(
690
100,
691
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692
gtid));
693
pr->u.p.parm1 = tc;
694
} // if
695
} // case
696
break;
697
case kmp_sch_guided_analytical_chunked: {
698
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699
"kmp_sch_guided_analytical_chunked case\n",
700
gtid));
701
702
if (nproc > 1) {
703
if ((2L * chunk + 1) * nproc >= tc) {
704
/* chunk size too large, switch to dynamic */
705
schedule = kmp_sch_dynamic_chunked;
706
goto dynamic_init;
707
} else {
708
/* commonly used term: (2 nproc - 1)/(2 nproc) */
709
DBL x;
710
711
#if KMP_USE_X87CONTROL
712
/* Linux* OS already has 64-bit computation by default for long double,
713
and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714
Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715
instead of the default 53-bit. Even though long double doesn't work
716
on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717
expected to impact the correctness of the algorithm, but this has not
718
been mathematically proven. */
719
// save original FPCW and set precision to 64-bit, as
720
// Windows* OS on IA-32 architecture defaults to 53-bit
721
unsigned int oldFpcw = _control87(0, 0);
722
_control87(_PC_64, _MCW_PC); // 0,0x30000
723
#endif
724
/* value used for comparison in solver for cross-over point */
725
KMP_ASSERT(tc > 0);
726
long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727
728
/* crossover point--chunk indexes equal to or greater than
729
this point switch to dynamic-style scheduling */
730
UT cross;
731
732
/* commonly used term: (2 nproc - 1)/(2 nproc) */
733
x = 1.0 - 0.5 / (double)nproc;
734
735
#ifdef KMP_DEBUG
736
{ // test natural alignment
737
struct _test_a {
738
char a;
739
union {
740
char b;
741
DBL d;
742
};
743
} t;
744
ptrdiff_t natural_alignment =
745
(ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746
//__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747
// long)natural_alignment );
748
KMP_DEBUG_ASSERT(
749
(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750
}
751
#endif // KMP_DEBUG
752
753
/* save the term in thread private dispatch structure */
754
*(DBL *)&pr->u.p.parm3 = x;
755
756
/* solve for the crossover point to the nearest integer i for which C_i
757
<= chunk */
758
{
759
UT left, right, mid;
760
long double p;
761
762
/* estimate initial upper and lower bound */
763
764
/* doesn't matter what value right is as long as it is positive, but
765
it affects performance of the solver */
766
right = 229;
767
p = __kmp_pow<UT>(x, right);
768
if (p > target) {
769
do {
770
p *= p;
771
right <<= 1;
772
} while (p > target && right < (1 << 27));
773
/* lower bound is previous (failed) estimate of upper bound */
774
left = right >> 1;
775
} else {
776
left = 0;
777
}
778
779
/* bisection root-finding method */
780
while (left + 1 < right) {
781
mid = (left + right) / 2;
782
if (__kmp_pow<UT>(x, mid) > target) {
783
left = mid;
784
} else {
785
right = mid;
786
}
787
} // while
788
cross = right;
789
}
790
/* assert sanity of computed crossover point */
791
KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792
__kmp_pow<UT>(x, cross) <= target);
793
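/* Worked example (illustrative): nproc = 4, chunk = 10, tc = 10000 give
   x = 1 - 0.5/4 = 0.875 and target = (2*10 + 1) * 4 / 10000 = 0.0084; the
   bisection settles on cross = 36, since 0.875^35 ~ 0.0093 > 0.0084 while
   0.875^36 ~ 0.0082 <= 0.0084. */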
794
/* save the crossover point in thread private dispatch structure */
795
pr->u.p.parm2 = cross;
796
797
// C75803
798
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800
#else
801
#define GUIDED_ANALYTICAL_WORKAROUND (x)
802
#endif
803
/* dynamic-style scheduling offset */
804
pr->u.p.count = tc -
805
__kmp_dispatch_guided_remaining(
806
tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807
cross * chunk;
808
#if KMP_USE_X87CONTROL
809
// restore FPCW
810
_control87(oldFpcw, _MCW_PC);
811
#endif
812
} // if
813
} else {
814
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815
"kmp_sch_static_greedy\n",
816
gtid));
817
schedule = kmp_sch_static_greedy;
818
/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819
pr->u.p.parm1 = tc;
820
} // if
821
} // case
822
break;
823
case kmp_sch_static_greedy:
824
KD_TRACE(
825
100,
826
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827
gtid));
828
pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829
break;
830
case kmp_sch_static_chunked:
831
case kmp_sch_dynamic_chunked:
832
dynamic_init:
833
if (tc == 0)
834
break;
835
if (pr->u.p.parm1 <= 0)
836
pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837
else if (pr->u.p.parm1 > tc)
838
pr->u.p.parm1 = tc;
839
// Store the total number of chunks to prevent integer overflow during
840
// bounds calculations in the get next chunk routine.
841
pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842
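// Worked example (illustrative): tc = 10 iterations with chunk parm1 = 3
// yield parm2 = 10 / 3 + 1 = 4 chunks (sizes 3, 3, 3, 1).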
KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843
"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844
gtid));
845
break;
846
case kmp_sch_trapezoidal: {
847
/* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848
849
T parm1, parm2, parm3, parm4;
850
KD_TRACE(100,
851
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852
gtid));
853
854
parm1 = chunk;
855
856
/* F : size of the first cycle */
857
parm2 = (tc / (2 * nproc));
858
859
if (parm2 < 1) {
860
parm2 = 1;
861
}
862
863
/* L : size of the last cycle. Make sure the last cycle is not larger
864
than the first cycle. */
865
if (parm1 < 1) {
866
parm1 = 1;
867
} else if (parm1 > parm2) {
868
parm1 = parm2;
869
}
870
871
/* N : number of cycles */
872
parm3 = (parm2 + parm1);
873
parm3 = (2 * tc + parm3 - 1) / parm3;
874
875
if (parm3 < 2) {
876
parm3 = 2;
877
}
878
879
/* sigma : decreasing incr of the trapezoid */
880
parm4 = (parm3 - 1);
881
parm4 = (parm2 - parm1) / parm4;
882
883
// pointless check, because parm4 >= 0 always
884
// if ( parm4 < 0 ) {
885
// parm4 = 0;
886
//}
887
888
pr->u.p.parm1 = parm1;
889
pr->u.p.parm2 = parm2;
890
pr->u.p.parm3 = parm3;
891
pr->u.p.parm4 = parm4;
892
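/* Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 give
   parm2 (first chunk) = 1000 / 8 = 125, parm1 = 1,
   parm3 (number of chunks) = (2*1000 + 125) / 126 = 16, and
   parm4 (decrement) = (125 - 1) / 15 = 8, i.e. chunk sizes
   125, 117, 109, ... covering at least the 1000 iterations. */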
} // case
893
break;
894
895
default: {
896
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897
KMP_HNT(GetNewerLibrary), // Hint
898
__kmp_msg_null // Variadic argument list terminator
899
);
900
} break;
901
} // switch
902
pr->schedule = schedule;
903
}
904
905
#if KMP_USE_HIER_SCHED
906
template <typename T>
907
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908
typename traits_t<T>::signed_t st);
909
template <>
910
inline void
911
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912
kmp_int32 ub, kmp_int32 st) {
913
__kmp_dispatch_init_hierarchy<kmp_int32>(
914
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916
}
917
template <>
918
inline void
919
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920
kmp_uint32 ub, kmp_int32 st) {
921
__kmp_dispatch_init_hierarchy<kmp_uint32>(
922
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923
__kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924
}
925
template <>
926
inline void
927
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928
kmp_int64 ub, kmp_int64 st) {
929
__kmp_dispatch_init_hierarchy<kmp_int64>(
930
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932
}
933
template <>
934
inline void
935
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936
kmp_uint64 ub, kmp_int64 st) {
937
__kmp_dispatch_init_hierarchy<kmp_uint64>(
938
loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939
__kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940
}
941
942
// free all the hierarchy scheduling memory associated with the team
943
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944
int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945
for (int i = 0; i < num_disp_buff; ++i) {
946
// type does not matter here so use kmp_int32
947
auto sh =
948
reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949
&team->t.t_disp_buffer[i]);
950
if (sh->hier) {
951
sh->hier->deallocate();
952
__kmp_free(sh->hier);
953
}
954
}
955
}
956
#endif
957
958
// UT - unsigned flavor of T, ST - signed flavor of T,
959
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960
template <typename T>
961
static void
962
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963
T ub, typename traits_t<T>::signed_t st,
964
typename traits_t<T>::signed_t chunk, int push_ws) {
965
typedef typename traits_t<T>::unsigned_t UT;
966
967
int active;
968
kmp_info_t *th;
969
kmp_team_t *team;
970
kmp_uint32 my_buffer_index;
971
dispatch_private_info_template<T> *pr;
972
dispatch_shared_info_template<T> volatile *sh;
973
974
KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975
sizeof(dispatch_private_info));
976
KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977
sizeof(dispatch_shared_info));
978
__kmp_assert_valid_gtid(gtid);
979
980
if (!TCR_4(__kmp_init_parallel))
981
__kmp_parallel_initialize();
982
983
__kmp_resume_if_soft_paused();
984
985
#if INCLUDE_SSC_MARKS
986
SSC_MARK_DISPATCH_INIT();
987
#endif
988
#ifdef KMP_DEBUG
989
typedef typename traits_t<T>::signed_t ST;
990
{
991
char *buff;
992
// create format specifiers before the debug output
993
buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994
"chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995
traits_t<ST>::spec, traits_t<T>::spec,
996
traits_t<T>::spec, traits_t<ST>::spec);
997
KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998
__kmp_str_free(&buff);
999
}
1000
#endif
1001
/* setup data */
1002
th = __kmp_threads[gtid];
1003
team = th->th.th_team;
1004
active = !team->t.t_serialized;
1005
th->th.th_ident = loc;
1006
1007
// Any half-decent optimizer will remove this test when the blocks are empty
1008
// since the macros expand to nothing
1009
// when statistics are disabled.
1010
if (schedule == __kmp_static) {
1011
KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012
} else {
1013
KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014
}
1015
1016
#if KMP_USE_HIER_SCHED
1017
// Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE environment variable
1018
// Hierarchical scheduling does not work with ordered, so if ordered is
1019
// detected, then revert back to threaded scheduling.
1020
bool ordered;
1021
enum sched_type my_sched = schedule;
1022
my_buffer_index = th->th.th_dispatch->th_disp_index;
1023
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024
&th->th.th_dispatch
1025
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026
my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027
if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028
my_sched =
1029
(enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030
ordered = (kmp_ord_lower & my_sched);
1031
if (pr->flags.use_hier) {
1032
if (ordered) {
1033
KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
1034
"Disabling hierarchical scheduling.\n",
1035
gtid));
1036
pr->flags.use_hier = FALSE;
1037
}
1038
}
1039
if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040
// Don't use hierarchical for ordered parallel loops and don't
1041
// use the runtime hierarchy if one was specified in the program
1042
if (!ordered && !pr->flags.use_hier)
1043
__kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044
}
1045
#endif // KMP_USE_HIER_SCHED
1046
1047
#if USE_ITT_BUILD
1048
kmp_uint64 cur_chunk = chunk;
1049
int itt_need_metadata_reporting =
1050
__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051
KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052
team->t.t_active_level == 1;
1053
#endif
1054
if (!active) {
1055
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056
th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057
} else {
1058
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
1061
my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063
/* What happens when the number of threads changes? Does the buffer need resizing? */
1064
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065
&th->th.th_dispatch
1066
->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069
KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070
my_buffer_index));
1071
if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072
KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073
" sh->buffer_index:%d\n",
1074
gtid, my_buffer_index, sh->buffer_index));
1075
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076
__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077
// Note: KMP_WAIT() cannot be used here: buffer index and
1078
// my_buffer_index are *always* 32-bit integers.
1079
KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080
"sh->buffer_index:%d\n",
1081
gtid, my_buffer_index, sh->buffer_index));
1082
}
1083
}
1084
1085
__kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086
#if USE_ITT_BUILD
1087
&cur_chunk,
1088
#endif
1089
chunk, (T)th->th.th_team_nproc,
1090
(T)th->th.th_info.ds.ds_tid);
1091
if (active) {
1092
if (pr->flags.ordered == 0) {
1093
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095
} else {
1096
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098
}
1099
th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100
th->th.th_dispatch->th_dispatch_sh_current =
1101
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1102
#if USE_ITT_BUILD
1103
if (pr->flags.ordered) {
1104
__kmp_itt_ordered_init(gtid);
1105
}
1106
// Report loop metadata
1107
if (itt_need_metadata_reporting) {
1108
// Only report metadata by primary thread of active team at level 1
1109
kmp_uint64 schedtype = 0;
1110
switch (schedule) {
1111
case kmp_sch_static_chunked:
1112
case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113
break;
1114
case kmp_sch_static_greedy:
1115
cur_chunk = pr->u.p.parm1;
1116
break;
1117
case kmp_sch_dynamic_chunked:
1118
schedtype = 1;
1119
break;
1120
case kmp_sch_guided_iterative_chunked:
1121
case kmp_sch_guided_analytical_chunked:
1122
case kmp_sch_guided_simd:
1123
schedtype = 2;
1124
break;
1125
default:
1126
// Should we put this case under "static"?
1127
// case kmp_sch_static_steal:
1128
schedtype = 3;
1129
break;
1130
}
1131
__kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132
}
1133
#if KMP_USE_HIER_SCHED
1134
if (pr->flags.use_hier) {
1135
pr->u.p.count = 0;
1136
pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137
}
1138
#endif // KMP_USER_HIER_SCHED
1139
#endif /* USE_ITT_BUILD */
1140
}
1141
1142
#ifdef KMP_DEBUG
1143
{
1144
char *buff;
1145
// create format specifiers before the debug output
1146
buff = __kmp_str_format(
1147
"__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148
"lb:%%%s ub:%%%s"
1149
" st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150
" parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151
traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152
traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154
traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155
KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156
pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157
pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158
pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159
__kmp_str_free(&buff);
1160
}
1161
#endif
1162
#if OMPT_SUPPORT && OMPT_OPTIONAL
1163
if (ompt_enabled.ompt_callback_work) {
1164
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166
ompt_callbacks.ompt_callback(ompt_callback_work)(
1167
ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168
&(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169
OMPT_LOAD_RETURN_ADDRESS(gtid));
1170
}
1171
#endif
1172
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1173
}
1174
1175
/* For ordered loops, either __kmp_dispatch_finish() should be called after
1176
* every iteration, or __kmp_dispatch_finish_chunk() should be called after
1177
* every chunk of iterations. If the ordered section(s) were not executed
1178
* for this iteration (or every iteration in this chunk), we need to set the
1179
* ordered iteration counters so that the next thread can proceed. */
1180
template <typename UT>
1181
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182
typedef typename traits_t<UT>::signed_t ST;
1183
__kmp_assert_valid_gtid(gtid);
1184
kmp_info_t *th = __kmp_threads[gtid];
1185
1186
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1187
if (!th->th.th_team->t.t_serialized) {
1188
1189
dispatch_private_info_template<UT> *pr =
1190
reinterpret_cast<dispatch_private_info_template<UT> *>(
1191
th->th.th_dispatch->th_dispatch_pr_current);
1192
dispatch_shared_info_template<UT> volatile *sh =
1193
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1194
th->th.th_dispatch->th_dispatch_sh_current);
1195
KMP_DEBUG_ASSERT(pr);
1196
KMP_DEBUG_ASSERT(sh);
1197
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1199
1200
if (pr->ordered_bumped) {
1201
KD_TRACE(
1202
1000,
1203
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1204
gtid));
1205
pr->ordered_bumped = 0;
1206
} else {
1207
UT lower = pr->u.p.ordered_lower;
1208
1209
#ifdef KMP_DEBUG
1210
{
1211
char *buff;
1212
// create format specifiers before the debug output
1213
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1214
"ordered_iteration:%%%s lower:%%%s\n",
1215
traits_t<UT>::spec, traits_t<UT>::spec);
1216
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217
__kmp_str_free(&buff);
1218
}
1219
#endif
1220
1221
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1222
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1223
KMP_MB(); /* is this necessary? */
1224
#ifdef KMP_DEBUG
1225
{
1226
char *buff;
1227
// create format specifiers before the debug output
1228
buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1229
"ordered_iteration:%%%s lower:%%%s\n",
1230
traits_t<UT>::spec, traits_t<UT>::spec);
1231
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232
__kmp_str_free(&buff);
1233
}
1234
#endif
1235
1236
test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1237
} // if
1238
} // if
1239
KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1240
}
1241
1242
#ifdef KMP_GOMP_COMPAT
1243
1244
template <typename UT>
1245
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1246
typedef typename traits_t<UT>::signed_t ST;
1247
__kmp_assert_valid_gtid(gtid);
1248
kmp_info_t *th = __kmp_threads[gtid];
1249
1250
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1251
if (!th->th.th_team->t.t_serialized) {
1252
dispatch_private_info_template<UT> *pr =
1253
reinterpret_cast<dispatch_private_info_template<UT> *>(
1254
th->th.th_dispatch->th_dispatch_pr_current);
1255
dispatch_shared_info_template<UT> volatile *sh =
1256
reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1257
th->th.th_dispatch->th_dispatch_sh_current);
1258
KMP_DEBUG_ASSERT(pr);
1259
KMP_DEBUG_ASSERT(sh);
1260
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1261
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1262
1263
UT lower = pr->u.p.ordered_lower;
1264
UT upper = pr->u.p.ordered_upper;
1265
UT inc = upper - lower + 1;
1266
1267
if (pr->ordered_bumped == inc) {
1268
KD_TRACE(
1269
1000,
1270
("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1271
gtid));
1272
pr->ordered_bumped = 0;
1273
} else {
1274
inc -= pr->ordered_bumped;
1275
1276
#ifdef KMP_DEBUG
1277
{
1278
char *buff;
1279
// create format specifiers before the debug output
1280
buff = __kmp_str_format(
1281
"__kmp_dispatch_finish_chunk: T#%%d before wait: "
1282
"ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1283
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1284
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1285
__kmp_str_free(&buff);
1286
}
1287
#endif
1288
1289
__kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1290
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1291
1292
KMP_MB(); /* is this necessary? */
1293
KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1294
"ordered_bumped to zero\n",
1295
gtid));
1296
pr->ordered_bumped = 0;
1297
//!!!!! TODO check if the inc should be unsigned, or signed???
1298
#ifdef KMP_DEBUG
1299
{
1300
char *buff;
1301
// create format specifiers before the debug output
1302
buff = __kmp_str_format(
1303
"__kmp_dispatch_finish_chunk: T#%%d after wait: "
1304
"ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1305
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1306
traits_t<UT>::spec);
1307
KD_TRACE(1000,
1308
(buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1309
__kmp_str_free(&buff);
1310
}
1311
#endif
1312
1313
test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1314
}
1315
// }
1316
}
1317
KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1318
}
1319
1320
#endif /* KMP_GOMP_COMPAT */
1321
1322
template <typename T>
1323
int __kmp_dispatch_next_algorithm(int gtid,
1324
dispatch_private_info_template<T> *pr,
1325
dispatch_shared_info_template<T> volatile *sh,
1326
kmp_int32 *p_last, T *p_lb, T *p_ub,
1327
typename traits_t<T>::signed_t *p_st, T nproc,
1328
T tid) {
1329
typedef typename traits_t<T>::unsigned_t UT;
1330
typedef typename traits_t<T>::signed_t ST;
1331
typedef typename traits_t<T>::floating_t DBL;
1332
int status = 0;
1333
bool last = false;
1334
T start;
1335
ST incr;
1336
UT limit, trip, init;
1337
kmp_info_t *th = __kmp_threads[gtid];
1338
kmp_team_t *team = th->th.th_team;
1339
1340
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1341
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1342
KMP_DEBUG_ASSERT(pr);
1343
KMP_DEBUG_ASSERT(sh);
1344
KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1345
#ifdef KMP_DEBUG
1346
{
1347
char *buff;
1348
// create format specifiers before the debug output
1349
buff =
1350
__kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351
"sh:%%p nproc:%%%s tid:%%%s\n",
1352
traits_t<T>::spec, traits_t<T>::spec);
1353
KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1354
__kmp_str_free(&buff);
1355
}
1356
#endif
1357
1358
// zero trip count
1359
if (pr->u.p.tc == 0) {
1360
KD_TRACE(10,
1361
("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1362
"zero status:%d\n",
1363
gtid, status));
1364
return 0;
1365
}
1366
1367
switch (pr->schedule) {
1368
#if KMP_STATIC_STEAL_ENABLED
1369
case kmp_sch_static_steal: {
1370
T chunk = pr->u.p.parm1;
1371
UT nchunks = pr->u.p.parm2;
1372
KD_TRACE(100,
1373
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1374
gtid));
1375
1376
trip = pr->u.p.tc - 1;
1377
1378
if (traits_t<T>::type_size > 4) {
1379
// use lock for 8-byte induction variable.
1380
// TODO (optional): check presence and use 16-byte CAS
1381
kmp_lock_t *lck = pr->u.p.steal_lock;
1382
KMP_DEBUG_ASSERT(lck != NULL);
1383
if (pr->u.p.count < (UT)pr->u.p.ub) {
1384
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385
__kmp_acquire_lock(lck, gtid);
1386
// try to get own chunk of iterations
1387
init = (pr->u.p.count)++;
1388
status = (init < (UT)pr->u.p.ub);
1389
__kmp_release_lock(lck, gtid);
1390
} else {
1391
status = 0; // no own chunks
1392
}
1393
if (!status) { // try to steal
1394
kmp_lock_t *lckv; // victim buffer's lock
1395
T while_limit = pr->u.p.parm3;
1396
T while_index = 0;
1397
int idx = (th->th.th_dispatch->th_disp_index - 1) %
1398
__kmp_dispatch_num_buffers; // current loop index
1399
// note: victim thread can potentially execute another loop
1400
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1401
while ((!status) && (while_limit != ++while_index)) {
1402
dispatch_private_info_template<T> *v;
1403
T remaining;
1404
T victimId = pr->u.p.parm4;
1405
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1406
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1407
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1408
KMP_DEBUG_ASSERT(v);
1409
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410
oldVictimId != victimId) {
1411
victimId = (victimId + 1) % nproc;
1412
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1413
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1414
KMP_DEBUG_ASSERT(v);
1415
}
1416
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1417
continue; // try once more (nproc attempts in total)
1418
}
1419
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420
kmp_uint32 old = UNUSED;
1421
// try to steal whole range from inactive victim
1422
status = v->steal_flag.compare_exchange_strong(old, THIEF);
1423
if (status) {
1424
// initialize self buffer with victim's whole range of chunks
1425
T id = victimId;
1426
T small_chunk = 0, extras = 0, p_extra = 0;
1427
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1428
init, small_chunk, extras,
1429
p_extra);
1430
__kmp_acquire_lock(lck, gtid);
1431
pr->u.p.count = init + 1; // exclude one we execute immediately
1432
pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1433
__kmp_release_lock(lck, gtid);
1434
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1435
// no need to reinitialize other thread invariants: lb, st, etc.
1436
#ifdef KMP_DEBUG
1437
{
1438
char *buff;
1439
// create format specifiers before the debug output
1440
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1441
"stolen chunks from T#%%d, "
1442
"count:%%%s ub:%%%s\n",
1443
traits_t<UT>::spec, traits_t<T>::spec);
1444
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1445
__kmp_str_free(&buff);
1446
}
1447
#endif
1448
// activate non-empty buffer and let others steal from us
1449
if (pr->u.p.count < (UT)pr->u.p.ub)
1450
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1451
break;
1452
}
1453
}
1454
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1455
v->u.p.count >= (UT)v->u.p.ub) {
1456
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1457
continue; // no chunks to steal, try next victim
1458
}
1459
lckv = v->u.p.steal_lock;
1460
KMP_ASSERT(lckv != NULL);
1461
__kmp_acquire_lock(lckv, gtid);
1462
limit = v->u.p.ub; // keep initial ub
1463
if (v->u.p.count >= limit) {
1464
__kmp_release_lock(lckv, gtid);
1465
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1466
continue; // no chunks to steal, try next victim
1467
}
1468
1469
// stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1470
// TODO: is this heuristic good enough?
1471
remaining = limit - v->u.p.count;
1472
if (remaining > 7) {
1473
// steal 1/4 of remaining
1474
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1475
init = (v->u.p.ub -= (remaining >> 2));
1476
} else {
1477
// steal 1 chunk of 1..7 remaining
1478
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1479
init = (v->u.p.ub -= 1);
1480
}
1481
__kmp_release_lock(lckv, gtid);
1482
#ifdef KMP_DEBUG
1483
{
1484
char *buff;
1485
// create format specifiers before the debug output
1486
buff = __kmp_str_format(
1487
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488
"count:%%%s ub:%%%s\n",
1489
traits_t<UT>::spec, traits_t<UT>::spec);
1490
KD_TRACE(10, (buff, gtid, victimId, init, limit));
1491
__kmp_str_free(&buff);
1492
}
1493
#endif
1494
KMP_DEBUG_ASSERT(init + 1 <= limit);
1495
pr->u.p.parm4 = victimId; // remember victim to steal from
1496
status = 1;
1497
// now update own count and ub with stolen range excluding init chunk
1498
__kmp_acquire_lock(lck, gtid);
1499
pr->u.p.count = init + 1;
1500
pr->u.p.ub = limit;
1501
__kmp_release_lock(lck, gtid);
1502
// activate non-empty buffer and let others steal from us
1503
if (init + 1 < limit)
1504
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1505
} // while (search for victim)
1506
} // if (try to find victim and steal)
1507
} else {
1508
// 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1509
// as all operations on pair (count, ub) must be done atomically
1510
typedef union {
1511
struct {
1512
UT count;
1513
T ub;
1514
} p;
1515
kmp_int64 b;
1516
} union_i4;
1517
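// Illustrative walk-through: starting from {count = 5, ub = 10}, the owner
// copies the pair into vnew, bumps count to 6, and publishes it with a single
// 8-byte CAS below, so a concurrent thief shrinking ub can never observe a
// half-written (count, ub) pair.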
union_i4 vold, vnew;
1518
if (pr->u.p.count < (UT)pr->u.p.ub) {
1519
KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1521
vnew.b = vold.b;
1522
vnew.p.count++; // get chunk from head of self range
1523
while (!KMP_COMPARE_AND_STORE_REL64(
1524
(volatile kmp_int64 *)&pr->u.p.count,
1525
*VOLATILE_CAST(kmp_int64 *) & vold.b,
1526
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1527
KMP_CPU_PAUSE();
1528
vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1529
vnew.b = vold.b;
1530
vnew.p.count++;
1531
}
1532
init = vold.p.count;
1533
status = (init < (UT)vold.p.ub);
1534
} else {
1535
status = 0; // no own chunks
1536
}
1537
if (!status) { // try to steal
1538
T while_limit = pr->u.p.parm3;
1539
T while_index = 0;
1540
int idx = (th->th.th_dispatch->th_disp_index - 1) %
1541
__kmp_dispatch_num_buffers; // current loop index
1542
// note: victim thread can potentially execute another loop
1543
KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1544
while ((!status) && (while_limit != ++while_index)) {
1545
dispatch_private_info_template<T> *v;
1546
T remaining;
1547
T victimId = pr->u.p.parm4;
1548
T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1549
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1550
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1551
KMP_DEBUG_ASSERT(v);
1552
while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553
oldVictimId != victimId) {
1554
victimId = (victimId + 1) % nproc;
1555
v = reinterpret_cast<dispatch_private_info_template<T> *>(
1556
&team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1557
KMP_DEBUG_ASSERT(v);
1558
}
1559
if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1560
continue; // try once more (nproc attempts in total)
1561
}
1562
if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563
kmp_uint32 old = UNUSED;
1564
// try to steal whole range from inactive victim
1565
status = v->steal_flag.compare_exchange_strong(old, THIEF);
1566
if (status) {
1567
// initialize self buffer with victim's whole range of chunks
1568
T id = victimId;
1569
T small_chunk = 0, extras = 0, p_extra = 0;
1570
__kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1571
init, small_chunk, extras,
1572
p_extra);
1573
vnew.p.count = init + 1;
1574
vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1575
// write pair (count, ub) at once atomically
1576
#if KMP_ARCH_X86
1577
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1578
#else
1579
*(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1580
#endif
1581
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1582
// no need to initialize other thread invariants: lb, st, etc.
1583
#ifdef KMP_DEBUG
1584
{
1585
char *buff;
1586
// create format specifiers before the debug output
1587
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1588
"stolen chunks from T#%%d, "
1589
"count:%%%s ub:%%%s\n",
1590
traits_t<UT>::spec, traits_t<T>::spec);
1591
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1592
__kmp_str_free(&buff);
1593
}
1594
#endif
1595
// activate non-empty buffer and let others steal from us
1596
if (pr->u.p.count < (UT)pr->u.p.ub)
1597
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1598
break;
1599
}
1600
}
1601
while (1) { // CAS loop with check if victim still has enough chunks
1602
// many threads may be stealing concurrently from same victim
1603
vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1604
if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1605
vold.p.count >= (UT)vold.p.ub) {
1606
pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1607
break; // no chunks to steal, try next victim
1608
}
1609
vnew.b = vold.b;
1610
remaining = vold.p.ub - vold.p.count;
1611
// try to steal 1/4 of remaining
1612
// TODO: is this heuristic good enough?
1613
if (remaining > 7) {
1614
vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1615
} else {
1616
vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1617
}
1618
KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1619
if (KMP_COMPARE_AND_STORE_REL64(
1620
(volatile kmp_int64 *)&v->u.p.count,
1621
*VOLATILE_CAST(kmp_int64 *) & vold.b,
1622
*VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1623
// stealing succeeded
1624
#ifdef KMP_DEBUG
1625
{
1626
char *buff;
1627
// create format specifiers before the debug output
1628
buff = __kmp_str_format(
1629
"__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630
"count:%%%s ub:%%%s\n",
1631
traits_t<T>::spec, traits_t<T>::spec);
1632
KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633
__kmp_str_free(&buff);
1634
}
1635
#endif
1636
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637
vold.p.ub - vnew.p.ub);
1638
status = 1;
1639
pr->u.p.parm4 = victimId; // keep victim id
1640
// now update own count and ub
1641
init = vnew.p.ub;
1642
vold.p.count = init + 1;
1643
#if KMP_ARCH_X86
1644
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1645
#else
1646
*(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1647
#endif
1648
// activate non-empty buffer and let others steal from us
1649
if (vold.p.count < (UT)vold.p.ub)
1650
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1651
break;
1652
} // if (check CAS result)
1653
KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
} // while (try to steal from particular victim)
1655
} // while (search for victim)
1656
} // if (try to find victim and steal)
1657
} // if (4-byte induction variable)
1658
if (!status) {
1659
*p_lb = 0;
1660
*p_ub = 0;
1661
if (p_st != NULL)
1662
*p_st = 0;
1663
} else {
1664
start = pr->u.p.lb;
1665
init *= chunk;
1666
limit = chunk + init - 1;
1667
incr = pr->u.p.st;
1668
KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1669
1670
KMP_DEBUG_ASSERT(init <= trip);
1671
// keep track of done chunks for possible early exit from stealing
1672
// TODO: count executed chunks locally with rare update of shared location
1673
// test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1674
if ((last = (limit >= trip)) != 0)
1675
limit = trip;
1676
if (p_st != NULL)
1677
*p_st = incr;
1678
1679
if (incr == 1) {
1680
*p_lb = start + init;
1681
*p_ub = start + limit;
1682
} else {
1683
*p_lb = start + init * incr;
1684
*p_ub = start + limit * incr;
1685
}
1686
} // if
1687
break;
1688
} // case
1689
#endif // KMP_STATIC_STEAL_ENABLED
1690
case kmp_sch_static_balanced: {
1691
KD_TRACE(
1692
10,
1693
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1694
gtid));
1695
/* check if thread has any iteration to do */
1696
if ((status = !pr->u.p.count) != 0) {
1697
pr->u.p.count = 1;
1698
*p_lb = pr->u.p.lb;
1699
*p_ub = pr->u.p.ub;
1700
last = (pr->u.p.parm1 != 0);
1701
if (p_st != NULL)
1702
*p_st = pr->u.p.st;
1703
} else { /* no iterations to do */
1704
pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1705
}
1706
} // case
1707
break;
1708
case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1709
merged here */
1710
case kmp_sch_static_chunked: {
1711
T parm1;
1712
1713
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1714
"kmp_sch_static_[affinity|chunked] case\n",
1715
gtid));
1716
parm1 = pr->u.p.parm1;
1717
1718
trip = pr->u.p.tc - 1;
1719
init = parm1 * (pr->u.p.count + tid);
1720
1721
if ((status = (init <= trip)) != 0) {
1722
start = pr->u.p.lb;
1723
incr = pr->u.p.st;
1724
limit = parm1 + init - 1;
1725
1726
if ((last = (limit >= trip)) != 0)
1727
limit = trip;
1728
1729
if (p_st != NULL)
1730
*p_st = incr;
1731
1732
pr->u.p.count += nproc;
1733
1734
if (incr == 1) {
1735
*p_lb = start + init;
1736
*p_ub = start + limit;
1737
} else {
1738
*p_lb = start + init * incr;
1739
*p_ub = start + limit * incr;
1740
}
1741
1742
if (pr->flags.ordered) {
1743
pr->u.p.ordered_lower = init;
1744
pr->u.p.ordered_upper = limit;
1745
} // if
1746
} // if
1747
} // case
1748
break;
1749
1750
case kmp_sch_dynamic_chunked: {
1751
UT chunk_number;
1752
UT chunk_size = pr->u.p.parm1;
1753
UT nchunks = pr->u.p.parm2;
1754
1755
KD_TRACE(
1756
100,
1757
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1758
gtid));
1759
1760
chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1761
status = (chunk_number < nchunks);
1762
if (!status) {
1763
*p_lb = 0;
1764
*p_ub = 0;
1765
if (p_st != NULL)
1766
*p_st = 0;
1767
} else {
1768
init = chunk_size * chunk_number;
1769
trip = pr->u.p.tc - 1;
1770
start = pr->u.p.lb;
1771
incr = pr->u.p.st;
1772
1773
if ((last = (trip - init < (UT)chunk_size)))
1774
limit = trip;
1775
else
1776
limit = chunk_size + init - 1;
1777
1778
if (p_st != NULL)
1779
*p_st = incr;
1780
1781
if (incr == 1) {
1782
*p_lb = start + init;
1783
*p_ub = start + limit;
1784
} else {
1785
*p_lb = start + init * incr;
1786
*p_ub = start + limit * incr;
1787
}
1788
1789
if (pr->flags.ordered) {
1790
pr->u.p.ordered_lower = init;
1791
pr->u.p.ordered_upper = limit;
1792
} // if
1793
} // if
1794
} // case
1795
break;
1796
1797
case kmp_sch_guided_iterative_chunked: {
1798
T chunkspec = pr->u.p.parm1;
1799
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1800
"iterative case\n",
1801
gtid));
1802
trip = pr->u.p.tc;
1803
// Start atomic part of calculations
1804
while (1) {
1805
ST remaining; // signed, because can be < 0
1806
init = sh->u.s.iteration; // shared value
1807
remaining = trip - init;
1808
if (remaining <= 0) { // AC: need to compare with 0 first
1809
// nothing to do, don't try atomic op
1810
status = 0;
1811
break;
1812
}
1813
if ((T)remaining <
1814
pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1815
// use dynamic-style schedule
1816
// atomically increment iterations, get old value
1817
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1818
(ST)chunkspec);
1819
remaining = trip - init;
1820
if (remaining <= 0) {
1821
status = 0; // all iterations got by other threads
1822
} else {
1823
// got some iterations to work on
1824
status = 1;
1825
if ((T)remaining > chunkspec) {
1826
limit = init + chunkspec - 1;
1827
} else {
1828
last = true; // the last chunk
1829
limit = init + remaining - 1;
1830
} // if
1831
} // if
1832
break;
1833
} // if
1834
limit = init + (UT)((double)remaining *
1835
*(double *)&pr->u.p.parm3); // divide by K*nproc
1836
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1837
(ST)init, (ST)limit)) {
1838
// CAS was successful, chunk obtained
1839
status = 1;
1840
--limit;
1841
break;
1842
} // if
1843
} // while
1844
if (status != 0) {
1845
start = pr->u.p.lb;
1846
incr = pr->u.p.st;
1847
if (p_st != NULL)
1848
*p_st = incr;
1849
*p_lb = start + init * incr;
1850
*p_ub = start + limit * incr;
1851
if (pr->flags.ordered) {
1852
pr->u.p.ordered_lower = init;
1853
pr->u.p.ordered_upper = limit;
1854
} // if
1855
} else {
1856
*p_lb = 0;
1857
*p_ub = 0;
1858
if (p_st != NULL)
1859
*p_st = 0;
1860
} // if
1861
} // case
1862
break;
1863
1864
case kmp_sch_guided_simd: {
1865
// same as the iterative case, but the current chunk is adjusted to be a
// multiple of the given chunk
T chunk = pr->u.p.parm1;
1868
KD_TRACE(100,
1869
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1870
gtid));
1871
trip = pr->u.p.tc;
1872
// Start atomic part of calculations
1873
while (1) {
1874
ST remaining; // signed, because can be < 0
1875
init = sh->u.s.iteration; // shared value
1876
remaining = trip - init;
1877
if (remaining <= 0) { // AC: need to compare with 0 first
1878
status = 0; // nothing to do, don't try atomic op
1879
break;
1880
}
1881
KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1882
// compare with K*nproc*(chunk+1), K=2 by default
1883
if ((T)remaining < pr->u.p.parm2) {
1884
// use dynamic-style schedule
1885
// atomically increment iterations, get old value
1886
init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1887
(ST)chunk);
1888
remaining = trip - init;
1889
if (remaining <= 0) {
1890
status = 0; // all iterations got by other threads
1891
} else {
1892
// got some iterations to work on
1893
status = 1;
1894
if ((T)remaining > chunk) {
1895
limit = init + chunk - 1;
1896
} else {
1897
last = true; // the last chunk
1898
limit = init + remaining - 1;
1899
} // if
1900
} // if
1901
break;
1902
} // if
1903
// divide by K*nproc
1904
UT span;
1905
__kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1906
&span);
1907
UT rem = span % chunk;
1908
if (rem) // adjust so that span%chunk == 0
1909
span += chunk - rem;
1910
limit = init + span;
1911
if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1912
(ST)init, (ST)limit)) {
1913
// CAS was successful, chunk obtained
1914
status = 1;
1915
--limit;
1916
break;
1917
} // if
1918
} // while
1919
if (status != 0) {
1920
start = pr->u.p.lb;
1921
incr = pr->u.p.st;
1922
if (p_st != NULL)
1923
*p_st = incr;
1924
*p_lb = start + init * incr;
1925
*p_ub = start + limit * incr;
1926
if (pr->flags.ordered) {
1927
pr->u.p.ordered_lower = init;
1928
pr->u.p.ordered_upper = limit;
1929
} // if
1930
} else {
1931
*p_lb = 0;
1932
*p_ub = 0;
1933
if (p_st != NULL)
1934
*p_st = 0;
1935
} // if
1936
} // case
1937
break;
1938
1939
case kmp_sch_guided_analytical_chunked: {
1940
T chunkspec = pr->u.p.parm1;
1941
UT chunkIdx;
1942
#if KMP_USE_X87CONTROL
1943
/* for storing original FPCW value for Windows* OS on
1944
IA-32 architecture 8-byte version */
1945
unsigned int oldFpcw;
1946
unsigned int fpcwSet = 0;
1947
#endif
1948
KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1949
"kmp_sch_guided_analytical_chunked case\n",
1950
gtid));
1951
1952
trip = pr->u.p.tc;
1953
1954
KMP_DEBUG_ASSERT(nproc > 1);
1955
KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1956
1957
while (1) { /* this while loop is a safeguard against unexpected zero
1958
chunk sizes */
1959
chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1960
if (chunkIdx >= (UT)pr->u.p.parm2) {
1961
--trip;
1962
/* use dynamic-style scheduling */
1963
init = chunkIdx * chunkspec + pr->u.p.count;
1964
/* need to verify init > 0 in case of overflow in the above
1965
* calculation */
1966
if ((status = (init > 0 && init <= trip)) != 0) {
1967
limit = init + chunkspec - 1;
1968
1969
if ((last = (limit >= trip)) != 0)
1970
limit = trip;
1971
}
1972
break;
1973
} else {
1974
/* use exponential-style scheduling */
1975
/* The following check is to work around the lack of long double precision on
Windows* OS.
This check works around the possible effect that init != 0 for chunkIdx == 0.
*/
1979
#if KMP_USE_X87CONTROL
1980
/* If we haven't already done so, save original
1981
FPCW and set precision to 64-bit, as Windows* OS
1982
on IA-32 architecture defaults to 53-bit */
1983
if (!fpcwSet) {
1984
oldFpcw = _control87(0, 0);
1985
_control87(_PC_64, _MCW_PC);
1986
fpcwSet = 0x30000;
1987
}
1988
#endif
1989
if (chunkIdx) {
1990
init = __kmp_dispatch_guided_remaining<T>(
1991
trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1992
KMP_DEBUG_ASSERT(init);
1993
init = trip - init;
1994
} else
1995
init = 0;
1996
limit = trip - __kmp_dispatch_guided_remaining<T>(
1997
trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1998
KMP_ASSERT(init <= limit);
1999
if (init < limit) {
2000
KMP_DEBUG_ASSERT(limit <= trip);
2001
--limit;
2002
status = 1;
2003
break;
2004
} // if
2005
} // if
2006
} // while (1)
2007
#if KMP_USE_X87CONTROL
2008
/* restore FPCW if necessary
2009
AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2010
*/
2011
if (fpcwSet && (oldFpcw & fpcwSet))
2012
_control87(oldFpcw, _MCW_PC);
2013
#endif
2014
if (status != 0) {
2015
start = pr->u.p.lb;
2016
incr = pr->u.p.st;
2017
if (p_st != NULL)
2018
*p_st = incr;
2019
*p_lb = start + init * incr;
2020
*p_ub = start + limit * incr;
2021
if (pr->flags.ordered) {
2022
pr->u.p.ordered_lower = init;
2023
pr->u.p.ordered_upper = limit;
2024
}
2025
} else {
2026
*p_lb = 0;
2027
*p_ub = 0;
2028
if (p_st != NULL)
2029
*p_st = 0;
2030
}
2031
} // case
2032
break;
2033
2034
case kmp_sch_trapezoidal: {
2035
UT index;
2036
T parm2 = pr->u.p.parm2;
2037
T parm3 = pr->u.p.parm3;
2038
T parm4 = pr->u.p.parm4;
2039
KD_TRACE(100,
2040
("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2041
gtid));
2042
2043
index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2044
2045
init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2046
trip = pr->u.p.tc - 1;
2047
2048
if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2049
*p_lb = 0;
2050
*p_ub = 0;
2051
if (p_st != NULL)
2052
*p_st = 0;
2053
} else {
2054
start = pr->u.p.lb;
2055
limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2056
incr = pr->u.p.st;
2057
2058
if ((last = (limit >= trip)) != 0)
2059
limit = trip;
2060
2061
if (p_st != NULL)
2062
*p_st = incr;
2063
2064
if (incr == 1) {
2065
*p_lb = start + init;
2066
*p_ub = start + limit;
2067
} else {
2068
*p_lb = start + init * incr;
2069
*p_ub = start + limit * incr;
2070
}
2071
2072
if (pr->flags.ordered) {
2073
pr->u.p.ordered_lower = init;
2074
pr->u.p.ordered_upper = limit;
2075
} // if
2076
} // if
2077
} // case
2078
break;
2079
default: {
2080
status = 0; // to avoid complaints on uninitialized variable use
2081
__kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2082
KMP_HNT(GetNewerLibrary), // Hint
2083
__kmp_msg_null // Variadic argument list terminator
2084
);
2085
} break;
2086
} // switch
2087
if (p_last)
2088
*p_last = last;
2089
#ifdef KMP_DEBUG
2090
if (pr->flags.ordered) {
2091
char *buff;
2092
// create format specifiers before the debug output
2093
buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2094
"ordered_lower:%%%s ordered_upper:%%%s\n",
2095
traits_t<UT>::spec, traits_t<UT>::spec);
2096
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097
__kmp_str_free(&buff);
2098
}
2099
{
2100
char *buff;
2101
// create format specifiers before the debug output
2102
buff = __kmp_str_format(
2103
"__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104
"p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106
KMP_DEBUG_ASSERT(p_last);
2107
KMP_DEBUG_ASSERT(p_st);
2108
KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2109
__kmp_str_free(&buff);
2110
}
2111
#endif
2112
return status;
2113
}
2114
2115
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
2116
work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
2117
is not called. */
2118
#if OMPT_SUPPORT && OMPT_OPTIONAL
2119
#define OMPT_LOOP_END \
2120
if (status == 0) { \
2121
if (ompt_enabled.ompt_callback_work) { \
2122
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2123
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2124
ompt_callbacks.ompt_callback(ompt_callback_work)( \
2125
ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
2126
&(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
2127
} \
2128
}
2129
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
2130
if (ompt_enabled.ompt_callback_dispatch && status) { \
2131
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
2132
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
2133
ompt_dispatch_chunk_t chunk; \
2134
ompt_data_t instance = ompt_data_none; \
2135
OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st); \
2136
instance.ptr = &chunk; \
2137
ompt_callbacks.ompt_callback(ompt_callback_dispatch)( \
2138
&(team_info->parallel_data), &(task_info->task_data), \
2139
ompt_dispatch_ws_loop_chunk, instance); \
2140
}
2141
// TODO: implement count
2142
#else
2143
#define OMPT_LOOP_END // no-op
2144
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
2145
#endif
2146
2147
#if KMP_STATS_ENABLED
2148
#define KMP_STATS_LOOP_END \
2149
{ \
2150
kmp_int64 u, l, t, i; \
2151
l = (kmp_int64)(*p_lb); \
2152
u = (kmp_int64)(*p_ub); \
2153
i = (kmp_int64)(pr->u.p.st); \
2154
if (status == 0) { \
2155
t = 0; \
2156
KMP_POP_PARTITIONED_TIMER(); \
2157
} else if (i == 1) { \
2158
if (u >= l) \
2159
t = u - l + 1; \
2160
else \
2161
t = 0; \
2162
} else if (i < 0) { \
2163
if (l >= u) \
2164
t = (l - u) / (-i) + 1; \
2165
else \
2166
t = 0; \
2167
} else { \
2168
if (u >= l) \
2169
t = (u - l) / i + 1; \
2170
else \
2171
t = 0; \
2172
} \
2173
KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
2174
}
2175
#else
2176
#define KMP_STATS_LOOP_END /* Nothing */
2177
#endif
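/* Worked example of the chunk trip count computed above (illustrative only):
   for a chunk with *p_lb = 0, *p_ub = 9 and stride i = 3, the macro takes the
   i > 0 branch and records t = (u - l) / i + 1 = (9 - 0) / 3 + 1 = 4
   iterations, namely 0, 3, 6 and 9. */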
2178
2179
template <typename T>
2180
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2181
T *p_lb, T *p_ub,
2182
typename traits_t<T>::signed_t *p_st
2183
#if OMPT_SUPPORT && OMPT_OPTIONAL
2184
,
2185
void *codeptr
2186
#endif
2187
) {
2188
2189
typedef typename traits_t<T>::unsigned_t UT;
2190
typedef typename traits_t<T>::signed_t ST;
2191
// This is potentially slightly misleading: schedule(runtime) will appear here
// even if the actual runtime schedule is static. (Which points out a
2193
// disadvantage of schedule(runtime): even when static scheduling is used it
2194
// costs more than a compile time choice to use static scheduling would.)
2195
KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2196
2197
int status;
2198
dispatch_private_info_template<T> *pr;
2199
__kmp_assert_valid_gtid(gtid);
2200
kmp_info_t *th = __kmp_threads[gtid];
2201
kmp_team_t *team = th->th.th_team;
2202
2203
KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2204
KD_TRACE(
2205
1000,
2206
("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207
gtid, p_lb, p_ub, p_st, p_last));
2208
2209
if (team->t.t_serialized) {
2210
/* NOTE: serialize this dispatch because we are not at the active level */
2211
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2212
th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2213
KMP_DEBUG_ASSERT(pr);
2214
2215
if ((status = (pr->u.p.tc != 0)) == 0) {
2216
*p_lb = 0;
2217
*p_ub = 0;
2218
// if ( p_last != NULL )
2219
// *p_last = 0;
2220
if (p_st != NULL)
2221
*p_st = 0;
2222
if (__kmp_env_consistency_check) {
2223
if (pr->pushed_ws != ct_none) {
2224
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225
}
2226
}
2227
} else if (pr->flags.nomerge) {
2228
kmp_int32 last;
2229
T start;
2230
UT limit, trip, init;
2231
ST incr;
2232
T chunk = pr->u.p.parm1;
2233
2234
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2235
gtid));
2236
2237
init = chunk * pr->u.p.count++;
2238
trip = pr->u.p.tc - 1;
2239
2240
if ((status = (init <= trip)) == 0) {
2241
*p_lb = 0;
2242
*p_ub = 0;
2243
// if ( p_last != NULL )
2244
// *p_last = 0;
2245
if (p_st != NULL)
2246
*p_st = 0;
2247
if (__kmp_env_consistency_check) {
2248
if (pr->pushed_ws != ct_none) {
2249
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2250
}
2251
}
2252
} else {
2253
start = pr->u.p.lb;
2254
limit = chunk + init - 1;
2255
incr = pr->u.p.st;
2256
2257
if ((last = (limit >= trip)) != 0) {
2258
limit = trip;
2259
#if KMP_OS_WINDOWS
2260
pr->u.p.last_upper = pr->u.p.ub;
2261
#endif /* KMP_OS_WINDOWS */
2262
}
2263
if (p_last != NULL)
2264
*p_last = last;
2265
if (p_st != NULL)
2266
*p_st = incr;
2267
if (incr == 1) {
2268
*p_lb = start + init;
2269
*p_ub = start + limit;
2270
} else {
2271
*p_lb = start + init * incr;
2272
*p_ub = start + limit * incr;
2273
}
2274
2275
if (pr->flags.ordered) {
2276
pr->u.p.ordered_lower = init;
2277
pr->u.p.ordered_upper = limit;
2278
#ifdef KMP_DEBUG
2279
{
2280
char *buff;
2281
// create format specifiers before the debug output
2282
buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283
"ordered_lower:%%%s ordered_upper:%%%s\n",
2284
traits_t<UT>::spec, traits_t<UT>::spec);
2285
KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286
pr->u.p.ordered_upper));
2287
__kmp_str_free(&buff);
2288
}
2289
#endif
2290
} // if
2291
} // if
2292
} else {
2293
pr->u.p.tc = 0;
2294
*p_lb = pr->u.p.lb;
2295
*p_ub = pr->u.p.ub;
2296
#if KMP_OS_WINDOWS
2297
pr->u.p.last_upper = *p_ub;
2298
#endif /* KMP_OS_WINDOWS */
2299
if (p_last != NULL)
2300
*p_last = TRUE;
2301
if (p_st != NULL)
2302
*p_st = pr->u.p.st;
2303
} // if
2304
#ifdef KMP_DEBUG
2305
{
2306
char *buff;
2307
// create format specifiers before the debug output
2308
buff = __kmp_str_format(
2309
"__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310
"p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2311
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2313
(p_last ? *p_last : 0), status));
2314
__kmp_str_free(&buff);
2315
}
2316
#endif
2317
#if INCLUDE_SSC_MARKS
2318
SSC_MARK_DISPATCH_NEXT();
2319
#endif
2320
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2321
OMPT_LOOP_END;
2322
KMP_STATS_LOOP_END;
2323
return status;
2324
} else {
2325
kmp_int32 last = 0;
2326
dispatch_shared_info_template<T> volatile *sh;
2327
2328
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2330
2331
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2332
th->th.th_dispatch->th_dispatch_pr_current);
2333
KMP_DEBUG_ASSERT(pr);
2334
sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2335
th->th.th_dispatch->th_dispatch_sh_current);
2336
KMP_DEBUG_ASSERT(sh);
2337
2338
#if KMP_USE_HIER_SCHED
2339
if (pr->flags.use_hier)
2340
status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2341
else
2342
#endif // KMP_USE_HIER_SCHED
2343
status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344
p_st, th->th.th_team_nproc,
2345
th->th.th_info.ds.ds_tid);
2346
// status == 0: no more iterations to execute
2347
if (status == 0) {
2348
ST num_done;
2349
num_done = test_then_inc<ST>(&sh->u.s.num_done);
2350
#ifdef KMP_DEBUG
2351
{
2352
char *buff;
2353
// create format specifiers before the debug output
2354
buff = __kmp_str_format(
2355
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356
traits_t<ST>::spec);
2357
KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2358
__kmp_str_free(&buff);
2359
}
2360
#endif
2361
2362
#if KMP_USE_HIER_SCHED
2363
pr->flags.use_hier = FALSE;
2364
#endif
2365
if (num_done == th->th.th_team_nproc - 1) {
2366
#if KMP_STATIC_STEAL_ENABLED
2367
if (pr->schedule == kmp_sch_static_steal) {
2368
int i;
2369
int idx = (th->th.th_dispatch->th_disp_index - 1) %
2370
__kmp_dispatch_num_buffers; // current loop index
2371
// loop complete, safe to destroy locks used for stealing
2372
for (i = 0; i < th->th.th_team_nproc; ++i) {
2373
dispatch_private_info_template<T> *buf =
2374
reinterpret_cast<dispatch_private_info_template<T> *>(
2375
&team->t.t_dispatch[i].th_disp_buffer[idx]);
2376
KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2377
KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378
if (traits_t<T>::type_size > 4) {
2379
// destroy locks used for stealing
2380
kmp_lock_t *lck = buf->u.p.steal_lock;
2381
KMP_ASSERT(lck != NULL);
2382
__kmp_destroy_lock(lck);
2383
__kmp_free(lck);
2384
buf->u.p.steal_lock = NULL;
2385
}
2386
}
2387
}
2388
#endif
2389
/* NOTE: release shared buffer to be reused */
2390
2391
KMP_MB(); /* Flush all pending memory write invalidates. */
2392
2393
sh->u.s.num_done = 0;
2394
sh->u.s.iteration = 0;
2395
2396
/* TODO replace with general release procedure? */
2397
if (pr->flags.ordered) {
2398
sh->u.s.ordered_iteration = 0;
2399
}
2400
2401
KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2403
sh->buffer_index += __kmp_dispatch_num_buffers;
2404
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405
gtid, sh->buffer_index));
2406
2407
KMP_MB(); /* Flush all pending memory write invalidates. */
2408
2409
} // if
2410
if (__kmp_env_consistency_check) {
2411
if (pr->pushed_ws != ct_none) {
2412
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2413
}
2414
}
2415
2416
th->th.th_dispatch->th_deo_fcn = NULL;
2417
th->th.th_dispatch->th_dxo_fcn = NULL;
2418
th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419
th->th.th_dispatch->th_dispatch_pr_current = NULL;
2420
} // if (status == 0)
2421
#if KMP_OS_WINDOWS
2422
else if (last) {
2423
pr->u.p.last_upper = pr->u.p.ub;
2424
}
2425
#endif /* KMP_OS_WINDOWS */
2426
if (p_last != NULL && status != 0)
2427
*p_last = last;
2428
} // if
2429
2430
#ifdef KMP_DEBUG
2431
{
2432
char *buff;
2433
// create format specifiers before the debug output
2434
buff = __kmp_str_format(
2435
"__kmp_dispatch_next: T#%%d normal case: "
2436
"p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438
KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2439
(p_last ? *p_last : 0), status));
2440
__kmp_str_free(&buff);
2441
}
2442
#endif
2443
#if INCLUDE_SSC_MARKS
2444
SSC_MARK_DISPATCH_NEXT();
2445
#endif
2446
OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2447
OMPT_LOOP_END;
2448
KMP_STATS_LOOP_END;
2449
return status;
2450
}
2451
2452
/*!
2453
@ingroup WORK_SHARING
2454
@param loc source location information
2455
@param global_tid global thread number
2456
@return Zero if the parallel region is not active and this thread should execute
2457
all sections, non-zero otherwise.
2458
2459
Beginning of sections construct.
2460
There are no implicit barriers in the "sections" calls, rather the compiler
2461
should introduce an explicit barrier if it is required.
2462
2463
This implementation is based on __kmp_dispatch_init, using the same constructs
for shared data (sections cannot be nested directly inside an omp for loop;
there must be a parallel region in between).
*/
2467
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2468
2469
int active;
2470
kmp_info_t *th;
2471
kmp_team_t *team;
2472
kmp_uint32 my_buffer_index;
2473
dispatch_shared_info_template<kmp_int32> volatile *sh;
2474
2475
KMP_DEBUG_ASSERT(__kmp_init_serial);
2476
2477
if (!TCR_4(__kmp_init_parallel))
2478
__kmp_parallel_initialize();
2479
__kmp_resume_if_soft_paused();
2480
2481
/* setup data */
2482
th = __kmp_threads[gtid];
2483
team = th->th.th_team;
2484
active = !team->t.t_serialized;
2485
th->th.th_ident = loc;
2486
2487
KMP_COUNT_BLOCK(OMP_SECTIONS);
2488
KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2489
2490
if (active) {
2491
// Set up sections in the same way as dynamically scheduled loops.
// We need one piece of shared data: which section is to be executed next.
// (in case parallel is not active, all sections will be executed on the
2494
// same thread)
2495
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2497
2498
my_buffer_index = th->th.th_dispatch->th_disp_index++;
2499
2500
// reuse shared data structures from dynamic sched loops:
2501
sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2502
&team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503
KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2504
my_buffer_index));
2505
2506
th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507
th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2508
2509
KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510
"sh->buffer_index:%d\n",
2511
gtid, my_buffer_index, sh->buffer_index));
2512
__kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2513
__kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2514
// Note: KMP_WAIT() cannot be used here: buffer index and
// my_buffer_index are *always* 32-bit integers.
KMP_MB();
2517
KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518
"sh->buffer_index:%d\n",
2519
gtid, my_buffer_index, sh->buffer_index));
2520
2521
th->th.th_dispatch->th_dispatch_pr_current =
2522
nullptr; // sections construct doesn't need private data
2523
th->th.th_dispatch->th_dispatch_sh_current =
2524
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2525
}
2526
2527
#if OMPT_SUPPORT && OMPT_OPTIONAL
2528
if (ompt_enabled.ompt_callback_work) {
2529
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2530
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2531
ompt_callbacks.ompt_callback(ompt_callback_work)(
2532
ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2534
}
2535
#endif
2536
KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2537
2538
return active;
2539
}
2540
2541
/*!
2542
@ingroup WORK_SHARING
2543
@param loc source location information
2544
@param global_tid global thread number
2545
@param numberOfSections number of sections in the 'sections' construct
2546
@return unsigned in [0, n) - the id of the section to execute next on this
thread; n (or any other value outside that range) - nothing to execute on this
thread.
*/
2550
2551
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2552
kmp_int32 numberOfSections) {
2553
2554
KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2555
2556
kmp_info_t *th = __kmp_threads[gtid];
2557
#ifdef KMP_DEBUG
2558
kmp_team_t *team = th->th.th_team;
2559
#endif
2560
2561
KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2562
numberOfSections));
2563
2564
// For serialized case we should not call this function:
2565
KMP_DEBUG_ASSERT(!team->t.t_serialized);
2566
2567
dispatch_shared_info_template<kmp_int32> volatile *sh;
2568
2569
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2571
2572
KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573
sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2574
th->th.th_dispatch->th_dispatch_sh_current);
2575
KMP_DEBUG_ASSERT(sh);
2576
2577
kmp_int32 sectionIndex = 0;
2578
bool moreSectionsToExecute = true;
2579
2580
// Find section to execute:
2581
sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2582
if (sectionIndex >= numberOfSections) {
2583
moreSectionsToExecute = false;
2584
}
2585
2586
// status == 0: no more sections to execute;
2587
// OMPTODO: __kmpc_end_sections could be bypassed?
2588
if (!moreSectionsToExecute) {
2589
kmp_int32 num_done;
2590
2591
num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2592
2593
if (num_done == th->th.th_team_nproc - 1) {
2594
/* NOTE: release this buffer to be reused */
2595
2596
KMP_MB(); /* Flush all pending memory write invalidates. */
2597
2598
sh->u.s.num_done = 0;
2599
sh->u.s.iteration = 0;
2600
2601
KMP_MB(); /* Flush all pending memory write invalidates. */
2602
2603
sh->buffer_index += __kmp_dispatch_num_buffers;
2604
KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2605
sh->buffer_index));
2606
2607
KMP_MB(); /* Flush all pending memory write invalidates. */
2608
2609
} // if
2610
2611
th->th.th_dispatch->th_deo_fcn = NULL;
2612
th->th.th_dispatch->th_dxo_fcn = NULL;
2613
th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614
th->th.th_dispatch->th_dispatch_pr_current = NULL;
2615
2616
#if OMPT_SUPPORT && OMPT_OPTIONAL
2617
if (ompt_enabled.ompt_callback_dispatch) {
2618
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2619
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2620
ompt_data_t instance = ompt_data_none;
2621
instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2622
ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623
&(team_info->parallel_data), &(task_info->task_data),
2624
ompt_dispatch_section, instance);
2625
}
2626
#endif
2627
}
2628
2629
return sectionIndex;
2630
}
2631
2632
/*!
2633
@ingroup WORK_SHARING
2634
@param loc source location information
2635
@param global_tid global thread number
2636
2637
End of "sections" construct.
2638
Don't need to wait here: barrier is added separately when needed.
2639
*/
2640
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2641
2642
kmp_info_t *th = __kmp_threads[gtid];
2643
int active = !th->th.th_team->t.t_serialized;
2644
2645
KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2646
2647
if (!active) {
2648
// In the active case, finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
2650
if (ompt_enabled.ompt_callback_work) {
2651
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2652
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2653
ompt_callbacks.ompt_callback(ompt_callback_work)(
2654
ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655
&(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2656
}
2657
#endif
2658
}
2659
2660
KMP_POP_PARTITIONED_TIMER();
2661
KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2662
}
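/* A rough sketch of how a compiler might drive these entry points for a
   "sections" construct with n sections; run_section() is a hypothetical
   helper standing in for the compiler-generated section bodies:

     if (__kmpc_sections_init(loc, gtid)) {
       kmp_int32 s;
       while ((s = __kmpc_next_section(loc, gtid, n)) < n)
         run_section(s); // execute section number s
     } else {
       // parallel region not active: this thread runs all n sections itself
     }
     __kmpc_end_sections(loc, gtid);
*/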
2663
2664
template <typename T>
2665
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2666
kmp_int32 *plastiter, T *plower, T *pupper,
2667
typename traits_t<T>::signed_t incr) {
2668
typedef typename traits_t<T>::unsigned_t UT;
2669
kmp_uint32 team_id;
2670
kmp_uint32 nteams;
2671
UT trip_count;
2672
kmp_team_t *team;
2673
kmp_info_t *th;
2674
2675
KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2676
KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2677
#ifdef KMP_DEBUG
2678
typedef typename traits_t<T>::signed_t ST;
2679
{
2680
char *buff;
2681
// create format specifiers before the debug output
2682
buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2683
"iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2684
traits_t<T>::spec, traits_t<T>::spec,
2685
traits_t<ST>::spec, traits_t<T>::spec);
2686
KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2687
__kmp_str_free(&buff);
2688
}
2689
#endif
2690
2691
if (__kmp_env_consistency_check) {
2692
if (incr == 0) {
2693
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2694
loc);
2695
}
2696
if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2697
// The loop is illegal.
2698
// Some zero-trip loops maintained by compiler, e.g.:
2699
// for(i=10;i<0;++i) // lower >= upper - run-time check
2700
// for(i=0;i>10;--i) // lower <= upper - run-time check
2701
// for(i=0;i>10;++i) // incr > 0 - compile-time check
2702
// for(i=10;i<0;--i) // incr < 0 - compile-time check
2703
// Compiler does not check the following illegal loops:
2704
// for(i=0;i<10;i+=incr) // where incr<0
2705
// for(i=10;i>0;i-=incr) // where incr<0
2706
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2707
}
2708
}
2709
__kmp_assert_valid_gtid(gtid);
2710
th = __kmp_threads[gtid];
2711
team = th->th.th_team;
2712
KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2713
nteams = th->th.th_teams_size.nteams;
2714
team_id = team->t.t_master_tid;
2715
KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2716
2717
// compute global trip count
2718
if (incr == 1) {
2719
trip_count = *pupper - *plower + 1;
2720
} else if (incr == -1) {
2721
trip_count = *plower - *pupper + 1;
2722
} else if (incr > 0) {
2723
// upper-lower can exceed the limit of signed type
2724
trip_count = (UT)(*pupper - *plower) / incr + 1;
2725
} else {
2726
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2727
}
2728
2729
if (trip_count <= nteams) {
2730
KMP_DEBUG_ASSERT(
2731
__kmp_static == kmp_sch_static_greedy ||
2732
__kmp_static ==
2733
kmp_sch_static_balanced); // Unknown static scheduling type.
2734
// only some teams get single iteration, others get nothing
2735
if (team_id < trip_count) {
2736
*pupper = *plower = *plower + team_id * incr;
2737
} else {
2738
*plower = *pupper + incr; // zero-trip loop
2739
}
2740
if (plastiter != NULL)
2741
*plastiter = (team_id == trip_count - 1);
2742
} else {
2743
if (__kmp_static == kmp_sch_static_balanced) {
2744
UT chunk = trip_count / nteams;
2745
UT extras = trip_count % nteams;
2746
*plower +=
2747
incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2748
*pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2749
if (plastiter != NULL)
2750
*plastiter = (team_id == nteams - 1);
2751
} else {
2752
T chunk_inc_count =
2753
(trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2754
T upper = *pupper;
2755
KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2756
// Unknown static scheduling type.
2757
*plower += team_id * chunk_inc_count;
2758
*pupper = *plower + chunk_inc_count - incr;
2759
// Check/correct bounds if needed
2760
if (incr > 0) {
2761
if (*pupper < *plower)
2762
*pupper = traits_t<T>::max_value;
2763
if (plastiter != NULL)
2764
*plastiter = *plower <= upper && *pupper > upper - incr;
2765
if (*pupper > upper)
2766
*pupper = upper; // tracker C73258
2767
} else {
2768
if (*pupper > *plower)
2769
*pupper = traits_t<T>::min_value;
2770
if (plastiter != NULL)
2771
*plastiter = *plower >= upper && *pupper < upper - incr;
2772
if (*pupper < upper)
2773
*pupper = upper; // tracker C73258
2774
}
2775
}
2776
}
2777
}
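/* Worked example (assuming __kmp_static == kmp_sch_static_balanced):
   for incr = 1, *plower = 0, *pupper = 9 (trip_count = 10) and nteams = 4,
   chunk = 2 and extras = 2, so the teams receive [0,2], [3,5], [6,7] and
   [8,9] respectively, and only team 3 sets *plastiter. */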
2778
2779
//-----------------------------------------------------------------------------
2780
// Dispatch routines
2781
// Transfer call to template< type T >
2782
// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2783
// T lb, T ub, ST st, ST chunk )
2784
extern "C" {
2785
2786
/*!
2787
@ingroup WORK_SHARING
2788
@{
2789
@param loc Source location
2790
@param gtid Global thread id
2791
@param schedule Schedule type
2792
@param lb Lower bound
2793
@param ub Upper bound
2794
@param st Step (or increment if you prefer)
2795
@param chunk The chunk size to block with
2796
2797
This function prepares the runtime to start a dynamically scheduled for loop,
2798
saving the loop arguments.
2799
These functions are all identical apart from the types of the arguments.
2800
*/
2801
2802
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2803
enum sched_type schedule, kmp_int32 lb,
2804
kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2805
KMP_DEBUG_ASSERT(__kmp_init_serial);
2806
#if OMPT_SUPPORT && OMPT_OPTIONAL
2807
OMPT_STORE_RETURN_ADDRESS(gtid);
2808
#endif
2809
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2810
}
2811
/*!
2812
See @ref __kmpc_dispatch_init_4
2813
*/
2814
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2815
enum sched_type schedule, kmp_uint32 lb,
2816
kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2817
KMP_DEBUG_ASSERT(__kmp_init_serial);
2818
#if OMPT_SUPPORT && OMPT_OPTIONAL
2819
OMPT_STORE_RETURN_ADDRESS(gtid);
2820
#endif
2821
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2822
}
2823
2824
/*!
2825
See @ref __kmpc_dispatch_init_4
2826
*/
2827
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2828
enum sched_type schedule, kmp_int64 lb,
2829
kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2830
KMP_DEBUG_ASSERT(__kmp_init_serial);
2831
#if OMPT_SUPPORT && OMPT_OPTIONAL
2832
OMPT_STORE_RETURN_ADDRESS(gtid);
2833
#endif
2834
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2835
}
2836
2837
/*!
2838
See @ref __kmpc_dispatch_init_4
2839
*/
2840
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2841
enum sched_type schedule, kmp_uint64 lb,
2842
kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2843
KMP_DEBUG_ASSERT(__kmp_init_serial);
2844
#if OMPT_SUPPORT && OMPT_OPTIONAL
2845
OMPT_STORE_RETURN_ADDRESS(gtid);
2846
#endif
2847
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2848
}
2849
2850
/*!
2851
See @ref __kmpc_dispatch_init_4
2852
2853
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration
space.
2857
These functions are all identical apart from the types of the arguments.
2858
*/
2859
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2860
enum sched_type schedule, kmp_int32 *p_last,
2861
kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2862
kmp_int32 chunk) {
2863
KMP_DEBUG_ASSERT(__kmp_init_serial);
2864
#if OMPT_SUPPORT && OMPT_OPTIONAL
2865
OMPT_STORE_RETURN_ADDRESS(gtid);
2866
#endif
2867
__kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2868
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2869
}
2870
2871
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2872
enum sched_type schedule, kmp_int32 *p_last,
2873
kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2874
kmp_int32 chunk) {
2875
KMP_DEBUG_ASSERT(__kmp_init_serial);
2876
#if OMPT_SUPPORT && OMPT_OPTIONAL
2877
OMPT_STORE_RETURN_ADDRESS(gtid);
2878
#endif
2879
__kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2880
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2881
}
2882
2883
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2884
enum sched_type schedule, kmp_int32 *p_last,
2885
kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2886
kmp_int64 chunk) {
2887
KMP_DEBUG_ASSERT(__kmp_init_serial);
2888
#if OMPT_SUPPORT && OMPT_OPTIONAL
2889
OMPT_STORE_RETURN_ADDRESS(gtid);
2890
#endif
2891
__kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2892
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2893
}
2894
2895
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2896
enum sched_type schedule, kmp_int32 *p_last,
2897
kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2898
kmp_int64 chunk) {
2899
KMP_DEBUG_ASSERT(__kmp_init_serial);
2900
#if OMPT_SUPPORT && OMPT_OPTIONAL
2901
OMPT_STORE_RETURN_ADDRESS(gtid);
2902
#endif
2903
__kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2904
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2905
}
2906
2907
/*!
2908
@param loc Source code location
2909
@param gtid Global thread id
2910
@param p_last Pointer to a flag set to one if this is the last chunk or zero
2911
otherwise
2912
@param p_lb Pointer to the lower bound for the next chunk of work
2913
@param p_ub Pointer to the upper bound for the next chunk of work
2914
@param p_st Pointer to the stride for the next chunk of work
2915
@return one if there is work to be done, zero otherwise
2916
2917
Get the next dynamically allocated chunk of work for this thread.
2918
If there is no more work, then the lb,ub and stride need not be modified.
2919
*/
2920
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2921
kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2922
#if OMPT_SUPPORT && OMPT_OPTIONAL
2923
OMPT_STORE_RETURN_ADDRESS(gtid);
2924
#endif
2925
return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2926
#if OMPT_SUPPORT && OMPT_OPTIONAL
2927
,
2928
OMPT_LOAD_RETURN_ADDRESS(gtid)
2929
#endif
2930
);
2931
}
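/* A minimal sketch of the lowering a compiler might emit for
   #pragma omp for schedule(dynamic, 4) over i = 0..N-1 (illustrative only;
   the exact code generation is compiler-specific):

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i); // body() stands in for the compiler-generated loop body
     }
*/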
2932
2933
/*!
2934
See @ref __kmpc_dispatch_next_4
2935
*/
2936
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2937
kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2938
kmp_int32 *p_st) {
2939
#if OMPT_SUPPORT && OMPT_OPTIONAL
2940
OMPT_STORE_RETURN_ADDRESS(gtid);
2941
#endif
2942
return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2943
#if OMPT_SUPPORT && OMPT_OPTIONAL
2944
,
2945
OMPT_LOAD_RETURN_ADDRESS(gtid)
2946
#endif
2947
);
2948
}
2949
2950
/*!
2951
See @ref __kmpc_dispatch_next_4
2952
*/
2953
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2954
kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2955
#if OMPT_SUPPORT && OMPT_OPTIONAL
2956
OMPT_STORE_RETURN_ADDRESS(gtid);
2957
#endif
2958
return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2959
#if OMPT_SUPPORT && OMPT_OPTIONAL
2960
,
2961
OMPT_LOAD_RETURN_ADDRESS(gtid)
2962
#endif
2963
);
2964
}
2965
2966
/*!
2967
See @ref __kmpc_dispatch_next_4
2968
*/
2969
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2970
kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2971
kmp_int64 *p_st) {
2972
#if OMPT_SUPPORT && OMPT_OPTIONAL
2973
OMPT_STORE_RETURN_ADDRESS(gtid);
2974
#endif
2975
return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2976
#if OMPT_SUPPORT && OMPT_OPTIONAL
2977
,
2978
OMPT_LOAD_RETURN_ADDRESS(gtid)
2979
#endif
2980
);
2981
}
2982
2983
/*!
2984
@param loc Source code location
2985
@param gtid Global thread id
2986
2987
Mark the end of a dynamic loop.
2988
*/
2989
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2990
__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2991
}
2992
2993
/*!
2994
See @ref __kmpc_dispatch_fini_4
2995
*/
2996
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2997
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2998
}
2999
3000
/*!
3001
See @ref __kmpc_dispatch_fini_4
3002
*/
3003
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3004
__kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3005
}
3006
3007
/*!
3008
See @ref __kmpc_dispatch_fini_4
3009
*/
3010
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3011
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3012
}
3013
3014
/*!
3015
Tear down per-loop dispatch state if any is needed; the current implementation
requires no action, so this is a no-op.
*/
3017
void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
3018
/*! @} */
3019
3020
//-----------------------------------------------------------------------------
3021
// Non-template routines from kmp_dispatch.cpp used in other sources
3022
3023
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
3042
3043
kmp_uint32
3044
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3045
kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3046
void *obj // Higher-level synchronization object, or NULL.
3047
) {
3048
// note: we may not belong to a team at this point
3049
volatile kmp_uint32 *spin = spinner;
3050
kmp_uint32 check = checker;
3051
kmp_uint32 spins;
3052
kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3053
kmp_uint32 r;
3054
kmp_uint64 time;
3055
3056
KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3057
KMP_INIT_YIELD(spins);
3058
KMP_INIT_BACKOFF(time);
3059
// main wait spin loop
3060
while (!f(r = TCR_4(*spin), check)) {
3061
KMP_FSYNC_SPIN_PREPARE(obj);
3062
/* GEH - remove this since it was accidentally introduced when kmp_wait was
3063
split. It causes problems with infinite recursion because of exit lock */
3064
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3065
__kmp_abort_thread(); */
3066
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3067
}
3068
KMP_FSYNC_SPIN_ACQUIRED(obj);
3069
return r;
3070
}
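/* Usage sketch (illustrative, variable names borrowed from the dispatch code
   above): spin until a shared 32-bit buffer index reaches the expected value,
   using one of the predicates defined earlier in this file:

     kmp_uint32 seen =
         __kmp_wait_4(&sh->buffer_index, my_buffer_index, __kmp_eq_4, NULL);
*/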
3071
3072
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3073
kmp_uint32 (*pred)(void *, kmp_uint32),
3074
void *obj // Higher-level synchronization object, or NULL.
3075
) {
3076
// note: we may not belong to a team at this point
3077
void *spin = spinner;
3078
kmp_uint32 check = checker;
3079
kmp_uint32 spins;
3080
kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3081
kmp_uint64 time;
3082
3083
KMP_FSYNC_SPIN_INIT(obj, spin);
3084
KMP_INIT_YIELD(spins);
3085
KMP_INIT_BACKOFF(time);
3086
// main wait spin loop
3087
while (!f(spin, check)) {
3088
KMP_FSYNC_SPIN_PREPARE(obj);
3089
/* if we have waited a bit, or are oversubscribed, yield */
/* pause is in the following code */
3091
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3092
}
3093
KMP_FSYNC_SPIN_ACQUIRED(obj);
3094
}
3095
3096
} // extern "C"
3097
3098
#ifdef KMP_GOMP_COMPAT
3099
3100
void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3101
enum sched_type schedule, kmp_int32 lb,
3102
kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3103
int push_ws) {
3104
__kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3105
push_ws);
3106
}
3107
3108
void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3109
enum sched_type schedule, kmp_uint32 lb,
3110
kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3111
int push_ws) {
3112
__kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3113
push_ws);
3114
}
3115
3116
void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3117
enum sched_type schedule, kmp_int64 lb,
3118
kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3119
int push_ws) {
3120
__kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3121
push_ws);
3122
}
3123
3124
void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3125
enum sched_type schedule, kmp_uint64 lb,
3126
kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3127
int push_ws) {
3128
__kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3129
push_ws);
3130
}
3131
3132
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3133
__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134
}
3135
3136
void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3137
__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138
}
3139
3140
void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3141
__kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3142
}
3143
3144
void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3145
__kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3146
}
3147
3148
#endif /* KMP_GOMP_COMPAT */
3149
3150
/* ------------------------------------------------------------------------ */
3151
3152