Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp
35258 views
1
/*
2
* kmp_barrier.cpp
3
*/
4
5
//===----------------------------------------------------------------------===//
6
//
7
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8
// See https://llvm.org/LICENSE.txt for license information.
9
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "kmp_wait_release.h"
14
#include "kmp_barrier.h"
15
#include "kmp_itt.h"
16
#include "kmp_os.h"
17
#include "kmp_stats.h"
18
#include "ompt-specific.h"
19
// for distributed barrier
20
#include "kmp_affinity.h"
21
22
#if KMP_MIC
23
#include <immintrin.h>
24
#define USE_NGO_STORES 1
25
#endif // KMP_MIC
26
27
#if KMP_MIC && USE_NGO_STORES
28
// ICV copying
29
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33
#else
34
#define ngo_load(src) ((void)0)
35
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37
#define ngo_sync() ((void)0)
38
#endif /* KMP_MIC && USE_NGO_STORES */
39
40
void __kmp_print_structure(void); // Forward declaration
41
42
// ---------------------------- Barrier Algorithms ----------------------------
43
// Distributed barrier
44
45
// Compute how many threads to have polling each cache-line.
46
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47
void distributedBarrier::computeVarsForN(size_t n) {
48
int nsockets = 1;
49
if (__kmp_topology) {
50
int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51
int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52
int ncores_per_socket =
53
__kmp_topology->calculate_ratio(core_level, socket_level);
54
nsockets = __kmp_topology->get_count(socket_level);
55
56
if (nsockets <= 0)
57
nsockets = 1;
58
if (ncores_per_socket <= 0)
59
ncores_per_socket = 1;
60
61
threads_per_go = ncores_per_socket >> 1;
62
if (!fix_threads_per_go) {
63
// Minimize num_gos
64
if (threads_per_go > 4) {
65
if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66
threads_per_go = threads_per_go >> 1;
67
}
68
if (threads_per_go > 4 && nsockets == 1)
69
threads_per_go = threads_per_go >> 1;
70
}
71
}
72
if (threads_per_go == 0)
73
threads_per_go = 1;
74
fix_threads_per_go = true;
75
num_gos = n / threads_per_go;
76
if (n % threads_per_go)
77
num_gos++;
78
if (nsockets == 1 || num_gos == 1)
79
num_groups = 1;
80
else {
81
num_groups = num_gos / nsockets;
82
if (num_gos % nsockets)
83
num_groups++;
84
}
85
if (num_groups <= 0)
86
num_groups = 1;
87
gos_per_group = num_gos / num_groups;
88
if (num_gos % num_groups)
89
gos_per_group++;
90
threads_per_group = threads_per_go * gos_per_group;
91
} else {
92
num_gos = n / threads_per_go;
93
if (n % threads_per_go)
94
num_gos++;
95
if (num_gos == 1)
96
num_groups = 1;
97
else {
98
num_groups = num_gos / 2;
99
if (num_gos % 2)
100
num_groups++;
101
}
102
gos_per_group = num_gos / num_groups;
103
if (num_gos % num_groups)
104
gos_per_group++;
105
threads_per_group = threads_per_go * gos_per_group;
106
}
107
}
108
109
void distributedBarrier::computeGo(size_t n) {
110
// Minimize num_gos
111
for (num_gos = 1;; num_gos++)
112
if (IDEAL_CONTENTION * num_gos >= n)
113
break;
114
threads_per_go = n / num_gos;
115
if (n % num_gos)
116
threads_per_go++;
117
while (num_gos > MAX_GOS) {
118
threads_per_go++;
119
num_gos = n / threads_per_go;
120
if (n % threads_per_go)
121
num_gos++;
122
}
123
computeVarsForN(n);
124
}
125
126
// This function is to resize the barrier arrays when the new number of threads
127
// exceeds max_threads, which is the current size of all the arrays
128
void distributedBarrier::resize(size_t nthr) {
129
KMP_DEBUG_ASSERT(nthr > max_threads);
130
131
// expand to requested size * 2
132
max_threads = nthr * 2;
133
134
// allocate arrays to new max threads
135
for (int i = 0; i < MAX_ITERS; ++i) {
136
if (flags[i])
137
flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138
max_threads * sizeof(flags_s));
139
else
140
flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141
}
142
143
if (go)
144
go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145
else
146
go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148
if (iter)
149
iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150
else
151
iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153
if (sleep)
154
sleep =
155
(sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156
else
157
sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158
}
159
160
// This function is to set all the go flags that threads might be waiting
161
// on, and when blocktime is not infinite, it should be followed by a wake-up
162
// call to each thread
163
kmp_uint64 distributedBarrier::go_release() {
164
kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165
for (size_t j = 0; j < num_gos; j++) {
166
go[j].go.store(next_go);
167
}
168
return next_go;
169
}
170
171
void distributedBarrier::go_reset() {
172
for (size_t j = 0; j < max_threads; ++j) {
173
for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174
flags[i][j].stillNeed = 1;
175
}
176
go[j].go.store(0);
177
iter[j].iter = 0;
178
}
179
}
180
181
// This function inits/re-inits the distributed barrier for a particular number
182
// of threads. If a resize of arrays is needed, it calls the resize function.
183
void distributedBarrier::init(size_t nthr) {
184
size_t old_max = max_threads;
185
if (nthr > max_threads) { // need more space in arrays
186
resize(nthr);
187
}
188
189
for (size_t i = 0; i < max_threads; i++) {
190
for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191
flags[j][i].stillNeed = 1;
192
}
193
go[i].go.store(0);
194
iter[i].iter = 0;
195
if (i >= old_max)
196
sleep[i].sleep = false;
197
}
198
199
// Recalculate num_gos, etc. based on new nthr
200
computeVarsForN(nthr);
201
202
num_threads = nthr;
203
204
if (team_icvs == NULL)
205
team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206
}
207
208
// This function is used only when KMP_BLOCKTIME is not infinite.
209
// static
210
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211
size_t start, size_t stop, size_t inc,
212
size_t tid) {
213
KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215
return;
216
217
kmp_info_t **other_threads = team->t.t_threads;
218
for (size_t thr = start; thr < stop; thr += inc) {
219
KMP_DEBUG_ASSERT(other_threads[thr]);
220
int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221
// Wake up worker regardless of if it appears to be sleeping or not
222
__kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223
}
224
}
225
226
static void __kmp_dist_barrier_gather(
227
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230
kmp_team_t *team;
231
distributedBarrier *b;
232
kmp_info_t **other_threads;
233
kmp_uint64 my_current_iter, my_next_iter;
234
kmp_uint32 nproc;
235
bool group_leader;
236
237
team = this_thr->th.th_team;
238
nproc = this_thr->th.th_team_nproc;
239
other_threads = team->t.t_threads;
240
b = team->t.b;
241
my_current_iter = b->iter[tid].iter;
242
my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243
group_leader = ((tid % b->threads_per_group) == 0);
244
245
KA_TRACE(20,
246
("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247
gtid, team->t.t_id, tid, bt));
248
249
#if USE_ITT_BUILD && USE_ITT_NOTIFY
250
// Barrier imbalance - save arrive time to the thread
251
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253
__itt_get_timestamp();
254
}
255
#endif
256
257
if (group_leader) {
258
// Start from the thread after the group leader
259
size_t group_start = tid + 1;
260
size_t group_end = tid + b->threads_per_group;
261
size_t threads_pending = 0;
262
263
if (group_end > nproc)
264
group_end = nproc;
265
do { // wait for threads in my group
266
threads_pending = 0;
267
// Check all the flags every time to avoid branch misspredict
268
for (size_t thr = group_start; thr < group_end; thr++) {
269
// Each thread uses a different cache line
270
threads_pending += b->flags[my_current_iter][thr].stillNeed;
271
}
272
// Execute tasks here
273
if (__kmp_tasking_mode != tskm_immediate_exec) {
274
kmp_task_team_t *task_team = this_thr->th.th_task_team;
275
if (task_team != NULL) {
276
if (TCR_SYNC_4(task_team->tt.tt_active)) {
277
if (KMP_TASKING_ENABLED(task_team)) {
278
int tasks_completed = FALSE;
279
__kmp_atomic_execute_tasks_64(
280
this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281
&tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282
} else
283
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284
}
285
} else {
286
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287
} // if
288
}
289
if (TCR_4(__kmp_global.g.g_done)) {
290
if (__kmp_global.g.g_abort)
291
__kmp_abort_thread();
292
break;
293
} else if (__kmp_tasking_mode != tskm_immediate_exec &&
294
this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295
this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296
}
297
} while (threads_pending > 0);
298
299
if (reduce) { // Perform reduction if needed
300
OMPT_REDUCTION_DECL(this_thr, gtid);
301
OMPT_REDUCTION_BEGIN;
302
// Group leader reduces all threads in group
303
for (size_t thr = group_start; thr < group_end; thr++) {
304
(*reduce)(this_thr->th.th_local.reduce_data,
305
other_threads[thr]->th.th_local.reduce_data);
306
}
307
OMPT_REDUCTION_END;
308
}
309
310
// Set flag for next iteration
311
b->flags[my_next_iter][tid].stillNeed = 1;
312
// Each thread uses a different cache line; resets stillNeed to 0 to
313
// indicate it has reached the barrier
314
b->flags[my_current_iter][tid].stillNeed = 0;
315
316
do { // wait for all group leaders
317
threads_pending = 0;
318
for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319
threads_pending += b->flags[my_current_iter][thr].stillNeed;
320
}
321
// Execute tasks here
322
if (__kmp_tasking_mode != tskm_immediate_exec) {
323
kmp_task_team_t *task_team = this_thr->th.th_task_team;
324
if (task_team != NULL) {
325
if (TCR_SYNC_4(task_team->tt.tt_active)) {
326
if (KMP_TASKING_ENABLED(task_team)) {
327
int tasks_completed = FALSE;
328
__kmp_atomic_execute_tasks_64(
329
this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330
&tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331
} else
332
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333
}
334
} else {
335
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336
} // if
337
}
338
if (TCR_4(__kmp_global.g.g_done)) {
339
if (__kmp_global.g.g_abort)
340
__kmp_abort_thread();
341
break;
342
} else if (__kmp_tasking_mode != tskm_immediate_exec &&
343
this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344
this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345
}
346
} while (threads_pending > 0);
347
348
if (reduce) { // Perform reduction if needed
349
if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350
OMPT_REDUCTION_DECL(this_thr, gtid);
351
OMPT_REDUCTION_BEGIN;
352
for (size_t thr = b->threads_per_group; thr < nproc;
353
thr += b->threads_per_group) {
354
(*reduce)(this_thr->th.th_local.reduce_data,
355
other_threads[thr]->th.th_local.reduce_data);
356
}
357
OMPT_REDUCTION_END;
358
}
359
}
360
} else {
361
// Set flag for next iteration
362
b->flags[my_next_iter][tid].stillNeed = 1;
363
// Each thread uses a different cache line; resets stillNeed to 0 to
364
// indicate it has reached the barrier
365
b->flags[my_current_iter][tid].stillNeed = 0;
366
}
367
368
KMP_MFENCE();
369
370
KA_TRACE(20,
371
("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372
gtid, team->t.t_id, tid, bt));
373
}
374
375
static void __kmp_dist_barrier_release(
376
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379
kmp_team_t *team;
380
distributedBarrier *b;
381
kmp_bstate_t *thr_bar;
382
kmp_uint64 my_current_iter, next_go;
383
size_t my_go_index;
384
bool group_leader;
385
386
KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387
gtid, tid, bt));
388
389
thr_bar = &this_thr->th.th_bar[bt].bb;
390
391
if (!KMP_MASTER_TID(tid)) {
392
// workers and non-master group leaders need to check their presence in team
393
do {
394
if (this_thr->th.th_used_in_team.load() != 1 &&
395
this_thr->th.th_used_in_team.load() != 3) {
396
// Thread is not in use in a team. Wait on location in tid's thread
397
// struct. The 0 value tells anyone looking that this thread is spinning
398
// or sleeping until this location becomes 3 again; 3 is the transition
399
// state to get to 1 which is waiting on go and being in the team
400
kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401
if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402
0) ||
403
this_thr->th.th_used_in_team.load() == 0) {
404
my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405
}
406
#if USE_ITT_BUILD && USE_ITT_NOTIFY
407
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408
// In fork barrier where we could not get the object reliably
409
itt_sync_obj =
410
__kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411
// Cancel wait on previous parallel region...
412
__kmp_itt_task_starting(itt_sync_obj);
413
414
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415
return;
416
417
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418
if (itt_sync_obj != NULL)
419
// Call prepare as early as possible for "new" barrier
420
__kmp_itt_task_finished(itt_sync_obj);
421
} else
422
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424
return;
425
}
426
if (this_thr->th.th_used_in_team.load() != 1 &&
427
this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428
continue;
429
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430
return;
431
432
// At this point, the thread thinks it is in use in a team, or in
433
// transition to be used in a team, but it might have reached this barrier
434
// before it was marked unused by the team. Unused threads are awoken and
435
// shifted to wait on local thread struct elsewhere. It also might reach
436
// this point by being picked up for use by a different team. Either way,
437
// we need to update the tid.
438
tid = __kmp_tid_from_gtid(gtid);
439
team = this_thr->th.th_team;
440
KMP_DEBUG_ASSERT(tid >= 0);
441
KMP_DEBUG_ASSERT(team);
442
b = team->t.b;
443
my_current_iter = b->iter[tid].iter;
444
next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445
my_go_index = tid / b->threads_per_go;
446
if (this_thr->th.th_used_in_team.load() == 3) {
447
KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
448
}
449
// Check if go flag is set
450
if (b->go[my_go_index].go.load() != next_go) {
451
// Wait on go flag on team
452
kmp_atomic_flag_64<false, true> my_flag(
453
&(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454
my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455
KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
456
b->iter[tid].iter == 0);
457
KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458
}
459
460
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461
return;
462
// At this point, the thread's go location was set. This means the primary
463
// thread is safely in the barrier, and so this thread's data is
464
// up-to-date, but we should check again that this thread is really in
465
// use in the team, as it could have been woken up for the purpose of
466
// changing team size, or reaping threads at shutdown.
467
if (this_thr->th.th_used_in_team.load() == 1)
468
break;
469
} while (1);
470
471
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472
return;
473
474
group_leader = ((tid % b->threads_per_group) == 0);
475
if (group_leader) {
476
// Tell all the threads in my group they can go!
477
for (size_t go_idx = my_go_index + 1;
478
go_idx < my_go_index + b->gos_per_group; go_idx++) {
479
b->go[go_idx].go.store(next_go);
480
}
481
// Fence added so that workers can see changes to go. sfence inadequate.
482
KMP_MFENCE();
483
}
484
485
#if KMP_BARRIER_ICV_PUSH
486
if (propagate_icvs) { // copy ICVs to final dest
487
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
488
tid, FALSE);
489
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
490
(kmp_internal_control_t *)team->t.b->team_icvs);
491
copy_icvs(&thr_bar->th_fixed_icvs,
492
&team->t.t_implicit_task_taskdata[tid].td_icvs);
493
}
494
#endif
495
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496
// This thread is now awake and participating in the barrier;
497
// wake up the other threads in the group
498
size_t nproc = this_thr->th.th_team_nproc;
499
size_t group_end = tid + b->threads_per_group;
500
if (nproc < group_end)
501
group_end = nproc;
502
__kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503
}
504
} else { // Primary thread
505
team = this_thr->th.th_team;
506
b = team->t.b;
507
my_current_iter = b->iter[tid].iter;
508
next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509
#if KMP_BARRIER_ICV_PUSH
510
if (propagate_icvs) {
511
// primary thread has ICVs in final destination; copy
512
copy_icvs(&thr_bar->th_fixed_icvs,
513
&team->t.t_implicit_task_taskdata[tid].td_icvs);
514
}
515
#endif
516
// Tell all the group leaders they can go!
517
for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518
b->go[go_idx].go.store(next_go);
519
}
520
521
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522
// Wake-up the group leaders
523
size_t nproc = this_thr->th.th_team_nproc;
524
__kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
525
b->threads_per_group, tid);
526
}
527
528
// Tell all the threads in my group they can go!
529
for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
530
b->go[go_idx].go.store(next_go);
531
}
532
533
// Fence added so that workers can see changes to go. sfence inadequate.
534
KMP_MFENCE();
535
536
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537
// Wake-up the other threads in my group
538
size_t nproc = this_thr->th.th_team_nproc;
539
size_t group_end = tid + b->threads_per_group;
540
if (nproc < group_end)
541
group_end = nproc;
542
__kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543
}
544
}
545
// Update to next iteration
546
KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547
b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
548
549
KA_TRACE(
550
20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551
gtid, team->t.t_id, tid, bt));
552
}
553
554
// Linear Barrier
555
template <bool cancellable = false>
556
static bool __kmp_linear_barrier_gather_template(
557
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
558
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
559
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560
kmp_team_t *team = this_thr->th.th_team;
561
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562
kmp_info_t **other_threads = team->t.t_threads;
563
564
KA_TRACE(
565
20,
566
("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567
gtid, team->t.t_id, tid, bt));
568
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569
570
#if USE_ITT_BUILD && USE_ITT_NOTIFY
571
// Barrier imbalance - save arrive time to the thread
572
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
573
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574
__itt_get_timestamp();
575
}
576
#endif
577
// We now perform a linear reduction to signal that all of the threads have
578
// arrived.
579
if (!KMP_MASTER_TID(tid)) {
580
KA_TRACE(20,
581
("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582
"arrived(%p): %llu => %llu\n",
583
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
584
team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
585
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586
// Mark arrival to primary thread
587
/* After performing this write, a worker thread may not assume that the team
588
is valid any more - it could be deallocated by the primary thread at any
589
time. */
590
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
591
flag.release();
592
} else {
593
kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594
int nproc = this_thr->th.th_team_nproc;
595
int i;
596
// Don't have to worry about sleep bit here or atomic since team setting
597
kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598
599
// Collect all the worker team member threads.
600
for (i = 1; i < nproc; ++i) {
601
#if KMP_CACHE_MANAGE
602
// Prefetch next thread's arrived count
603
if (i + 1 < nproc)
604
KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
605
#endif /* KMP_CACHE_MANAGE */
606
KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607
"arrived(%p) == %llu\n",
608
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609
team->t.t_id, i,
610
&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611
612
// Wait for worker thread to arrive
613
if (cancellable) {
614
kmp_flag_64<true, false> flag(
615
&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616
if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617
return true;
618
} else {
619
kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620
new_state);
621
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622
}
623
#if USE_ITT_BUILD && USE_ITT_NOTIFY
624
// Barrier imbalance - write min of the thread time and the other thread
625
// time to the thread.
626
if (__kmp_forkjoin_frames_mode == 2) {
627
this_thr->th.th_bar_min_time = KMP_MIN(
628
this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629
}
630
#endif
631
if (reduce) {
632
KA_TRACE(100,
633
("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635
team->t.t_id, i));
636
OMPT_REDUCTION_DECL(this_thr, gtid);
637
OMPT_REDUCTION_BEGIN;
638
(*reduce)(this_thr->th.th_local.reduce_data,
639
other_threads[i]->th.th_local.reduce_data);
640
OMPT_REDUCTION_END;
641
}
642
}
643
// Don't have to worry about sleep bit here or atomic since team setting
644
team_bar->b_arrived = new_state;
645
KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646
"arrived(%p) = %llu\n",
647
gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648
new_state));
649
}
650
KA_TRACE(
651
20,
652
("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653
gtid, team->t.t_id, tid, bt));
654
return false;
655
}
656
657
template <bool cancellable = false>
658
static bool __kmp_linear_barrier_release_template(
659
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
660
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663
kmp_team_t *team;
664
665
if (KMP_MASTER_TID(tid)) {
666
unsigned int i;
667
kmp_uint32 nproc = this_thr->th.th_team_nproc;
668
kmp_info_t **other_threads;
669
670
team = __kmp_threads[gtid]->th.th_team;
671
KMP_DEBUG_ASSERT(team != NULL);
672
other_threads = team->t.t_threads;
673
674
KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675
"barrier type %d\n",
676
gtid, team->t.t_id, tid, bt));
677
678
if (nproc > 1) {
679
#if KMP_BARRIER_ICV_PUSH
680
{
681
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
682
if (propagate_icvs) {
683
ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
684
for (i = 1; i < nproc; ++i) {
685
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
686
team, i, FALSE);
687
ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688
&team->t.t_implicit_task_taskdata[0].td_icvs);
689
}
690
ngo_sync();
691
}
692
}
693
#endif // KMP_BARRIER_ICV_PUSH
694
695
// Now, release all of the worker threads
696
for (i = 1; i < nproc; ++i) {
697
#if KMP_CACHE_MANAGE
698
// Prefetch next thread's go flag
699
if (i + 1 < nproc)
700
KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
701
#endif /* KMP_CACHE_MANAGE */
702
KA_TRACE(
703
20,
704
("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705
"go(%p): %u => %u\n",
706
gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707
team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708
other_threads[i]->th.th_bar[bt].bb.b_go,
709
other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710
kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711
other_threads[i]);
712
flag.release();
713
}
714
}
715
} else { // Wait for the PRIMARY thread to release us
716
KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717
gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718
if (cancellable) {
719
kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
720
if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721
return true;
722
} else {
723
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725
}
726
#if USE_ITT_BUILD && USE_ITT_NOTIFY
727
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
728
// In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729
// disabled)
730
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
731
// Cancel wait on previous parallel region...
732
__kmp_itt_task_starting(itt_sync_obj);
733
734
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735
return false;
736
737
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738
if (itt_sync_obj != NULL)
739
// Call prepare as early as possible for "new" barrier
740
__kmp_itt_task_finished(itt_sync_obj);
741
} else
742
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743
// Early exit for reaping threads releasing forkjoin barrier
744
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745
return false;
746
// The worker thread may now assume that the team is valid.
747
#ifdef KMP_DEBUG
748
tid = __kmp_tid_from_gtid(gtid);
749
team = __kmp_threads[gtid]->th.th_team;
750
#endif
751
KMP_DEBUG_ASSERT(team != NULL);
752
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753
KA_TRACE(20,
754
("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756
KMP_MB(); // Flush all pending memory write invalidates.
757
}
758
KA_TRACE(
759
20,
760
("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761
gtid, team->t.t_id, tid, bt));
762
return false;
763
}
764
765
static void __kmp_linear_barrier_gather(
766
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
767
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
768
__kmp_linear_barrier_gather_template<false>(
769
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770
}
771
772
static bool __kmp_linear_barrier_gather_cancellable(
773
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
774
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
775
return __kmp_linear_barrier_gather_template<true>(
776
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777
}
778
779
static void __kmp_linear_barrier_release(
780
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
781
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782
__kmp_linear_barrier_release_template<false>(
783
bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784
}
785
786
static bool __kmp_linear_barrier_release_cancellable(
787
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
788
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789
return __kmp_linear_barrier_release_template<true>(
790
bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791
}
792
793
// Tree barrier
794
static void __kmp_tree_barrier_gather(
795
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
796
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
797
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798
kmp_team_t *team = this_thr->th.th_team;
799
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800
kmp_info_t **other_threads = team->t.t_threads;
801
kmp_uint32 nproc = this_thr->th.th_team_nproc;
802
kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803
kmp_uint32 branch_factor = 1 << branch_bits;
804
kmp_uint32 child;
805
kmp_uint32 child_tid;
806
kmp_uint64 new_state = 0;
807
808
KA_TRACE(
809
20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810
gtid, team->t.t_id, tid, bt));
811
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812
813
#if USE_ITT_BUILD && USE_ITT_NOTIFY
814
// Barrier imbalance - save arrive time to the thread
815
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
816
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817
__itt_get_timestamp();
818
}
819
#endif
820
// Perform tree gather to wait until all threads have arrived; reduce any
821
// required data as we go
822
child_tid = (tid << branch_bits) + 1;
823
if (child_tid < nproc) {
824
// Parent threads wait for all their children to arrive
825
new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826
child = 1;
827
do {
828
kmp_info_t *child_thr = other_threads[child_tid];
829
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830
#if KMP_CACHE_MANAGE
831
// Prefetch next thread's arrived count
832
if (child + 1 <= branch_factor && child_tid + 1 < nproc)
833
KMP_CACHE_PREFETCH(
834
&other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
835
#endif /* KMP_CACHE_MANAGE */
836
KA_TRACE(20,
837
("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838
"arrived(%p) == %llu\n",
839
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840
team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841
// Wait for child to arrive
842
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844
#if USE_ITT_BUILD && USE_ITT_NOTIFY
845
// Barrier imbalance - write min of the thread time and a child time to
846
// the thread.
847
if (__kmp_forkjoin_frames_mode == 2) {
848
this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849
child_thr->th.th_bar_min_time);
850
}
851
#endif
852
if (reduce) {
853
KA_TRACE(100,
854
("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856
team->t.t_id, child_tid));
857
OMPT_REDUCTION_DECL(this_thr, gtid);
858
OMPT_REDUCTION_BEGIN;
859
(*reduce)(this_thr->th.th_local.reduce_data,
860
child_thr->th.th_local.reduce_data);
861
OMPT_REDUCTION_END;
862
}
863
child++;
864
child_tid++;
865
} while (child <= branch_factor && child_tid < nproc);
866
}
867
868
if (!KMP_MASTER_TID(tid)) { // Worker threads
869
kmp_int32 parent_tid = (tid - 1) >> branch_bits;
870
871
KA_TRACE(20,
872
("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873
"arrived(%p): %llu => %llu\n",
874
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875
team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877
878
// Mark arrival to parent thread
879
/* After performing this write, a worker thread may not assume that the team
880
is valid any more - it could be deallocated by the primary thread at any
881
time. */
882
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883
flag.release();
884
} else {
885
// Need to update the team arrived pointer if we are the primary thread
886
if (nproc > 1) // New value was already computed above
887
team->t.t_bar[bt].b_arrived = new_state;
888
else
889
team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890
KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891
"arrived(%p) = %llu\n",
892
gtid, team->t.t_id, tid, team->t.t_id,
893
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894
}
895
KA_TRACE(20,
896
("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897
gtid, team->t.t_id, tid, bt));
898
}
899
900
static void __kmp_tree_barrier_release(
901
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
902
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904
kmp_team_t *team;
905
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906
kmp_uint32 nproc;
907
kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908
kmp_uint32 branch_factor = 1 << branch_bits;
909
kmp_uint32 child;
910
kmp_uint32 child_tid;
911
912
// Perform a tree release for all of the threads that have been gathered
913
if (!KMP_MASTER_TID(
914
tid)) { // Handle fork barrier workers who aren't part of a team yet
915
KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916
&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917
// Wait for parent thread to release us
918
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920
#if USE_ITT_BUILD && USE_ITT_NOTIFY
921
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
922
// In fork barrier where we could not get the object reliably (or
923
// ITTNOTIFY is disabled)
924
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
925
// Cancel wait on previous parallel region...
926
__kmp_itt_task_starting(itt_sync_obj);
927
928
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929
return;
930
931
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932
if (itt_sync_obj != NULL)
933
// Call prepare as early as possible for "new" barrier
934
__kmp_itt_task_finished(itt_sync_obj);
935
} else
936
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937
// Early exit for reaping threads releasing forkjoin barrier
938
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939
return;
940
941
// The worker thread may now assume that the team is valid.
942
team = __kmp_threads[gtid]->th.th_team;
943
KMP_DEBUG_ASSERT(team != NULL);
944
tid = __kmp_tid_from_gtid(gtid);
945
946
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947
KA_TRACE(20,
948
("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949
team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950
KMP_MB(); // Flush all pending memory write invalidates.
951
} else {
952
team = __kmp_threads[gtid]->th.th_team;
953
KMP_DEBUG_ASSERT(team != NULL);
954
KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955
"barrier type %d\n",
956
gtid, team->t.t_id, tid, bt));
957
}
958
nproc = this_thr->th.th_team_nproc;
959
child_tid = (tid << branch_bits) + 1;
960
961
if (child_tid < nproc) {
962
kmp_info_t **other_threads = team->t.t_threads;
963
child = 1;
964
// Parent threads release all their children
965
do {
966
kmp_info_t *child_thr = other_threads[child_tid];
967
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968
#if KMP_CACHE_MANAGE
969
// Prefetch next thread's go count
970
if (child + 1 <= branch_factor && child_tid + 1 < nproc)
971
KMP_CACHE_PREFETCH(
972
&other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
973
#endif /* KMP_CACHE_MANAGE */
974
975
#if KMP_BARRIER_ICV_PUSH
976
{
977
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
978
if (propagate_icvs) {
979
__kmp_init_implicit_task(team->t.t_ident,
980
team->t.t_threads[child_tid], team,
981
child_tid, FALSE);
982
copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983
&team->t.t_implicit_task_taskdata[0].td_icvs);
984
}
985
}
986
#endif // KMP_BARRIER_ICV_PUSH
987
KA_TRACE(20,
988
("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989
"go(%p): %u => %u\n",
990
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993
// Release child from barrier
994
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995
flag.release();
996
child++;
997
child_tid++;
998
} while (child <= branch_factor && child_tid < nproc);
999
}
1000
KA_TRACE(
1001
20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002
gtid, team->t.t_id, tid, bt));
1003
}
1004
1005
// Hyper Barrier
1006
static void __kmp_hyper_barrier_gather(
1007
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1009
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010
kmp_team_t *team = this_thr->th.th_team;
1011
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012
kmp_info_t **other_threads = team->t.t_threads;
1013
kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014
kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015
kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016
kmp_uint32 branch_factor = 1 << branch_bits;
1017
kmp_uint32 offset;
1018
kmp_uint32 level;
1019
1020
KA_TRACE(
1021
20,
1022
("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023
gtid, team->t.t_id, tid, bt));
1024
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025
1026
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1027
// Barrier imbalance - save arrive time to the thread
1028
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1029
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030
__itt_get_timestamp();
1031
}
1032
#endif
1033
/* Perform a hypercube-embedded tree gather to wait until all of the threads
1034
have arrived, and reduce any required data as we go. */
1035
kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036
for (level = 0, offset = 1; offset < num_threads;
1037
level += branch_bits, offset <<= branch_bits) {
1038
kmp_uint32 child;
1039
kmp_uint32 child_tid;
1040
1041
if (((tid >> level) & (branch_factor - 1)) != 0) {
1042
kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1043
1044
KMP_MB(); // Synchronize parent and child threads.
1045
KA_TRACE(20,
1046
("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047
"arrived(%p): %llu => %llu\n",
1048
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049
team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050
thr_bar->b_arrived,
1051
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052
// Mark arrival to parent thread
1053
/* After performing this write (in the last iteration of the enclosing for
1054
loop), a worker thread may not assume that the team is valid any more
1055
- it could be deallocated by the primary thread at any time. */
1056
p_flag.set_waiter(other_threads[parent_tid]);
1057
p_flag.release();
1058
break;
1059
}
1060
1061
// Parent threads wait for children to arrive
1062
if (new_state == KMP_BARRIER_UNUSED_STATE)
1063
new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064
for (child = 1, child_tid = tid + (1 << level);
1065
child < branch_factor && child_tid < num_threads;
1066
child++, child_tid += (1 << level)) {
1067
kmp_info_t *child_thr = other_threads[child_tid];
1068
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069
#if KMP_CACHE_MANAGE
1070
kmp_uint32 next_child_tid = child_tid + (1 << level);
1071
// Prefetch next thread's arrived count
1072
if (child + 1 < branch_factor && next_child_tid < num_threads)
1073
KMP_CACHE_PREFETCH(
1074
&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075
#endif /* KMP_CACHE_MANAGE */
1076
KA_TRACE(20,
1077
("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078
"arrived(%p) == %llu\n",
1079
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080
team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081
// Wait for child to arrive
1082
kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083
c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084
KMP_MB(); // Synchronize parent and child threads.
1085
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1086
// Barrier imbalance - write min of the thread time and a child time to
1087
// the thread.
1088
if (__kmp_forkjoin_frames_mode == 2) {
1089
this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090
child_thr->th.th_bar_min_time);
1091
}
1092
#endif
1093
if (reduce) {
1094
KA_TRACE(100,
1095
("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097
team->t.t_id, child_tid));
1098
OMPT_REDUCTION_DECL(this_thr, gtid);
1099
OMPT_REDUCTION_BEGIN;
1100
(*reduce)(this_thr->th.th_local.reduce_data,
1101
child_thr->th.th_local.reduce_data);
1102
OMPT_REDUCTION_END;
1103
}
1104
}
1105
}
1106
1107
if (KMP_MASTER_TID(tid)) {
1108
// Need to update the team arrived pointer if we are the primary thread
1109
if (new_state == KMP_BARRIER_UNUSED_STATE)
1110
team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111
else
1112
team->t.t_bar[bt].b_arrived = new_state;
1113
KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114
"arrived(%p) = %llu\n",
1115
gtid, team->t.t_id, tid, team->t.t_id,
1116
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117
}
1118
KA_TRACE(
1119
20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120
gtid, team->t.t_id, tid, bt));
1121
}
1122
1123
// The reverse versions seem to beat the forward versions overall
1124
#define KMP_REVERSE_HYPER_BAR
1125
static void __kmp_hyper_barrier_release(
1126
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1127
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129
kmp_team_t *team;
1130
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131
kmp_info_t **other_threads;
1132
kmp_uint32 num_threads;
1133
kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134
kmp_uint32 branch_factor = 1 << branch_bits;
1135
kmp_uint32 child;
1136
kmp_uint32 child_tid;
1137
kmp_uint32 offset;
1138
kmp_uint32 level;
1139
1140
/* Perform a hypercube-embedded tree release for all of the threads that have
1141
been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142
are released in the reverse order of the corresponding gather, otherwise
1143
threads are released in the same order. */
1144
if (KMP_MASTER_TID(tid)) { // primary thread
1145
team = __kmp_threads[gtid]->th.th_team;
1146
KMP_DEBUG_ASSERT(team != NULL);
1147
KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148
"barrier type %d\n",
1149
gtid, team->t.t_id, tid, bt));
1150
#if KMP_BARRIER_ICV_PUSH
1151
if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152
copy_icvs(&thr_bar->th_fixed_icvs,
1153
&team->t.t_implicit_task_taskdata[tid].td_icvs);
1154
}
1155
#endif
1156
} else { // Handle fork barrier workers who aren't part of a team yet
1157
KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158
&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159
// Wait for parent thread to release us
1160
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1163
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1164
// In fork barrier where we could not get the object reliably
1165
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1166
// Cancel wait on previous parallel region...
1167
__kmp_itt_task_starting(itt_sync_obj);
1168
1169
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170
return;
1171
1172
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173
if (itt_sync_obj != NULL)
1174
// Call prepare as early as possible for "new" barrier
1175
__kmp_itt_task_finished(itt_sync_obj);
1176
} else
1177
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178
// Early exit for reaping threads releasing forkjoin barrier
1179
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180
return;
1181
1182
// The worker thread may now assume that the team is valid.
1183
team = __kmp_threads[gtid]->th.th_team;
1184
KMP_DEBUG_ASSERT(team != NULL);
1185
tid = __kmp_tid_from_gtid(gtid);
1186
1187
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188
KA_TRACE(20,
1189
("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191
KMP_MB(); // Flush all pending memory write invalidates.
1192
}
1193
num_threads = this_thr->th.th_team_nproc;
1194
other_threads = team->t.t_threads;
1195
1196
#ifdef KMP_REVERSE_HYPER_BAR
1197
// Count up to correct level for parent
1198
for (level = 0, offset = 1;
1199
offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1200
level += branch_bits, offset <<= branch_bits)
1201
;
1202
1203
// Now go down from there
1204
for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1205
level -= branch_bits, offset >>= branch_bits)
1206
#else
1207
// Go down the tree, level by level
1208
for (level = 0, offset = 1; offset < num_threads;
1209
level += branch_bits, offset <<= branch_bits)
1210
#endif // KMP_REVERSE_HYPER_BAR
1211
{
1212
#ifdef KMP_REVERSE_HYPER_BAR
1213
/* Now go in reverse order through the children, highest to lowest.
1214
Initial setting of child is conservative here. */
1215
child = num_threads >> ((level == 0) ? level : level - 1);
1216
for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1217
child_tid = tid + (child << level);
1218
child >= 1; child--, child_tid -= (1 << level))
1219
#else
1220
if (((tid >> level) & (branch_factor - 1)) != 0)
1221
// No need to go lower than this, since this is the level parent would be
1222
// notified
1223
break;
1224
// Iterate through children on this level of the tree
1225
for (child = 1, child_tid = tid + (1 << level);
1226
child < branch_factor && child_tid < num_threads;
1227
child++, child_tid += (1 << level))
1228
#endif // KMP_REVERSE_HYPER_BAR
1229
{
1230
if (child_tid >= num_threads)
1231
continue; // Child doesn't exist so keep going
1232
else {
1233
kmp_info_t *child_thr = other_threads[child_tid];
1234
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235
#if KMP_CACHE_MANAGE
1236
kmp_uint32 next_child_tid = child_tid - (1 << level);
1237
// Prefetch next thread's go count
1238
#ifdef KMP_REVERSE_HYPER_BAR
1239
if (child - 1 >= 1 && next_child_tid < num_threads)
1240
#else
1241
if (child + 1 < branch_factor && next_child_tid < num_threads)
1242
#endif // KMP_REVERSE_HYPER_BAR
1243
KMP_CACHE_PREFETCH(
1244
&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245
#endif /* KMP_CACHE_MANAGE */
1246
1247
#if KMP_BARRIER_ICV_PUSH
1248
if (propagate_icvs) // push my fixed ICVs to my child
1249
copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250
#endif // KMP_BARRIER_ICV_PUSH
1251
1252
KA_TRACE(
1253
20,
1254
("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255
"go(%p): %u => %u\n",
1256
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259
// Release child from barrier
1260
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261
flag.release();
1262
}
1263
}
1264
}
1265
#if KMP_BARRIER_ICV_PUSH
1266
if (propagate_icvs &&
1267
!KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1269
FALSE);
1270
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1271
&thr_bar->th_fixed_icvs);
1272
}
1273
#endif
1274
KA_TRACE(
1275
20,
1276
("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277
gtid, team->t.t_id, tid, bt));
1278
}
1279
1280
// Hierarchical Barrier
1281
1282
// Initialize thread barrier data
1283
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284
Performs the minimum amount of initialization required based on how the team
1285
has changed. Returns true if leaf children will require both on-core and
1286
traditional wake-up mechanisms. For example, if the team size increases,
1287
threads already in the team will respond to on-core wakeup on their parent
1288
thread, but threads newly added to the team will only be listening on the
1289
their local b_go. */
1290
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291
kmp_bstate_t *thr_bar,
1292
kmp_uint32 nproc, int gtid,
1293
int tid, kmp_team_t *team) {
1294
// Checks to determine if (re-)initialization is needed
1295
bool uninitialized = thr_bar->team == NULL;
1296
bool team_changed = team != thr_bar->team;
1297
bool team_sz_changed = nproc != thr_bar->nproc;
1298
bool tid_changed = tid != thr_bar->old_tid;
1299
bool retval = false;
1300
1301
if (uninitialized || team_sz_changed) {
1302
__kmp_get_hierarchy(nproc, thr_bar);
1303
}
1304
1305
if (uninitialized || team_sz_changed || tid_changed) {
1306
thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1307
thr_bar->parent_tid = -1; // default for primary thread
1308
if (!KMP_MASTER_TID(tid)) {
1309
// if not primary thread, find parent thread in hierarchy
1310
kmp_uint32 d = 0;
1311
while (d < thr_bar->depth) { // find parent based on level of thread in
1312
// hierarchy, and note level
1313
kmp_uint32 rem;
1314
if (d == thr_bar->depth - 2) { // reached level right below the primary
1315
thr_bar->parent_tid = 0;
1316
thr_bar->my_level = d;
1317
break;
1318
} else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1319
// TODO: can we make the above op faster?
1320
// thread is not a subtree root at next level, so this is max
1321
thr_bar->parent_tid = tid - rem;
1322
thr_bar->my_level = d;
1323
break;
1324
}
1325
++d;
1326
}
1327
}
1328
__kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1329
(thr_bar->skip_per_level[thr_bar->my_level])),
1330
&(thr_bar->offset));
1331
thr_bar->old_tid = tid;
1332
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333
thr_bar->team = team;
1334
thr_bar->parent_bar =
1335
&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336
}
1337
if (uninitialized || team_changed || tid_changed) {
1338
thr_bar->team = team;
1339
thr_bar->parent_bar =
1340
&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341
retval = true;
1342
}
1343
if (uninitialized || team_sz_changed || tid_changed) {
1344
thr_bar->nproc = nproc;
1345
thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346
if (thr_bar->my_level == 0)
1347
thr_bar->leaf_kids = 0;
1348
if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1349
__kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350
thr_bar->leaf_state = 0;
1351
for (int i = 0; i < thr_bar->leaf_kids; ++i)
1352
((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1353
}
1354
return retval;
1355
}
1356
1357
static void __kmp_hierarchical_barrier_gather(
1358
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361
kmp_team_t *team = this_thr->th.th_team;
1362
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363
kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364
kmp_info_t **other_threads = team->t.t_threads;
1365
kmp_uint64 new_state = 0;
1366
1367
int level = team->t.t_level;
1368
if (other_threads[0]
1369
->th.th_teams_microtask) // are we inside the teams construct?
1370
if (this_thr->th.th_teams_size.nteams > 1)
1371
++level; // level was not increased in teams construct for team_of_masters
1372
if (level == 1)
1373
thr_bar->use_oncore_barrier = 1;
1374
else
1375
thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376
1377
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378
"barrier type %d\n",
1379
gtid, team->t.t_id, tid, bt));
1380
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381
1382
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1383
// Barrier imbalance - save arrive time to the thread
1384
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385
this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386
}
1387
#endif
1388
1389
(void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390
team);
1391
1392
if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393
kmp_int32 child_tid;
1394
new_state =
1395
(kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397
thr_bar->use_oncore_barrier) {
1398
if (thr_bar->leaf_kids) {
1399
// First, wait for leaf children to check-in on my b_arrived flag
1400
kmp_uint64 leaf_state =
1401
KMP_MASTER_TID(tid)
1402
? thr_bar->b_arrived | thr_bar->leaf_state
1403
: team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1404
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405
"for leaf kids\n",
1406
gtid, team->t.t_id, tid));
1407
kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409
if (reduce) {
1410
OMPT_REDUCTION_DECL(this_thr, gtid);
1411
OMPT_REDUCTION_BEGIN;
1412
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413
++child_tid) {
1414
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415
"T#%d(%d:%d)\n",
1416
gtid, team->t.t_id, tid,
1417
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418
child_tid));
1419
(*reduce)(this_thr->th.th_local.reduce_data,
1420
other_threads[child_tid]->th.th_local.reduce_data);
1421
}
1422
OMPT_REDUCTION_END;
1423
}
1424
// clear leaf_state bits
1425
KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426
}
1427
// Next, wait for higher level children on each child's b_arrived flag
1428
for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429
++d) { // gather lowest level threads first, but skip 0
1430
kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431
skip = thr_bar->skip_per_level[d];
1432
if (last > nproc)
1433
last = nproc;
1434
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435
kmp_info_t *child_thr = other_threads[child_tid];
1436
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438
"T#%d(%d:%d) "
1439
"arrived(%p) == %llu\n",
1440
gtid, team->t.t_id, tid,
1441
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442
child_tid, &child_bar->b_arrived, new_state));
1443
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445
if (reduce) {
1446
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447
"T#%d(%d:%d)\n",
1448
gtid, team->t.t_id, tid,
1449
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450
child_tid));
1451
(*reduce)(this_thr->th.th_local.reduce_data,
1452
child_thr->th.th_local.reduce_data);
1453
}
1454
}
1455
}
1456
} else { // Blocktime is not infinite
1457
for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458
++d) { // Gather lowest level threads first
1459
kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460
skip = thr_bar->skip_per_level[d];
1461
if (last > nproc)
1462
last = nproc;
1463
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464
kmp_info_t *child_thr = other_threads[child_tid];
1465
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467
"T#%d(%d:%d) "
1468
"arrived(%p) == %llu\n",
1469
gtid, team->t.t_id, tid,
1470
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471
child_tid, &child_bar->b_arrived, new_state));
1472
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474
if (reduce) {
1475
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476
"T#%d(%d:%d)\n",
1477
gtid, team->t.t_id, tid,
1478
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479
child_tid));
1480
(*reduce)(this_thr->th.th_local.reduce_data,
1481
child_thr->th.th_local.reduce_data);
1482
}
1483
}
1484
}
1485
}
1486
}
1487
// All subordinates are gathered; now release parent if not primary thread
1488
1489
if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491
" T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492
gtid, team->t.t_id, tid,
1493
__kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494
thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496
/* Mark arrival to parent: After performing this write, a worker thread may
1497
not assume that the team is valid any more - it could be deallocated by
1498
the primary thread at any time. */
1499
if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500
!thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501
// flag; release it
1502
kmp_flag_64<> flag(&thr_bar->b_arrived,
1503
other_threads[thr_bar->parent_tid]);
1504
flag.release();
1505
} else {
1506
// Leaf does special release on "offset" bits of parent's b_arrived flag
1507
thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508
kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509
thr_bar->offset + 1);
1510
flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511
flag.release();
1512
}
1513
} else { // Primary thread needs to update the team's b_arrived value
1514
team->t.t_bar[bt].b_arrived = new_state;
1515
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516
"arrived(%p) = %llu\n",
1517
gtid, team->t.t_id, tid, team->t.t_id,
1518
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519
}
1520
// Is the team access below unsafe or just technically invalid?
1521
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522
"barrier type %d\n",
1523
gtid, team->t.t_id, tid, bt));
1524
}
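// Illustrative sketch (not compiled into the runtime; a reading of the gather
// loops above, not a normative interface): skip_per_level[] encodes the tree
// shape, so a node at `tid` owns, at depth d, the ids tid + skip,
// tid + 2*skip, ... up to (but not including) tid + skip_per_level[d + 1],
// clamped to nproc. The oncore path starts at d == 1 because its leaf kids
// are handled separately through leaf_state.
#if 0
static void sketch_visit_children(int tid, int nproc, int my_level,
                                  const kmp_uint32 *skip_per_level) {
  for (int d = 0; d < my_level; ++d) { // lowest levels first
    kmp_uint32 skip = skip_per_level[d];
    kmp_uint32 last = tid + skip_per_level[d + 1];
    if (last > (kmp_uint32)nproc)
      last = nproc;
    for (int child_tid = tid + skip; child_tid < (int)last;
         child_tid += skip) {
      // gather from (or release) child_tid here
    }
  }
}
#endif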
1525
1526
static void __kmp_hierarchical_barrier_release(
1527
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1528
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530
kmp_team_t *team;
1531
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532
kmp_uint32 nproc;
1533
bool team_change = false; // indicates on-core barrier shouldn't be used
1534
1535
if (KMP_MASTER_TID(tid)) {
1536
team = __kmp_threads[gtid]->th.th_team;
1537
KMP_DEBUG_ASSERT(team != NULL);
1538
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539
"entered barrier type %d\n",
1540
gtid, team->t.t_id, tid, bt));
1541
} else { // Worker threads
1542
// Wait for parent thread to release me
1543
if (!thr_bar->use_oncore_barrier ||
1544
__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1545
thr_bar->team == NULL) {
1546
// Use traditional method of waiting on my own b_go flag
1547
thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550
TCW_8(thr_bar->b_go,
1551
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552
} else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553
// infinite, not nested
1554
// Wait on my "offset" bits on parent's b_go flag
1555
thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556
kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557
thr_bar->offset + 1, bt,
1558
this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559
flag.wait(this_thr, TRUE);
1560
if (thr_bar->wait_flag ==
1561
KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562
TCW_8(thr_bar->b_go,
1563
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564
} else { // Reset my bits on parent's b_go flag
1565
(RCAST(volatile char *,
1566
&(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1567
}
1568
}
1569
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570
// Early exit for reaping threads releasing forkjoin barrier
1571
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572
return;
1573
// The worker thread may now assume that the team is valid.
1574
team = __kmp_threads[gtid]->th.th_team;
1575
KMP_DEBUG_ASSERT(team != NULL);
1576
tid = __kmp_tid_from_gtid(gtid);
1577
1578
KA_TRACE(
1579
20,
1580
("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582
KMP_MB(); // Flush all pending memory write invalidates.
1583
}
1584
1585
nproc = this_thr->th.th_team_nproc;
1586
int level = team->t.t_level;
1587
if (team->t.t_threads[0]
1588
->th.th_teams_microtask) { // are we inside the teams construct?
1589
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590
this_thr->th.th_teams_level == level)
1591
++level; // level was not increased in teams construct for team_of_workers
1592
if (this_thr->th.th_teams_size.nteams > 1)
1593
++level; // level was not increased in teams construct for team_of_masters
1594
}
1595
if (level == 1)
1596
thr_bar->use_oncore_barrier = 1;
1597
else
1598
thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1599
1600
// If the team size has increased, we still communicate with old leaves via
1601
// oncore barrier.
1602
unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603
kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604
team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605
tid, team);
1606
// But if the entire team changes, we won't use oncore barrier at all
1607
if (team_change)
1608
old_leaf_kids = 0;
1609
1610
#if KMP_BARRIER_ICV_PUSH
1611
if (propagate_icvs) {
1612
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1613
FALSE);
1614
if (KMP_MASTER_TID(
1615
tid)) { // primary already has copy in final destination; copy
1616
copy_icvs(&thr_bar->th_fixed_icvs,
1617
&team->t.t_implicit_task_taskdata[tid].td_icvs);
1618
} else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619
thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620
if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621
// leaves (on-core children) pull parent's fixed ICVs directly to local
1622
// ICV store
1623
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1624
&thr_bar->parent_bar->th_fixed_icvs);
1625
// non-leaves will get ICVs piggybacked with b_go via NGO store
1626
} else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627
if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so children
1628
// can access them
1629
copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630
else // leaves copy parent's fixed ICVs directly to local ICV store
1631
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1632
&thr_bar->parent_bar->th_fixed_icvs);
1633
}
1634
}
1635
#endif // KMP_BARRIER_ICV_PUSH
1636
1637
// Now, release my children
1638
if (thr_bar->my_level) { // not a leaf
1639
kmp_int32 child_tid;
1640
kmp_uint32 last;
1641
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642
thr_bar->use_oncore_barrier) {
1643
if (KMP_MASTER_TID(tid)) { // do a flat release
1644
// Set local b_go to bump children via NGO store of the cache line
1645
// containing ICVs and b_go.
1646
thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647
// Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648
// the cache line
1649
ngo_load(&thr_bar->th_fixed_icvs);
1650
// This loops over all the threads skipping only the leaf nodes in the
1651
// hierarchy
1652
for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1653
child_tid += thr_bar->skip_per_level[1]) {
1654
kmp_bstate_t *child_bar =
1655
&team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657
"releasing T#%d(%d:%d)"
1658
" go(%p): %u => %u\n",
1659
gtid, team->t.t_id, tid,
1660
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661
child_tid, &child_bar->b_go, child_bar->b_go,
1662
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663
// Use ngo store (if available) to both store ICVs and release child
1664
// via child's b_go
1665
ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666
}
1667
ngo_sync();
1668
}
1669
TCW_8(thr_bar->b_go,
1670
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671
// Now, release leaf children
1672
if (thr_bar->leaf_kids) { // if there are any
1673
// We test team_change on the off-chance that the level 1 team changed.
1674
if (team_change ||
1675
old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676
if (old_leaf_kids) { // release old leaf kids
1677
thr_bar->b_go |= old_leaf_state;
1678
}
1679
// Release new leaf kids
1680
last = tid + thr_bar->skip_per_level[1];
1681
if (last > nproc)
1682
last = nproc;
1683
for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1684
++child_tid) { // skip_per_level[0]=1
1685
kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687
KA_TRACE(
1688
20,
1689
("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690
" T#%d(%d:%d) go(%p): %u => %u\n",
1691
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694
// Release child using child's b_go flag
1695
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696
flag.release();
1697
}
1698
} else { // Release all children at once with leaf_state bits on my own
1699
// b_go flag
1700
thr_bar->b_go |= thr_bar->leaf_state;
1701
}
1702
}
1703
} else { // Blocktime is not infinite; do a simple hierarchical release
1704
for (int d = thr_bar->my_level - 1; d >= 0;
1705
--d) { // Release highest level threads first
1706
last = tid + thr_bar->skip_per_level[d + 1];
1707
kmp_uint32 skip = thr_bar->skip_per_level[d];
1708
if (last > nproc)
1709
last = nproc;
1710
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711
kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714
"releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715
gtid, team->t.t_id, tid,
1716
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717
child_tid, &child_bar->b_go, child_bar->b_go,
1718
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719
// Release child using child's b_go flag
1720
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721
flag.release();
1722
}
1723
}
1724
}
1725
#if KMP_BARRIER_ICV_PUSH
1726
if (propagate_icvs && !KMP_MASTER_TID(tid))
1727
// non-leaves copy ICVs from fixed ICVs to local dest
1728
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1729
&thr_bar->th_fixed_icvs);
1730
#endif // KMP_BARRIER_ICV_PUSH
1731
}
1732
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733
"barrier type %d\n",
1734
gtid, team->t.t_id, tid, bt));
1735
}
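// Illustrative sketch (not compiled into the runtime; a reading of the oncore
// path above, not a normative interface): the oncore barrier treats the
// parent's 64-bit b_go as an array of per-leaf flag bytes, so the parent can
// release every leaf with a single update of its own b_go (the leaf_state OR
// above) while each leaf clears only its own byte, exactly as the RCAST to
// volatile char does in the release code.
#if 0
static void sketch_clear_my_oncore_byte(volatile kmp_uint64 *parent_b_go,
                                        kmp_uint32 offset) {
  // Reset only this leaf's byte; the other leaves' bytes stay set.
  (RCAST(volatile char *, parent_b_go))[offset + 1] = 0;
}
#endif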
1736
1737
// End of Barrier Algorithms
1738
1739
// type traits for cancellable value
1740
// if cancellable is true, then is_cancellable is a normal boolean variable
1741
// if cancellable is false, then is_cancellable is a compile-time constant
1742
template <bool cancellable> struct is_cancellable {};
1743
template <> struct is_cancellable<true> {
1744
bool value;
1745
is_cancellable() : value(false) {}
1746
is_cancellable(bool b) : value(b) {}
1747
is_cancellable &operator=(bool b) {
1748
value = b;
1749
return *this;
1750
}
1751
operator bool() const { return value; }
1752
};
1753
template <> struct is_cancellable<false> {
1754
is_cancellable &operator=(bool b) { return *this; }
1755
constexpr operator bool() const { return false; }
1756
};
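// Illustrative sketch (not compiled into the runtime): with cancellable ==
// false the conversion operator above is constexpr and always yields false,
// so a guard like the one below is folded away at compile time; with
// cancellable == true it simply reads the stored boolean. This is how
// __kmp_barrier_template pays for cancellation checks only in the
// cancellable instantiation.
#if 0
template <bool cancellable>
static int sketch_cancel_guard(bool step_cancelled) {
  is_cancellable<cancellable> cancelled;
  cancelled = step_cancelled; // discarded when cancellable == false
  if (cancelled)              // constant false when cancellable == false
    return 1;
  return 0;
}
#endif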
1757
1758
// Internal function to do a barrier.
1759
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1760
If reduce is non-NULL, do a split reduction barrier; otherwise, do a split
1761
barrier.
1762
When cancellable = false,
1763
returns 0 if primary thread, 1 if worker thread.
1764
When cancellable = true,
1765
returns 0 if not cancelled, 1 if cancelled. */
1766
template <bool cancellable = false>
1767
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768
size_t reduce_size, void *reduce_data,
1769
void (*reduce)(void *, void *)) {
1770
KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772
int tid = __kmp_tid_from_gtid(gtid);
1773
kmp_info_t *this_thr = __kmp_threads[gtid];
1774
kmp_team_t *team = this_thr->th.th_team;
1775
int status = 0;
1776
is_cancellable<cancellable> cancelled;
1777
#if OMPT_SUPPORT && OMPT_OPTIONAL
1778
ompt_data_t *my_task_data;
1779
ompt_data_t *my_parallel_data;
1780
void *return_address;
1781
ompt_sync_region_t barrier_kind;
1782
#endif
1783
1784
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785
__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786
1787
#if OMPT_SUPPORT
1788
if (ompt_enabled.enabled) {
1789
#if OMPT_OPTIONAL
1790
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793
barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794
if (ompt_enabled.ompt_callback_sync_region) {
1795
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796
barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797
return_address);
1798
}
1799
if (ompt_enabled.ompt_callback_sync_region_wait) {
1800
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801
barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802
return_address);
1803
}
1804
#endif
1805
// It is OK to report the barrier state after the barrier begin callback.
1806
// According to the OMPT specification, a compliant implementation may
1807
// even delay reporting this state until the barrier begins to wait.
1808
auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1809
switch (barrier_kind) {
1810
case ompt_sync_region_barrier_explicit:
1811
ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1812
break;
1813
case ompt_sync_region_barrier_implicit_workshare:
1814
ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1815
break;
1816
case ompt_sync_region_barrier_implicit_parallel:
1817
ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1818
break;
1819
case ompt_sync_region_barrier_teams:
1820
ompt_thr_info->state = ompt_state_wait_barrier_teams;
1821
break;
1822
case ompt_sync_region_barrier_implementation:
1823
[[fallthrough]];
1824
default:
1825
ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1826
}
1827
}
1828
#endif
1829
1830
if (!team->t.t_serialized) {
1831
#if USE_ITT_BUILD
1832
// This value will be used in itt notify events below.
1833
void *itt_sync_obj = NULL;
1834
#if USE_ITT_NOTIFY
1835
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836
itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1837
#endif
1838
#endif /* USE_ITT_BUILD */
1839
if (__kmp_tasking_mode == tskm_extra_barrier) {
1840
__kmp_tasking_barrier(team, this_thr, gtid);
1841
KA_TRACE(15,
1842
("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1843
__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1844
}
1845
1846
/* Copy the blocktime info to the thread, where __kmp_wait_template() can
1847
access it when the team struct is not guaranteed to exist. */
1848
// See note about the corresponding code in __kmp_join_barrier() being
1849
// performance-critical.
1850
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1851
#if KMP_USE_MONITOR
1852
this_thr->th.th_team_bt_intervals =
1853
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1854
this_thr->th.th_team_bt_set =
1855
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1856
#else
1857
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1858
#endif
1859
}
1860
1861
#if USE_ITT_BUILD
1862
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1863
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
1864
#endif /* USE_ITT_BUILD */
1865
#if USE_DEBUGGER
1866
// Let the debugger know: the thread has arrived at the barrier and is waiting.
1867
if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1868
team->t.t_bar[bt].b_master_arrived += 1;
1869
} else {
1870
this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1871
} // if
1872
#endif /* USE_DEBUGGER */
1873
if (reduce != NULL) {
1874
// KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1875
this_thr->th.th_local.reduce_data = reduce_data;
1876
}
1877
1878
if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1879
__kmp_task_team_setup(this_thr, team);
1880
1881
if (cancellable) {
1882
cancelled = __kmp_linear_barrier_gather_cancellable(
1883
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1884
} else {
1885
switch (__kmp_barrier_gather_pattern[bt]) {
1886
case bp_dist_bar: {
1887
__kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1888
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1889
break;
1890
}
1891
case bp_hyper_bar: {
1892
// don't set branch bits to 0; use linear
1893
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1894
__kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1895
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1896
break;
1897
}
1898
case bp_hierarchical_bar: {
1899
__kmp_hierarchical_barrier_gather(
1900
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1901
break;
1902
}
1903
case bp_tree_bar: {
1904
// don't set branch bits to 0; use linear
1905
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1906
__kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1907
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1908
break;
1909
}
1910
default: {
1911
__kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1912
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1913
}
1914
}
1915
}
1916
1917
KMP_MB();
1918
1919
if (KMP_MASTER_TID(tid)) {
1920
status = 0;
1921
if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1922
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1923
}
1924
#if USE_DEBUGGER
1925
// Let the debugger know: all threads have arrived and are starting to leave
1926
// the barrier.
1927
team->t.t_bar[bt].b_team_arrived += 1;
1928
#endif
1929
1930
if (__kmp_omp_cancellation) {
1931
kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1932
// Reset cancellation flag for worksharing constructs
1933
if (cancel_request == cancel_loop ||
1934
cancel_request == cancel_sections) {
1935
KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1936
}
1937
}
1938
#if USE_ITT_BUILD
1939
/* TODO: In case of split reduction barrier, primary thread may send
1940
acquired event early, before the final summation into the shared
1941
variable is done (final summation can be a long operation for array
1942
reductions). */
1943
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1944
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
1945
#endif /* USE_ITT_BUILD */
1946
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1947
// Barrier - report frame end (only if active_level == 1)
1948
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1949
__kmp_forkjoin_frames_mode &&
1950
(this_thr->th.th_teams_microtask == NULL || // either not in teams
1951
this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1952
team->t.t_active_level == 1) {
1953
ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1954
kmp_uint64 cur_time = __itt_get_timestamp();
1955
kmp_info_t **other_threads = team->t.t_threads;
1956
int nproc = this_thr->th.th_team_nproc;
1957
int i;
1958
switch (__kmp_forkjoin_frames_mode) {
1959
case 1:
1960
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1961
loc, nproc);
1962
this_thr->th.th_frame_time = cur_time;
1963
break;
1964
case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1965
// be fixed)
1966
__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1967
1, loc, nproc);
1968
break;
1969
case 3:
1970
if (__itt_metadata_add_ptr) {
1971
// Initialize with primary thread's wait time
1972
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1973
// Set arrive time to zero to be able to check it in
1974
// __kmp_invoke_task(); the same is done inside the loop below
1975
this_thr->th.th_bar_arrive_time = 0;
1976
for (i = 1; i < nproc; ++i) {
1977
delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1978
other_threads[i]->th.th_bar_arrive_time = 0;
1979
}
1980
__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1981
cur_time, delta,
1982
(kmp_uint64)(reduce != NULL));
1983
}
1984
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1985
loc, nproc);
1986
this_thr->th.th_frame_time = cur_time;
1987
break;
1988
}
1989
}
1990
#endif /* USE_ITT_BUILD */
1991
} else {
1992
status = 1;
1993
#if USE_ITT_BUILD
1994
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1995
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
1996
#endif /* USE_ITT_BUILD */
1997
}
1998
if ((status == 1 || !is_split) && !cancelled) {
1999
if (cancellable) {
2000
cancelled = __kmp_linear_barrier_release_cancellable(
2001
bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2002
} else {
2003
switch (__kmp_barrier_release_pattern[bt]) {
2004
case bp_dist_bar: {
2005
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2006
__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2007
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2008
break;
2009
}
2010
case bp_hyper_bar: {
2011
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2012
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2013
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2014
break;
2015
}
2016
case bp_hierarchical_bar: {
2017
__kmp_hierarchical_barrier_release(
2018
bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2019
break;
2020
}
2021
case bp_tree_bar: {
2022
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2023
__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2024
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2025
break;
2026
}
2027
default: {
2028
__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2029
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2030
}
2031
}
2032
}
2033
if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2034
__kmp_task_team_sync(this_thr, team);
2035
}
2036
}
2037
2038
#if USE_ITT_BUILD
2039
/* GEH: TODO: Move this under if-condition above and also include in
2040
__kmp_end_split_barrier(). This will more accurately represent the actual
2041
release time of the threads for split barriers. */
2042
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2043
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2044
#endif /* USE_ITT_BUILD */
2045
} else { // Team is serialized.
2046
status = 0;
2047
if (__kmp_tasking_mode != tskm_immediate_exec) {
2048
if (this_thr->th.th_task_team != NULL) {
2049
#if USE_ITT_NOTIFY
2050
void *itt_sync_obj = NULL;
2051
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2052
itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2053
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
2054
}
2055
#endif
2056
2057
KMP_DEBUG_ASSERT(
2058
this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2059
this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2060
TRUE);
2061
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2062
__kmp_task_team_setup(this_thr, team);
2063
2064
#if USE_ITT_BUILD
2065
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2066
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2067
#endif /* USE_ITT_BUILD */
2068
}
2069
}
2070
}
2071
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2072
gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2073
__kmp_tid_from_gtid(gtid), status));
2074
2075
#if OMPT_SUPPORT
2076
if (ompt_enabled.enabled) {
2077
#if OMPT_OPTIONAL
2078
if (ompt_enabled.ompt_callback_sync_region_wait) {
2079
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2080
barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2081
return_address);
2082
}
2083
if (ompt_enabled.ompt_callback_sync_region) {
2084
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2085
barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2086
return_address);
2087
}
2088
#endif
2089
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2090
}
2091
#endif
2092
2093
if (cancellable)
2094
return (int)cancelled;
2095
return status;
2096
}
2097
2098
// Returns 0 if primary thread, 1 if worker thread.
2099
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2100
size_t reduce_size, void *reduce_data,
2101
void (*reduce)(void *, void *)) {
2102
return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2103
reduce);
2104
}
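// Illustrative sketch (not compiled into the runtime): the simplest use of
// this entry point is a plain, non-split barrier with no reduction, in which
// case reduce_size, reduce_data and reduce are unused, mirroring the call in
// __kmp_barrier_gomp_cancel below.
#if 0
static void sketch_plain_barrier(int gtid) {
  // Returns 0 on the primary thread and 1 on workers.
  (void)__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}
#endif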
2105
2106
#if defined(KMP_GOMP_COMPAT)
2107
// Returns 1 if cancelled, 0 otherwise
2108
int __kmp_barrier_gomp_cancel(int gtid) {
2109
if (__kmp_omp_cancellation) {
2110
int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2111
0, NULL, NULL);
2112
if (cancelled) {
2113
int tid = __kmp_tid_from_gtid(gtid);
2114
kmp_info_t *this_thr = __kmp_threads[gtid];
2115
if (KMP_MASTER_TID(tid)) {
2116
// Primary thread does not need to revert anything
2117
} else {
2118
// Workers need to revert their private b_arrived flag
2119
this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2120
KMP_BARRIER_STATE_BUMP;
2121
}
2122
}
2123
return cancelled;
2124
}
2125
__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2126
return FALSE;
2127
}
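// Note on the revert above (a reading of the code, not normative): the
// cancellable gather bumps a worker's private b_arrived by
// KMP_BARRIER_STATE_BUMP as it checks in. When the barrier is cancelled, the
// matching release never consumes that bump, so the worker subtracts it again
// to keep its counter in step with what the next barrier instance will
// expect; the primary thread never advanced the team-wide counter, so it has
// nothing to undo.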
2128
#endif
2129
2130
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2131
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2132
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2133
KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2134
int tid = __kmp_tid_from_gtid(gtid);
2135
kmp_info_t *this_thr = __kmp_threads[gtid];
2136
kmp_team_t *team = this_thr->th.th_team;
2137
2138
if (!team->t.t_serialized) {
2139
if (KMP_MASTER_GTID(gtid)) {
2140
switch (__kmp_barrier_release_pattern[bt]) {
2141
case bp_dist_bar: {
2142
__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2143
FALSE USE_ITT_BUILD_ARG(NULL));
2144
break;
2145
}
2146
case bp_hyper_bar: {
2147
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2148
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2149
FALSE USE_ITT_BUILD_ARG(NULL));
2150
break;
2151
}
2152
case bp_hierarchical_bar: {
2153
__kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2154
FALSE USE_ITT_BUILD_ARG(NULL));
2155
break;
2156
}
2157
case bp_tree_bar: {
2158
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2159
__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2160
FALSE USE_ITT_BUILD_ARG(NULL));
2161
break;
2162
}
2163
default: {
2164
__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2165
FALSE USE_ITT_BUILD_ARG(NULL));
2166
}
2167
}
2168
if (__kmp_tasking_mode != tskm_immediate_exec) {
2169
__kmp_task_team_sync(this_thr, team);
2170
} // if
2171
}
2172
}
2173
}
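// Note (a reading of the code, not normative): this routine performs only the
// release half of a barrier. It pairs with a split barrier whose gather half
// already completed via __kmp_barrier() with is_split = TRUE; once the caller
// has finished the work the split exposed (e.g. the final reduction), the
// primary thread releases the workers here, while workers and serialized
// teams fall through without doing anything.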
2174
2175
void __kmp_join_barrier(int gtid) {
2176
KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2177
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2178
2179
KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2180
2181
kmp_info_t *this_thr = __kmp_threads[gtid];
2182
kmp_team_t *team;
2183
int tid;
2184
#ifdef KMP_DEBUG
2185
int team_id;
2186
#endif /* KMP_DEBUG */
2187
#if USE_ITT_BUILD
2188
void *itt_sync_obj = NULL;
2189
#if USE_ITT_NOTIFY
2190
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
2191
// Get object created at fork_barrier
2192
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2193
#endif
2194
#endif /* USE_ITT_BUILD */
2195
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2196
int nproc = this_thr->th.th_team_nproc;
2197
#endif
2198
KMP_MB();
2199
2200
// Get current info
2201
team = this_thr->th.th_team;
2202
KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2203
tid = __kmp_tid_from_gtid(gtid);
2204
#ifdef KMP_DEBUG
2205
team_id = team->t.t_id;
2206
kmp_info_t *master_thread = this_thr->th.th_team_master;
2207
if (master_thread != team->t.t_threads[0]) {
2208
__kmp_print_structure();
2209
}
2210
#endif /* KMP_DEBUG */
2211
KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2212
KMP_MB();
2213
2214
// Verify state
2215
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2216
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2217
KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2218
KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2219
gtid, team_id, tid));
2220
2221
#if OMPT_SUPPORT
2222
if (ompt_enabled.enabled) {
2223
#if OMPT_OPTIONAL
2224
ompt_data_t *my_task_data;
2225
ompt_data_t *my_parallel_data;
2226
void *codeptr = NULL;
2227
int ds_tid = this_thr->th.th_info.ds.ds_tid;
2228
if (KMP_MASTER_TID(ds_tid) &&
2229
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2230
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2231
codeptr = team->t.ompt_team_info.master_return_address;
2232
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2233
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2234
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2235
ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2236
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2237
sync_kind = ompt_sync_region_barrier_teams;
2238
ompt_state = ompt_state_wait_barrier_teams;
2239
}
2240
if (ompt_enabled.ompt_callback_sync_region) {
2241
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2242
sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2243
}
2244
if (ompt_enabled.ompt_callback_sync_region_wait) {
2245
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246
sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2247
}
2248
if (!KMP_MASTER_TID(ds_tid))
2249
this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2250
#endif
2251
this_thr->th.ompt_thread_info.state = ompt_state;
2252
}
2253
#endif
2254
2255
if (__kmp_tasking_mode == tskm_extra_barrier) {
2256
__kmp_tasking_barrier(team, this_thr, gtid);
2257
KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2258
gtid, team_id, tid));
2259
}
2260
#ifdef KMP_DEBUG
2261
if (__kmp_tasking_mode != tskm_immediate_exec) {
2262
KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2263
"%p, th_task_team = %p\n",
2264
__kmp_gtid_from_thread(this_thr), team_id,
2265
team->t.t_task_team[this_thr->th.th_task_state],
2266
this_thr->th.th_task_team));
2267
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2268
}
2269
#endif /* KMP_DEBUG */
2270
2271
/* Copy the blocktime info to the thread, where __kmp_wait_template() can
2272
access it when the team struct is not guaranteed to exist. Doing these
2273
loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
2274
we do not perform the copy if blocktime=infinite, since the values are not
2275
used by __kmp_wait_template() in that case. */
2276
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2277
#if KMP_USE_MONITOR
2278
this_thr->th.th_team_bt_intervals =
2279
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2280
this_thr->th.th_team_bt_set =
2281
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2282
#else
2283
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2284
#endif
2285
}
2286
2287
#if USE_ITT_BUILD
2288
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2289
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
2290
#endif /* USE_ITT_BUILD */
2291
2292
switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2293
case bp_dist_bar: {
2294
__kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2295
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2296
break;
2297
}
2298
case bp_hyper_bar: {
2299
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2300
__kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2301
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2302
break;
2303
}
2304
case bp_hierarchical_bar: {
2305
__kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2306
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2307
break;
2308
}
2309
case bp_tree_bar: {
2310
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2311
__kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2312
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2313
break;
2314
}
2315
default: {
2316
__kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2317
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2318
}
2319
}
2320
2321
/* From this point on, the team data structure may be deallocated at any time
2322
by the primary thread - it is unsafe to reference it in any of the worker
2323
threads. Any per-team data items that need to be referenced before the
2324
end of the barrier should be moved to the kmp_task_team_t structs. */
2325
if (KMP_MASTER_TID(tid)) {
2326
if (__kmp_tasking_mode != tskm_immediate_exec) {
2327
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2328
}
2329
if (__kmp_display_affinity) {
2330
KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2331
}
2332
#if KMP_STATS_ENABLED
2333
// Have primary thread flag the workers to indicate they are now waiting for
2334
// next parallel region. Also wake them up so they switch their timers to
2335
// idle.
2336
for (int i = 0; i < team->t.t_nproc; ++i) {
2337
kmp_info_t *team_thread = team->t.t_threads[i];
2338
if (team_thread == this_thr)
2339
continue;
2340
team_thread->th.th_stats->setIdleFlag();
2341
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2342
team_thread->th.th_sleep_loc != NULL)
2343
__kmp_null_resume_wrapper(team_thread);
2344
}
2345
#endif
2346
#if USE_ITT_BUILD
2347
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2348
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
2349
#endif /* USE_ITT_BUILD */
2350
2351
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2352
// Join barrier - report frame end
2353
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2354
__kmp_forkjoin_frames_mode &&
2355
(this_thr->th.th_teams_microtask == NULL || // either not in teams
2356
this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2357
team->t.t_active_level == 1) {
2358
kmp_uint64 cur_time = __itt_get_timestamp();
2359
ident_t *loc = team->t.t_ident;
2360
kmp_info_t **other_threads = team->t.t_threads;
2361
switch (__kmp_forkjoin_frames_mode) {
2362
case 1:
2363
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2364
loc, nproc);
2365
break;
2366
case 2:
2367
__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2368
loc, nproc);
2369
break;
2370
case 3:
2371
if (__itt_metadata_add_ptr) {
2372
// Initialize with primary thread's wait time
2373
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2374
// Set arrive time to zero to be able to check it in
2375
// __kmp_invoke_task(); the same is done inside the loop below
2376
this_thr->th.th_bar_arrive_time = 0;
2377
for (int i = 1; i < nproc; ++i) {
2378
delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2379
other_threads[i]->th.th_bar_arrive_time = 0;
2380
}
2381
__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2382
cur_time, delta, 0);
2383
}
2384
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2385
loc, nproc);
2386
this_thr->th.th_frame_time = cur_time;
2387
break;
2388
}
2389
}
2390
#endif /* USE_ITT_BUILD */
2391
}
2392
#if USE_ITT_BUILD
2393
else {
2394
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2395
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
2396
}
2397
#endif /* USE_ITT_BUILD */
2398
2399
#if KMP_DEBUG
2400
if (KMP_MASTER_TID(tid)) {
2401
KA_TRACE(
2402
15,
2403
("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2404
gtid, team_id, tid, nproc));
2405
}
2406
#endif /* KMP_DEBUG */
2407
2408
// TODO now, mark worker threads as done so they may be disbanded
2409
KMP_MB(); // Flush all pending memory write invalidates.
2410
KA_TRACE(10,
2411
("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2412
2413
}
2414
2415
// TODO release worker threads' fork barriers as we are ready instead of all at
2416
// once
2417
void __kmp_fork_barrier(int gtid, int tid) {
2418
KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2419
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2420
kmp_info_t *this_thr = __kmp_threads[gtid];
2421
kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2422
#if USE_ITT_BUILD
2423
void *itt_sync_obj = NULL;
2424
#endif /* USE_ITT_BUILD */
2425
#ifdef KMP_DEBUG
2426
if (team)
2427
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2428
(team != NULL) ? team->t.t_id : -1, tid));
2429
#endif
2430
// th_team pointer only valid for primary thread here
2431
if (KMP_MASTER_TID(tid)) {
2432
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2433
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2434
// Create itt barrier object
2435
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2436
__kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2437
}
2438
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2439
2440
#ifdef KMP_DEBUG
2441
KMP_DEBUG_ASSERT(team);
2442
kmp_info_t **other_threads = team->t.t_threads;
2443
int i;
2444
2445
// Verify state
2446
KMP_MB();
2447
2448
for (i = 1; i < team->t.t_nproc; ++i) {
2449
KA_TRACE(500,
2450
("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2451
"== %u.\n",
2452
gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2453
team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2454
other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2455
KMP_DEBUG_ASSERT(
2456
(TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2457
~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2458
KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2459
}
2460
#endif
2461
2462
if (__kmp_tasking_mode != tskm_immediate_exec)
2463
__kmp_task_team_setup(this_thr, team);
2464
2465
/* The primary thread may have changed its blocktime between join barrier
2466
and fork barrier. Copy the blocktime info to the thread, where
2467
__kmp_wait_template() can access it when the team struct is not
2468
guaranteed to exist. */
2469
// See note about the corresponding code in __kmp_join_barrier() being
2470
// performance-critical
2471
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2472
#if KMP_USE_MONITOR
2473
this_thr->th.th_team_bt_intervals =
2474
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2475
this_thr->th.th_team_bt_set =
2476
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2477
#else
2478
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2479
#endif
2480
}
2481
} // primary thread
2482
2483
switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2484
case bp_dist_bar: {
2485
__kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2486
TRUE USE_ITT_BUILD_ARG(NULL));
2487
break;
2488
}
2489
case bp_hyper_bar: {
2490
KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2491
__kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2492
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2493
break;
2494
}
2495
case bp_hierarchical_bar: {
2496
__kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2497
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2498
break;
2499
}
2500
case bp_tree_bar: {
2501
KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2502
__kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2503
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2504
break;
2505
}
2506
default: {
2507
__kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2508
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2509
}
2510
}
2511
2512
#if OMPT_SUPPORT
2513
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2514
if (ompt_enabled.enabled &&
2515
(ompt_state == ompt_state_wait_barrier_teams ||
2516
ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2517
int ds_tid = this_thr->th.th_info.ds.ds_tid;
2518
ompt_data_t *task_data = (team)
2519
? OMPT_CUR_TASK_DATA(this_thr)
2520
: &(this_thr->th.ompt_thread_info.task_data);
2521
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2522
#if OMPT_OPTIONAL
2523
void *codeptr = NULL;
2524
if (KMP_MASTER_TID(ds_tid) &&
2525
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2526
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2527
codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2528
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2529
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2530
sync_kind = ompt_sync_region_barrier_teams;
2531
if (ompt_enabled.ompt_callback_sync_region_wait) {
2532
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2533
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2534
}
2535
if (ompt_enabled.ompt_callback_sync_region) {
2536
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2537
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2538
}
2539
#endif
2540
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2541
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2542
ompt_scope_end, NULL, task_data, 0, ds_tid,
2543
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2544
}
2545
}
2546
#endif
2547
2548
// Early exit for reaping threads releasing forkjoin barrier
2549
if (TCR_4(__kmp_global.g.g_done)) {
2550
this_thr->th.th_task_team = NULL;
2551
2552
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2553
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2554
if (!KMP_MASTER_TID(tid)) {
2555
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2556
if (itt_sync_obj)
2557
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2558
}
2559
}
2560
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2561
KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2562
return;
2563
}
2564
2565
/* We can now assume that a valid team structure has been allocated by the
2566
primary thread and propagated to all worker threads. The current thread,
2567
however, may not be part of the team, so we can't blindly assume that the
2568
team pointer is non-null. */
2569
team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2570
KMP_DEBUG_ASSERT(team != NULL);
2571
tid = __kmp_tid_from_gtid(gtid);
2572
2573
#if KMP_BARRIER_ICV_PULL
2574
/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2575
__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2576
implicit task has this data before this function is called. We cannot
2577
modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2578
thread struct, because it is not always the case that the threads arrays
2579
have been allocated when __kmp_fork_call() is executed. */
2580
{
2581
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2582
if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2583
// Copy the initial ICVs from the primary thread's thread struct to the
2584
// implicit task for this tid.
2585
KA_TRACE(10,
2586
("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2587
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2588
tid, FALSE);
2589
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2590
&team->t.t_threads[0]
2591
->th.th_bar[bs_forkjoin_barrier]
2592
.bb.th_fixed_icvs);
2593
}
2594
}
2595
#endif // KMP_BARRIER_ICV_PULL
2596
2597
if (__kmp_tasking_mode != tskm_immediate_exec) {
2598
__kmp_task_team_sync(this_thr, team);
2599
}
2600
2601
#if KMP_AFFINITY_SUPPORTED
2602
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2603
if (proc_bind == proc_bind_intel) {
2604
// Call dynamic affinity settings
2605
if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2606
__kmp_balanced_affinity(this_thr, team->t.t_nproc);
2607
}
2608
} else if (proc_bind != proc_bind_false) {
2609
if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2610
KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2611
__kmp_gtid_from_thread(this_thr),
2612
this_thr->th.th_current_place));
2613
} else {
2614
__kmp_affinity_bind_place(gtid);
2615
}
2616
}
2617
#endif // KMP_AFFINITY_SUPPORTED
2618
// Perform the display affinity functionality
2619
if (__kmp_display_affinity) {
2620
if (team->t.t_display_affinity
2621
#if KMP_AFFINITY_SUPPORTED
2622
|| (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2623
#endif
2624
) {
2625
// NULL means use the affinity-format-var ICV
2626
__kmp_aux_display_affinity(gtid, NULL);
2627
this_thr->th.th_prev_num_threads = team->t.t_nproc;
2628
this_thr->th.th_prev_level = team->t.t_level;
2629
}
2630
}
2631
if (!KMP_MASTER_TID(tid))
2632
KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2633
2634
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2635
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2636
if (!KMP_MASTER_TID(tid)) {
2637
// Get correct barrier object
2638
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2639
__kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2640
} // (prepare called inside barrier_release)
2641
}
2642
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2643
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2644
team->t.t_id, tid));
2645
}
2646
2647
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2648
kmp_internal_control_t *new_icvs, ident_t *loc) {
2649
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2650
2651
KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2652
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2653
2654
/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2655
__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2656
implicit task has this data before this function is called. */
2657
#if KMP_BARRIER_ICV_PULL
2658
/* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2659
remains untouched), where all of the worker threads can access them and
2660
make their own copies after the barrier. */
2661
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2662
// allocated at this point
2663
copy_icvs(
2664
&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2665
new_icvs);
2666
KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2667
team->t.t_threads[0], team));
2668
#elif KMP_BARRIER_ICV_PUSH
2669
// The ICVs will be propagated in the fork barrier, so nothing needs to be
2670
// done here.
2671
KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2672
team->t.t_threads[0], team));
2673
#else
2674
// Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2675
// time.
2676
ngo_load(new_icvs);
2677
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2678
// allocated at this point
2679
for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2680
// TODO: GEH - pass in better source location info since usually NULL here
2681
KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2682
f, team->t.t_threads[f], team));
2683
__kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2684
ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2685
KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2686
f, team->t.t_threads[f], team));
2687
}
2688
ngo_sync();
2689
#endif // KMP_BARRIER_ICV_PULL
2690
}
2691
2692