Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_barrier.cpp
35258 views
1
/*
2
* kmp_barrier.cpp
3
*/
4
5
//===----------------------------------------------------------------------===//
6
//
7
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8
// See https://llvm.org/LICENSE.txt for license information.
9
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "kmp_wait_release.h"
14
#include "kmp_barrier.h"
15
#include "kmp_itt.h"
16
#include "kmp_os.h"
17
#include "kmp_stats.h"
18
#include "ompt-specific.h"
19
// for distributed barrier
20
#include "kmp_affinity.h"
21
22
#if KMP_MIC
23
#include <immintrin.h>
24
#define USE_NGO_STORES 1
25
#endif // KMP_MIC
26
27
#if KMP_MIC && USE_NGO_STORES
28
// ICV copying
29
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33
#else
34
#define ngo_load(src) ((void)0)
35
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37
#define ngo_sync() ((void)0)
38
#endif /* KMP_MIC && USE_NGO_STORES */
39
40
void __kmp_print_structure(void); // Forward declaration
41
42
// ---------------------------- Barrier Algorithms ----------------------------
43
// Distributed barrier
44
45
// Compute how many threads to have polling each cache-line.
46
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47
void distributedBarrier::computeVarsForN(size_t n) {
48
int nsockets = 1;
49
if (__kmp_topology) {
50
int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51
int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52
int ncores_per_socket =
53
__kmp_topology->calculate_ratio(core_level, socket_level);
54
nsockets = __kmp_topology->get_count(socket_level);
55
56
if (nsockets <= 0)
57
nsockets = 1;
58
if (ncores_per_socket <= 0)
59
ncores_per_socket = 1;
60
61
threads_per_go = ncores_per_socket >> 1;
62
if (!fix_threads_per_go) {
63
// Minimize num_gos
64
if (threads_per_go > 4) {
65
if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66
threads_per_go = threads_per_go >> 1;
67
}
68
if (threads_per_go > 4 && nsockets == 1)
69
threads_per_go = threads_per_go >> 1;
70
}
71
}
72
if (threads_per_go == 0)
73
threads_per_go = 1;
74
fix_threads_per_go = true;
75
num_gos = n / threads_per_go;
76
if (n % threads_per_go)
77
num_gos++;
78
if (nsockets == 1 || num_gos == 1)
79
num_groups = 1;
80
else {
81
num_groups = num_gos / nsockets;
82
if (num_gos % nsockets)
83
num_groups++;
84
}
85
if (num_groups <= 0)
86
num_groups = 1;
87
gos_per_group = num_gos / num_groups;
88
if (num_gos % num_groups)
89
gos_per_group++;
90
threads_per_group = threads_per_go * gos_per_group;
91
} else {
92
num_gos = n / threads_per_go;
93
if (n % threads_per_go)
94
num_gos++;
95
if (num_gos == 1)
96
num_groups = 1;
97
else {
98
num_groups = num_gos / 2;
99
if (num_gos % 2)
100
num_groups++;
101
}
102
gos_per_group = num_gos / num_groups;
103
if (num_gos % num_groups)
104
gos_per_group++;
105
threads_per_group = threads_per_go * gos_per_group;
106
}
107
}
108
109
void distributedBarrier::computeGo(size_t n) {
110
// Minimize num_gos
111
for (num_gos = 1;; num_gos++)
112
if (IDEAL_CONTENTION * num_gos >= n)
113
break;
114
threads_per_go = n / num_gos;
115
if (n % num_gos)
116
threads_per_go++;
117
while (num_gos > MAX_GOS) {
118
threads_per_go++;
119
num_gos = n / threads_per_go;
120
if (n % threads_per_go)
121
num_gos++;
122
}
123
computeVarsForN(n);
124
}
125
126
// This function is to resize the barrier arrays when the new number of threads
127
// exceeds max_threads, which is the current size of all the arrays
128
void distributedBarrier::resize(size_t nthr) {
129
KMP_DEBUG_ASSERT(nthr > max_threads);
130
131
// expand to requested size * 2
132
max_threads = nthr * 2;
133
134
// allocate arrays to new max threads
135
for (int i = 0; i < MAX_ITERS; ++i) {
136
if (flags[i])
137
flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138
max_threads * sizeof(flags_s));
139
else
140
flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141
}
142
143
if (go)
144
go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145
else
146
go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148
if (iter)
149
iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150
else
151
iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153
if (sleep)
154
sleep =
155
(sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156
else
157
sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158
}
159
160
// This function is to set all the go flags that threads might be waiting
161
// on, and when blocktime is not infinite, it should be followed by a wake-up
162
// call to each thread
163
kmp_uint64 distributedBarrier::go_release() {
164
kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165
for (size_t j = 0; j < num_gos; j++) {
166
go[j].go.store(next_go);
167
}
168
return next_go;
169
}
170
171
void distributedBarrier::go_reset() {
172
for (size_t j = 0; j < max_threads; ++j) {
173
for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174
flags[i][j].stillNeed = 1;
175
}
176
go[j].go.store(0);
177
iter[j].iter = 0;
178
}
179
}
180
181
// This function inits/re-inits the distributed barrier for a particular number
182
// of threads. If a resize of arrays is needed, it calls the resize function.
183
void distributedBarrier::init(size_t nthr) {
184
size_t old_max = max_threads;
185
if (nthr > max_threads) { // need more space in arrays
186
resize(nthr);
187
}
188
189
for (size_t i = 0; i < max_threads; i++) {
190
for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191
flags[j][i].stillNeed = 1;
192
}
193
go[i].go.store(0);
194
iter[i].iter = 0;
195
if (i >= old_max)
196
sleep[i].sleep = false;
197
}
198
199
// Recalculate num_gos, etc. based on new nthr
200
computeVarsForN(nthr);
201
202
num_threads = nthr;
203
204
if (team_icvs == NULL)
205
team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206
}
207
208
// This function is used only when KMP_BLOCKTIME is not infinite.
209
// static
210
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211
size_t start, size_t stop, size_t inc,
212
size_t tid) {
213
KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215
return;
216
217
kmp_info_t **other_threads = team->t.t_threads;
218
for (size_t thr = start; thr < stop; thr += inc) {
219
KMP_DEBUG_ASSERT(other_threads[thr]);
220
int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221
// Wake up worker regardless of if it appears to be sleeping or not
222
__kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223
}
224
}
225
226
static void __kmp_dist_barrier_gather(
227
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230
kmp_team_t *team;
231
distributedBarrier *b;
232
kmp_info_t **other_threads;
233
kmp_uint64 my_current_iter, my_next_iter;
234
kmp_uint32 nproc;
235
bool group_leader;
236
237
team = this_thr->th.th_team;
238
nproc = this_thr->th.th_team_nproc;
239
other_threads = team->t.t_threads;
240
b = team->t.b;
241
my_current_iter = b->iter[tid].iter;
242
my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243
group_leader = ((tid % b->threads_per_group) == 0);
244
245
KA_TRACE(20,
246
("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247
gtid, team->t.t_id, tid, bt));
248
249
#if USE_ITT_BUILD && USE_ITT_NOTIFY
250
// Barrier imbalance - save arrive time to the thread
251
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253
__itt_get_timestamp();
254
}
255
#endif
256
257
if (group_leader) {
258
// Start from the thread after the group leader
259
size_t group_start = tid + 1;
260
size_t group_end = tid + b->threads_per_group;
261
size_t threads_pending = 0;
262
263
if (group_end > nproc)
264
group_end = nproc;
265
do { // wait for threads in my group
266
threads_pending = 0;
267
// Check all the flags every time to avoid branch misspredict
268
for (size_t thr = group_start; thr < group_end; thr++) {
269
// Each thread uses a different cache line
270
threads_pending += b->flags[my_current_iter][thr].stillNeed;
271
}
272
// Execute tasks here
273
if (__kmp_tasking_mode != tskm_immediate_exec) {
274
kmp_task_team_t *task_team = this_thr->th.th_task_team;
275
if (task_team != NULL) {
276
if (TCR_SYNC_4(task_team->tt.tt_active)) {
277
if (KMP_TASKING_ENABLED(task_team)) {
278
int tasks_completed = FALSE;
279
__kmp_atomic_execute_tasks_64(
280
this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281
&tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282
} else
283
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284
}
285
} else {
286
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287
} // if
288
}
289
if (TCR_4(__kmp_global.g.g_done)) {
290
if (__kmp_global.g.g_abort)
291
__kmp_abort_thread();
292
break;
293
} else if (__kmp_tasking_mode != tskm_immediate_exec &&
294
this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295
this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296
}
297
} while (threads_pending > 0);
298
299
if (reduce) { // Perform reduction if needed
300
OMPT_REDUCTION_DECL(this_thr, gtid);
301
OMPT_REDUCTION_BEGIN;
302
// Group leader reduces all threads in group
303
for (size_t thr = group_start; thr < group_end; thr++) {
304
(*reduce)(this_thr->th.th_local.reduce_data,
305
other_threads[thr]->th.th_local.reduce_data);
306
}
307
OMPT_REDUCTION_END;
308
}
309
310
// Set flag for next iteration
311
b->flags[my_next_iter][tid].stillNeed = 1;
312
// Each thread uses a different cache line; resets stillNeed to 0 to
313
// indicate it has reached the barrier
314
b->flags[my_current_iter][tid].stillNeed = 0;
315
316
do { // wait for all group leaders
317
threads_pending = 0;
318
for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319
threads_pending += b->flags[my_current_iter][thr].stillNeed;
320
}
321
// Execute tasks here
322
if (__kmp_tasking_mode != tskm_immediate_exec) {
323
kmp_task_team_t *task_team = this_thr->th.th_task_team;
324
if (task_team != NULL) {
325
if (TCR_SYNC_4(task_team->tt.tt_active)) {
326
if (KMP_TASKING_ENABLED(task_team)) {
327
int tasks_completed = FALSE;
328
__kmp_atomic_execute_tasks_64(
329
this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330
&tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331
} else
332
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333
}
334
} else {
335
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336
} // if
337
}
338
if (TCR_4(__kmp_global.g.g_done)) {
339
if (__kmp_global.g.g_abort)
340
__kmp_abort_thread();
341
break;
342
} else if (__kmp_tasking_mode != tskm_immediate_exec &&
343
this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344
this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345
}
346
} while (threads_pending > 0);
347
348
if (reduce) { // Perform reduction if needed
349
if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350
OMPT_REDUCTION_DECL(this_thr, gtid);
351
OMPT_REDUCTION_BEGIN;
352
for (size_t thr = b->threads_per_group; thr < nproc;
353
thr += b->threads_per_group) {
354
(*reduce)(this_thr->th.th_local.reduce_data,
355
other_threads[thr]->th.th_local.reduce_data);
356
}
357
OMPT_REDUCTION_END;
358
}
359
}
360
} else {
361
// Set flag for next iteration
362
b->flags[my_next_iter][tid].stillNeed = 1;
363
// Each thread uses a different cache line; resets stillNeed to 0 to
364
// indicate it has reached the barrier
365
b->flags[my_current_iter][tid].stillNeed = 0;
366
}
367
368
KMP_MFENCE();
369
370
KA_TRACE(20,
371
("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372
gtid, team->t.t_id, tid, bt));
373
}
374
375
static void __kmp_dist_barrier_release(
376
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379
kmp_team_t *team;
380
distributedBarrier *b;
381
kmp_bstate_t *thr_bar;
382
kmp_uint64 my_current_iter, next_go;
383
size_t my_go_index;
384
bool group_leader;
385
386
KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387
gtid, tid, bt));
388
389
thr_bar = &this_thr->th.th_bar[bt].bb;
390
391
if (!KMP_MASTER_TID(tid)) {
392
// workers and non-master group leaders need to check their presence in team
393
do {
394
if (this_thr->th.th_used_in_team.load() != 1 &&
395
this_thr->th.th_used_in_team.load() != 3) {
396
// Thread is not in use in a team. Wait on location in tid's thread
397
// struct. The 0 value tells anyone looking that this thread is spinning
398
// or sleeping until this location becomes 3 again; 3 is the transition
399
// state to get to 1 which is waiting on go and being in the team
400
kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401
if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402
0) ||
403
this_thr->th.th_used_in_team.load() == 0) {
404
my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405
}
406
#if USE_ITT_BUILD && USE_ITT_NOTIFY
407
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408
// In fork barrier where we could not get the object reliably
409
itt_sync_obj =
410
__kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411
// Cancel wait on previous parallel region...
412
__kmp_itt_task_starting(itt_sync_obj);
413
414
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415
return;
416
417
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418
if (itt_sync_obj != NULL)
419
// Call prepare as early as possible for "new" barrier
420
__kmp_itt_task_finished(itt_sync_obj);
421
} else
422
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424
return;
425
}
426
if (this_thr->th.th_used_in_team.load() != 1 &&
427
this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428
continue;
429
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430
return;
431
432
// At this point, the thread thinks it is in use in a team, or in
433
// transition to be used in a team, but it might have reached this barrier
434
// before it was marked unused by the team. Unused threads are awoken and
435
// shifted to wait on local thread struct elsewhere. It also might reach
436
// this point by being picked up for use by a different team. Either way,
437
// we need to update the tid.
438
tid = __kmp_tid_from_gtid(gtid);
439
team = this_thr->th.th_team;
440
KMP_DEBUG_ASSERT(tid >= 0);
441
KMP_DEBUG_ASSERT(team);
442
b = team->t.b;
443
my_current_iter = b->iter[tid].iter;
444
next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445
my_go_index = tid / b->threads_per_go;
446
if (this_thr->th.th_used_in_team.load() == 3) {
447
KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
448
}
449
// Check if go flag is set
450
if (b->go[my_go_index].go.load() != next_go) {
451
// Wait on go flag on team
452
kmp_atomic_flag_64<false, true> my_flag(
453
&(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454
my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455
KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
456
b->iter[tid].iter == 0);
457
KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458
}
459
460
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461
return;
462
// At this point, the thread's go location was set. This means the primary
463
// thread is safely in the barrier, and so this thread's data is
464
// up-to-date, but we should check again that this thread is really in
465
// use in the team, as it could have been woken up for the purpose of
466
// changing team size, or reaping threads at shutdown.
467
if (this_thr->th.th_used_in_team.load() == 1)
468
break;
469
} while (1);
470
471
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472
return;
473
474
group_leader = ((tid % b->threads_per_group) == 0);
475
if (group_leader) {
476
// Tell all the threads in my group they can go!
477
for (size_t go_idx = my_go_index + 1;
478
go_idx < my_go_index + b->gos_per_group; go_idx++) {
479
b->go[go_idx].go.store(next_go);
480
}
481
// Fence added so that workers can see changes to go. sfence inadequate.
482
KMP_MFENCE();
483
}
484
485
#if KMP_BARRIER_ICV_PUSH
486
if (propagate_icvs) { // copy ICVs to final dest
487
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
488
tid, FALSE);
489
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
490
(kmp_internal_control_t *)team->t.b->team_icvs);
491
copy_icvs(&thr_bar->th_fixed_icvs,
492
&team->t.t_implicit_task_taskdata[tid].td_icvs);
493
}
494
#endif
495
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496
// This thread is now awake and participating in the barrier;
497
// wake up the other threads in the group
498
size_t nproc = this_thr->th.th_team_nproc;
499
size_t group_end = tid + b->threads_per_group;
500
if (nproc < group_end)
501
group_end = nproc;
502
__kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503
}
504
} else { // Primary thread
505
team = this_thr->th.th_team;
506
b = team->t.b;
507
my_current_iter = b->iter[tid].iter;
508
next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509
#if KMP_BARRIER_ICV_PUSH
510
if (propagate_icvs) {
511
// primary thread has ICVs in final destination; copy
512
copy_icvs(&thr_bar->th_fixed_icvs,
513
&team->t.t_implicit_task_taskdata[tid].td_icvs);
514
}
515
#endif
516
// Tell all the group leaders they can go!
517
for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518
b->go[go_idx].go.store(next_go);
519
}
520
521
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522
// Wake-up the group leaders
523
size_t nproc = this_thr->th.th_team_nproc;
524
__kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
525
b->threads_per_group, tid);
526
}
527
528
// Tell all the threads in my group they can go!
529
for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
530
b->go[go_idx].go.store(next_go);
531
}
532
533
// Fence added so that workers can see changes to go. sfence inadequate.
534
KMP_MFENCE();
535
536
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537
// Wake-up the other threads in my group
538
size_t nproc = this_thr->th.th_team_nproc;
539
size_t group_end = tid + b->threads_per_group;
540
if (nproc < group_end)
541
group_end = nproc;
542
__kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543
}
544
}
545
// Update to next iteration
546
KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547
b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
548
549
KA_TRACE(
550
20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551
gtid, team->t.t_id, tid, bt));
552
}
553
554
// Linear Barrier
555
template <bool cancellable = false>
556
static bool __kmp_linear_barrier_gather_template(
557
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
558
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
559
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560
kmp_team_t *team = this_thr->th.th_team;
561
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562
kmp_info_t **other_threads = team->t.t_threads;
563
564
KA_TRACE(
565
20,
566
("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567
gtid, team->t.t_id, tid, bt));
568
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569
570
#if USE_ITT_BUILD && USE_ITT_NOTIFY
571
// Barrier imbalance - save arrive time to the thread
572
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
573
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574
__itt_get_timestamp();
575
}
576
#endif
577
// We now perform a linear reduction to signal that all of the threads have
578
// arrived.
579
if (!KMP_MASTER_TID(tid)) {
580
KA_TRACE(20,
581
("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582
"arrived(%p): %llu => %llu\n",
583
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
584
team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
585
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586
// Mark arrival to primary thread
587
/* After performing this write, a worker thread may not assume that the team
588
is valid any more - it could be deallocated by the primary thread at any
589
time. */
590
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
591
flag.release();
592
} else {
593
kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594
int nproc = this_thr->th.th_team_nproc;
595
int i;
596
// Don't have to worry about sleep bit here or atomic since team setting
597
kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598
599
// Collect all the worker team member threads.
600
for (i = 1; i < nproc; ++i) {
601
#if KMP_CACHE_MANAGE
602
// Prefetch next thread's arrived count
603
if (i + 1 < nproc)
604
KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
605
#endif /* KMP_CACHE_MANAGE */
606
KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607
"arrived(%p) == %llu\n",
608
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609
team->t.t_id, i,
610
&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611
612
// Wait for worker thread to arrive
613
if (cancellable) {
614
kmp_flag_64<true, false> flag(
615
&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616
if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617
return true;
618
} else {
619
kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620
new_state);
621
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622
}
623
#if USE_ITT_BUILD && USE_ITT_NOTIFY
624
// Barrier imbalance - write min of the thread time and the other thread
625
// time to the thread.
626
if (__kmp_forkjoin_frames_mode == 2) {
627
this_thr->th.th_bar_min_time = KMP_MIN(
628
this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629
}
630
#endif
631
if (reduce) {
632
KA_TRACE(100,
633
("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635
team->t.t_id, i));
636
OMPT_REDUCTION_DECL(this_thr, gtid);
637
OMPT_REDUCTION_BEGIN;
638
(*reduce)(this_thr->th.th_local.reduce_data,
639
other_threads[i]->th.th_local.reduce_data);
640
OMPT_REDUCTION_END;
641
}
642
}
643
// Don't have to worry about sleep bit here or atomic since team setting
644
team_bar->b_arrived = new_state;
645
KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646
"arrived(%p) = %llu\n",
647
gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648
new_state));
649
}
650
KA_TRACE(
651
20,
652
("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653
gtid, team->t.t_id, tid, bt));
654
return false;
655
}
656
657
template <bool cancellable = false>
658
static bool __kmp_linear_barrier_release_template(
659
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
660
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663
kmp_team_t *team;
664
665
if (KMP_MASTER_TID(tid)) {
666
unsigned int i;
667
kmp_uint32 nproc = this_thr->th.th_team_nproc;
668
kmp_info_t **other_threads;
669
670
team = __kmp_threads[gtid]->th.th_team;
671
KMP_DEBUG_ASSERT(team != NULL);
672
other_threads = team->t.t_threads;
673
674
KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675
"barrier type %d\n",
676
gtid, team->t.t_id, tid, bt));
677
678
if (nproc > 1) {
679
#if KMP_BARRIER_ICV_PUSH
680
{
681
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
682
if (propagate_icvs) {
683
ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
684
for (i = 1; i < nproc; ++i) {
685
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
686
team, i, FALSE);
687
ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688
&team->t.t_implicit_task_taskdata[0].td_icvs);
689
}
690
ngo_sync();
691
}
692
}
693
#endif // KMP_BARRIER_ICV_PUSH
694
695
// Now, release all of the worker threads
696
for (i = 1; i < nproc; ++i) {
697
#if KMP_CACHE_MANAGE
698
// Prefetch next thread's go flag
699
if (i + 1 < nproc)
700
KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
701
#endif /* KMP_CACHE_MANAGE */
702
KA_TRACE(
703
20,
704
("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705
"go(%p): %u => %u\n",
706
gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707
team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708
other_threads[i]->th.th_bar[bt].bb.b_go,
709
other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710
kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711
other_threads[i]);
712
flag.release();
713
}
714
}
715
} else { // Wait for the PRIMARY thread to release us
716
KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717
gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718
if (cancellable) {
719
kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
720
if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721
return true;
722
} else {
723
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725
}
726
#if USE_ITT_BUILD && USE_ITT_NOTIFY
727
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
728
// In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729
// disabled)
730
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
731
// Cancel wait on previous parallel region...
732
__kmp_itt_task_starting(itt_sync_obj);
733
734
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735
return false;
736
737
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738
if (itt_sync_obj != NULL)
739
// Call prepare as early as possible for "new" barrier
740
__kmp_itt_task_finished(itt_sync_obj);
741
} else
742
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743
// Early exit for reaping threads releasing forkjoin barrier
744
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745
return false;
746
// The worker thread may now assume that the team is valid.
747
#ifdef KMP_DEBUG
748
tid = __kmp_tid_from_gtid(gtid);
749
team = __kmp_threads[gtid]->th.th_team;
750
#endif
751
KMP_DEBUG_ASSERT(team != NULL);
752
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753
KA_TRACE(20,
754
("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756
KMP_MB(); // Flush all pending memory write invalidates.
757
}
758
KA_TRACE(
759
20,
760
("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761
gtid, team->t.t_id, tid, bt));
762
return false;
763
}
764
765
static void __kmp_linear_barrier_gather(
766
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
767
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
768
__kmp_linear_barrier_gather_template<false>(
769
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770
}
771
772
static bool __kmp_linear_barrier_gather_cancellable(
773
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
774
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
775
return __kmp_linear_barrier_gather_template<true>(
776
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777
}
778
779
static void __kmp_linear_barrier_release(
780
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
781
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782
__kmp_linear_barrier_release_template<false>(
783
bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784
}
785
786
static bool __kmp_linear_barrier_release_cancellable(
787
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
788
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789
return __kmp_linear_barrier_release_template<true>(
790
bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791
}
792
793
// Tree barrier
794
static void __kmp_tree_barrier_gather(
795
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
796
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
797
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798
kmp_team_t *team = this_thr->th.th_team;
799
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800
kmp_info_t **other_threads = team->t.t_threads;
801
kmp_uint32 nproc = this_thr->th.th_team_nproc;
802
kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803
kmp_uint32 branch_factor = 1 << branch_bits;
804
kmp_uint32 child;
805
kmp_uint32 child_tid;
806
kmp_uint64 new_state = 0;
807
808
KA_TRACE(
809
20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810
gtid, team->t.t_id, tid, bt));
811
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812
813
#if USE_ITT_BUILD && USE_ITT_NOTIFY
814
// Barrier imbalance - save arrive time to the thread
815
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
816
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817
__itt_get_timestamp();
818
}
819
#endif
820
// Perform tree gather to wait until all threads have arrived; reduce any
821
// required data as we go
822
child_tid = (tid << branch_bits) + 1;
823
if (child_tid < nproc) {
824
// Parent threads wait for all their children to arrive
825
new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826
child = 1;
827
do {
828
kmp_info_t *child_thr = other_threads[child_tid];
829
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830
#if KMP_CACHE_MANAGE
831
// Prefetch next thread's arrived count
832
if (child + 1 <= branch_factor && child_tid + 1 < nproc)
833
KMP_CACHE_PREFETCH(
834
&other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
835
#endif /* KMP_CACHE_MANAGE */
836
KA_TRACE(20,
837
("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838
"arrived(%p) == %llu\n",
839
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840
team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841
// Wait for child to arrive
842
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844
#if USE_ITT_BUILD && USE_ITT_NOTIFY
845
// Barrier imbalance - write min of the thread time and a child time to
846
// the thread.
847
if (__kmp_forkjoin_frames_mode == 2) {
848
this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849
child_thr->th.th_bar_min_time);
850
}
851
#endif
852
if (reduce) {
853
KA_TRACE(100,
854
("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856
team->t.t_id, child_tid));
857
OMPT_REDUCTION_DECL(this_thr, gtid);
858
OMPT_REDUCTION_BEGIN;
859
(*reduce)(this_thr->th.th_local.reduce_data,
860
child_thr->th.th_local.reduce_data);
861
OMPT_REDUCTION_END;
862
}
863
child++;
864
child_tid++;
865
} while (child <= branch_factor && child_tid < nproc);
866
}
867
868
if (!KMP_MASTER_TID(tid)) { // Worker threads
869
kmp_int32 parent_tid = (tid - 1) >> branch_bits;
870
871
KA_TRACE(20,
872
("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873
"arrived(%p): %llu => %llu\n",
874
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875
team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877
878
// Mark arrival to parent thread
879
/* After performing this write, a worker thread may not assume that the team
880
is valid any more - it could be deallocated by the primary thread at any
881
time. */
882
kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883
flag.release();
884
} else {
885
// Need to update the team arrived pointer if we are the primary thread
886
if (nproc > 1) // New value was already computed above
887
team->t.t_bar[bt].b_arrived = new_state;
888
else
889
team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890
KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891
"arrived(%p) = %llu\n",
892
gtid, team->t.t_id, tid, team->t.t_id,
893
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894
}
895
KA_TRACE(20,
896
("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897
gtid, team->t.t_id, tid, bt));
898
}
899
900
static void __kmp_tree_barrier_release(
901
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
902
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904
kmp_team_t *team;
905
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906
kmp_uint32 nproc;
907
kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908
kmp_uint32 branch_factor = 1 << branch_bits;
909
kmp_uint32 child;
910
kmp_uint32 child_tid;
911
912
// Perform a tree release for all of the threads that have been gathered
913
if (!KMP_MASTER_TID(
914
tid)) { // Handle fork barrier workers who aren't part of a team yet
915
KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916
&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917
// Wait for parent thread to release us
918
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920
#if USE_ITT_BUILD && USE_ITT_NOTIFY
921
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
922
// In fork barrier where we could not get the object reliably (or
923
// ITTNOTIFY is disabled)
924
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
925
// Cancel wait on previous parallel region...
926
__kmp_itt_task_starting(itt_sync_obj);
927
928
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929
return;
930
931
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932
if (itt_sync_obj != NULL)
933
// Call prepare as early as possible for "new" barrier
934
__kmp_itt_task_finished(itt_sync_obj);
935
} else
936
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937
// Early exit for reaping threads releasing forkjoin barrier
938
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939
return;
940
941
// The worker thread may now assume that the team is valid.
942
team = __kmp_threads[gtid]->th.th_team;
943
KMP_DEBUG_ASSERT(team != NULL);
944
tid = __kmp_tid_from_gtid(gtid);
945
946
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947
KA_TRACE(20,
948
("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949
team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950
KMP_MB(); // Flush all pending memory write invalidates.
951
} else {
952
team = __kmp_threads[gtid]->th.th_team;
953
KMP_DEBUG_ASSERT(team != NULL);
954
KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955
"barrier type %d\n",
956
gtid, team->t.t_id, tid, bt));
957
}
958
nproc = this_thr->th.th_team_nproc;
959
child_tid = (tid << branch_bits) + 1;
960
961
if (child_tid < nproc) {
962
kmp_info_t **other_threads = team->t.t_threads;
963
child = 1;
964
// Parent threads release all their children
965
do {
966
kmp_info_t *child_thr = other_threads[child_tid];
967
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968
#if KMP_CACHE_MANAGE
969
// Prefetch next thread's go count
970
if (child + 1 <= branch_factor && child_tid + 1 < nproc)
971
KMP_CACHE_PREFETCH(
972
&other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
973
#endif /* KMP_CACHE_MANAGE */
974
975
#if KMP_BARRIER_ICV_PUSH
976
{
977
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
978
if (propagate_icvs) {
979
__kmp_init_implicit_task(team->t.t_ident,
980
team->t.t_threads[child_tid], team,
981
child_tid, FALSE);
982
copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983
&team->t.t_implicit_task_taskdata[0].td_icvs);
984
}
985
}
986
#endif // KMP_BARRIER_ICV_PUSH
987
KA_TRACE(20,
988
("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989
"go(%p): %u => %u\n",
990
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993
// Release child from barrier
994
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995
flag.release();
996
child++;
997
child_tid++;
998
} while (child <= branch_factor && child_tid < nproc);
999
}
1000
KA_TRACE(
1001
20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002
gtid, team->t.t_id, tid, bt));
1003
}
1004
1005
// Hyper Barrier
1006
static void __kmp_hyper_barrier_gather(
1007
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1009
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010
kmp_team_t *team = this_thr->th.th_team;
1011
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012
kmp_info_t **other_threads = team->t.t_threads;
1013
kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014
kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015
kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016
kmp_uint32 branch_factor = 1 << branch_bits;
1017
kmp_uint32 offset;
1018
kmp_uint32 level;
1019
1020
KA_TRACE(
1021
20,
1022
("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023
gtid, team->t.t_id, tid, bt));
1024
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025
1026
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1027
// Barrier imbalance - save arrive time to the thread
1028
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1029
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030
__itt_get_timestamp();
1031
}
1032
#endif
1033
/* Perform a hypercube-embedded tree gather to wait until all of the threads
1034
have arrived, and reduce any required data as we go. */
1035
kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036
for (level = 0, offset = 1; offset < num_threads;
1037
level += branch_bits, offset <<= branch_bits) {
1038
kmp_uint32 child;
1039
kmp_uint32 child_tid;
1040
1041
if (((tid >> level) & (branch_factor - 1)) != 0) {
1042
kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1043
1044
KMP_MB(); // Synchronize parent and child threads.
1045
KA_TRACE(20,
1046
("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047
"arrived(%p): %llu => %llu\n",
1048
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049
team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050
thr_bar->b_arrived,
1051
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052
// Mark arrival to parent thread
1053
/* After performing this write (in the last iteration of the enclosing for
1054
loop), a worker thread may not assume that the team is valid any more
1055
- it could be deallocated by the primary thread at any time. */
1056
p_flag.set_waiter(other_threads[parent_tid]);
1057
p_flag.release();
1058
break;
1059
}
1060
1061
// Parent threads wait for children to arrive
1062
if (new_state == KMP_BARRIER_UNUSED_STATE)
1063
new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064
for (child = 1, child_tid = tid + (1 << level);
1065
child < branch_factor && child_tid < num_threads;
1066
child++, child_tid += (1 << level)) {
1067
kmp_info_t *child_thr = other_threads[child_tid];
1068
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069
#if KMP_CACHE_MANAGE
1070
kmp_uint32 next_child_tid = child_tid + (1 << level);
1071
// Prefetch next thread's arrived count
1072
if (child + 1 < branch_factor && next_child_tid < num_threads)
1073
KMP_CACHE_PREFETCH(
1074
&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075
#endif /* KMP_CACHE_MANAGE */
1076
KA_TRACE(20,
1077
("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078
"arrived(%p) == %llu\n",
1079
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080
team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081
// Wait for child to arrive
1082
kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083
c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084
KMP_MB(); // Synchronize parent and child threads.
1085
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1086
// Barrier imbalance - write min of the thread time and a child time to
1087
// the thread.
1088
if (__kmp_forkjoin_frames_mode == 2) {
1089
this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090
child_thr->th.th_bar_min_time);
1091
}
1092
#endif
1093
if (reduce) {
1094
KA_TRACE(100,
1095
("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097
team->t.t_id, child_tid));
1098
OMPT_REDUCTION_DECL(this_thr, gtid);
1099
OMPT_REDUCTION_BEGIN;
1100
(*reduce)(this_thr->th.th_local.reduce_data,
1101
child_thr->th.th_local.reduce_data);
1102
OMPT_REDUCTION_END;
1103
}
1104
}
1105
}
1106
1107
if (KMP_MASTER_TID(tid)) {
1108
// Need to update the team arrived pointer if we are the primary thread
1109
if (new_state == KMP_BARRIER_UNUSED_STATE)
1110
team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111
else
1112
team->t.t_bar[bt].b_arrived = new_state;
1113
KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114
"arrived(%p) = %llu\n",
1115
gtid, team->t.t_id, tid, team->t.t_id,
1116
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117
}
1118
KA_TRACE(
1119
20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120
gtid, team->t.t_id, tid, bt));
1121
}
1122
1123
// The reverse versions seem to beat the forward versions overall
1124
#define KMP_REVERSE_HYPER_BAR
1125
static void __kmp_hyper_barrier_release(
1126
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1127
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129
kmp_team_t *team;
1130
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131
kmp_info_t **other_threads;
1132
kmp_uint32 num_threads;
1133
kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134
kmp_uint32 branch_factor = 1 << branch_bits;
1135
kmp_uint32 child;
1136
kmp_uint32 child_tid;
1137
kmp_uint32 offset;
1138
kmp_uint32 level;
1139
1140
/* Perform a hypercube-embedded tree release for all of the threads that have
1141
been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142
are released in the reverse order of the corresponding gather, otherwise
1143
threads are released in the same order. */
1144
if (KMP_MASTER_TID(tid)) { // primary thread
1145
team = __kmp_threads[gtid]->th.th_team;
1146
KMP_DEBUG_ASSERT(team != NULL);
1147
KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148
"barrier type %d\n",
1149
gtid, team->t.t_id, tid, bt));
1150
#if KMP_BARRIER_ICV_PUSH
1151
if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152
copy_icvs(&thr_bar->th_fixed_icvs,
1153
&team->t.t_implicit_task_taskdata[tid].td_icvs);
1154
}
1155
#endif
1156
} else { // Handle fork barrier workers who aren't part of a team yet
1157
KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158
&thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159
// Wait for parent thread to release us
1160
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1163
if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1164
// In fork barrier where we could not get the object reliably
1165
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1166
// Cancel wait on previous parallel region...
1167
__kmp_itt_task_starting(itt_sync_obj);
1168
1169
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170
return;
1171
1172
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173
if (itt_sync_obj != NULL)
1174
// Call prepare as early as possible for "new" barrier
1175
__kmp_itt_task_finished(itt_sync_obj);
1176
} else
1177
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178
// Early exit for reaping threads releasing forkjoin barrier
1179
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180
return;
1181
1182
// The worker thread may now assume that the team is valid.
1183
team = __kmp_threads[gtid]->th.th_team;
1184
KMP_DEBUG_ASSERT(team != NULL);
1185
tid = __kmp_tid_from_gtid(gtid);
1186
1187
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188
KA_TRACE(20,
1189
("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191
KMP_MB(); // Flush all pending memory write invalidates.
1192
}
1193
num_threads = this_thr->th.th_team_nproc;
1194
other_threads = team->t.t_threads;
1195
1196
#ifdef KMP_REVERSE_HYPER_BAR
1197
// Count up to correct level for parent
1198
for (level = 0, offset = 1;
1199
offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1200
level += branch_bits, offset <<= branch_bits)
1201
;
1202
1203
// Now go down from there
1204
for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1205
level -= branch_bits, offset >>= branch_bits)
1206
#else
1207
// Go down the tree, level by level
1208
for (level = 0, offset = 1; offset < num_threads;
1209
level += branch_bits, offset <<= branch_bits)
1210
#endif // KMP_REVERSE_HYPER_BAR
1211
{
1212
#ifdef KMP_REVERSE_HYPER_BAR
1213
/* Now go in reverse order through the children, highest to lowest.
1214
Initial setting of child is conservative here. */
1215
child = num_threads >> ((level == 0) ? level : level - 1);
1216
for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1217
child_tid = tid + (child << level);
1218
child >= 1; child--, child_tid -= (1 << level))
1219
#else
1220
if (((tid >> level) & (branch_factor - 1)) != 0)
1221
// No need to go lower than this, since this is the level parent would be
1222
// notified
1223
break;
1224
// Iterate through children on this level of the tree
1225
for (child = 1, child_tid = tid + (1 << level);
1226
child < branch_factor && child_tid < num_threads;
1227
child++, child_tid += (1 << level))
1228
#endif // KMP_REVERSE_HYPER_BAR
1229
{
1230
if (child_tid >= num_threads)
1231
continue; // Child doesn't exist so keep going
1232
else {
1233
kmp_info_t *child_thr = other_threads[child_tid];
1234
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235
#if KMP_CACHE_MANAGE
1236
kmp_uint32 next_child_tid = child_tid - (1 << level);
1237
// Prefetch next thread's go count
1238
#ifdef KMP_REVERSE_HYPER_BAR
1239
if (child - 1 >= 1 && next_child_tid < num_threads)
1240
#else
1241
if (child + 1 < branch_factor && next_child_tid < num_threads)
1242
#endif // KMP_REVERSE_HYPER_BAR
1243
KMP_CACHE_PREFETCH(
1244
&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245
#endif /* KMP_CACHE_MANAGE */
1246
1247
#if KMP_BARRIER_ICV_PUSH
1248
if (propagate_icvs) // push my fixed ICVs to my child
1249
copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250
#endif // KMP_BARRIER_ICV_PUSH
1251
1252
KA_TRACE(
1253
20,
1254
("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255
"go(%p): %u => %u\n",
1256
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259
// Release child from barrier
1260
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261
flag.release();
1262
}
1263
}
1264
}
1265
#if KMP_BARRIER_ICV_PUSH
1266
if (propagate_icvs &&
1267
!KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1269
FALSE);
1270
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1271
&thr_bar->th_fixed_icvs);
1272
}
1273
#endif
1274
KA_TRACE(
1275
20,
1276
("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277
gtid, team->t.t_id, tid, bt));
1278
}
1279
1280
// Hierarchical Barrier
1281
1282
// Initialize thread barrier data
1283
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284
Performs the minimum amount of initialization required based on how the team
1285
has changed. Returns true if leaf children will require both on-core and
1286
traditional wake-up mechanisms. For example, if the team size increases,
1287
threads already in the team will respond to on-core wakeup on their parent
1288
thread, but threads newly added to the team will only be listening on the
1289
their local b_go. */
1290
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291
kmp_bstate_t *thr_bar,
1292
kmp_uint32 nproc, int gtid,
1293
int tid, kmp_team_t *team) {
1294
// Checks to determine if (re-)initialization is needed
1295
bool uninitialized = thr_bar->team == NULL;
1296
bool team_changed = team != thr_bar->team;
1297
bool team_sz_changed = nproc != thr_bar->nproc;
1298
bool tid_changed = tid != thr_bar->old_tid;
1299
bool retval = false;
1300
1301
if (uninitialized || team_sz_changed) {
1302
__kmp_get_hierarchy(nproc, thr_bar);
1303
}
1304
1305
if (uninitialized || team_sz_changed || tid_changed) {
1306
thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1307
thr_bar->parent_tid = -1; // default for primary thread
1308
if (!KMP_MASTER_TID(tid)) {
1309
// if not primary thread, find parent thread in hierarchy
1310
kmp_uint32 d = 0;
1311
while (d < thr_bar->depth) { // find parent based on level of thread in
1312
// hierarchy, and note level
1313
kmp_uint32 rem;
1314
if (d == thr_bar->depth - 2) { // reached level right below the primary
1315
thr_bar->parent_tid = 0;
1316
thr_bar->my_level = d;
1317
break;
1318
} else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1319
// TODO: can we make the above op faster?
1320
// thread is not a subtree root at next level, so this is max
1321
thr_bar->parent_tid = tid - rem;
1322
thr_bar->my_level = d;
1323
break;
1324
}
1325
++d;
1326
}
1327
}
1328
__kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1329
(thr_bar->skip_per_level[thr_bar->my_level])),
1330
&(thr_bar->offset));
1331
thr_bar->old_tid = tid;
1332
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333
thr_bar->team = team;
1334
thr_bar->parent_bar =
1335
&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336
}
1337
if (uninitialized || team_changed || tid_changed) {
1338
thr_bar->team = team;
1339
thr_bar->parent_bar =
1340
&team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341
retval = true;
1342
}
1343
if (uninitialized || team_sz_changed || tid_changed) {
1344
thr_bar->nproc = nproc;
1345
thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346
if (thr_bar->my_level == 0)
1347
thr_bar->leaf_kids = 0;
1348
if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1349
__kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350
thr_bar->leaf_state = 0;
1351
for (int i = 0; i < thr_bar->leaf_kids; ++i)
1352
((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1353
}
1354
return retval;
1355
}
1356
1357
static void __kmp_hierarchical_barrier_gather(
1358
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359
void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361
kmp_team_t *team = this_thr->th.th_team;
1362
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363
kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364
kmp_info_t **other_threads = team->t.t_threads;
1365
kmp_uint64 new_state = 0;
1366
1367
int level = team->t.t_level;
1368
if (other_threads[0]
1369
->th.th_teams_microtask) // are we inside the teams construct?
1370
if (this_thr->th.th_teams_size.nteams > 1)
1371
++level; // level was not increased in teams construct for team_of_masters
1372
if (level == 1)
1373
thr_bar->use_oncore_barrier = 1;
1374
else
1375
thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376
1377
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378
"barrier type %d\n",
1379
gtid, team->t.t_id, tid, bt));
1380
KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381
1382
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1383
// Barrier imbalance - save arrive time to the thread
1384
if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385
this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386
}
1387
#endif
1388
1389
(void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390
team);
1391
1392
if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393
kmp_int32 child_tid;
1394
new_state =
1395
(kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397
thr_bar->use_oncore_barrier) {
1398
if (thr_bar->leaf_kids) {
1399
// First, wait for leaf children to check-in on my b_arrived flag
1400
kmp_uint64 leaf_state =
1401
KMP_MASTER_TID(tid)
1402
? thr_bar->b_arrived | thr_bar->leaf_state
1403
: team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1404
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405
"for leaf kids\n",
1406
gtid, team->t.t_id, tid));
1407
kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409
if (reduce) {
1410
OMPT_REDUCTION_DECL(this_thr, gtid);
1411
OMPT_REDUCTION_BEGIN;
1412
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413
++child_tid) {
1414
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415
"T#%d(%d:%d)\n",
1416
gtid, team->t.t_id, tid,
1417
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418
child_tid));
1419
(*reduce)(this_thr->th.th_local.reduce_data,
1420
other_threads[child_tid]->th.th_local.reduce_data);
1421
}
1422
OMPT_REDUCTION_END;
1423
}
1424
// clear leaf_state bits
1425
KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426
}
1427
// Next, wait for higher level children on each child's b_arrived flag
1428
for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429
++d) { // gather lowest level threads first, but skip 0
1430
kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431
skip = thr_bar->skip_per_level[d];
1432
if (last > nproc)
1433
last = nproc;
1434
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435
kmp_info_t *child_thr = other_threads[child_tid];
1436
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438
"T#%d(%d:%d) "
1439
"arrived(%p) == %llu\n",
1440
gtid, team->t.t_id, tid,
1441
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442
child_tid, &child_bar->b_arrived, new_state));
1443
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445
if (reduce) {
1446
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447
"T#%d(%d:%d)\n",
1448
gtid, team->t.t_id, tid,
1449
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450
child_tid));
1451
(*reduce)(this_thr->th.th_local.reduce_data,
1452
child_thr->th.th_local.reduce_data);
1453
}
1454
}
1455
}
1456
} else { // Blocktime is not infinite
1457
for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458
++d) { // Gather lowest level threads first
1459
kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460
skip = thr_bar->skip_per_level[d];
1461
if (last > nproc)
1462
last = nproc;
1463
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464
kmp_info_t *child_thr = other_threads[child_tid];
1465
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467
"T#%d(%d:%d) "
1468
"arrived(%p) == %llu\n",
1469
gtid, team->t.t_id, tid,
1470
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471
child_tid, &child_bar->b_arrived, new_state));
1472
kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474
if (reduce) {
1475
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476
"T#%d(%d:%d)\n",
1477
gtid, team->t.t_id, tid,
1478
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479
child_tid));
1480
(*reduce)(this_thr->th.th_local.reduce_data,
1481
child_thr->th.th_local.reduce_data);
1482
}
1483
}
1484
}
1485
}
1486
}
1487
// All subordinates are gathered; now release parent if not primary thread
1488
1489
if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491
" T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492
gtid, team->t.t_id, tid,
1493
__kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494
thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495
thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496
/* Mark arrival to parent: After performing this write, a worker thread may
1497
not assume that the team is valid any more - it could be deallocated by
1498
the primary thread at any time. */
1499
if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500
!thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501
// flag; release it
1502
kmp_flag_64<> flag(&thr_bar->b_arrived,
1503
other_threads[thr_bar->parent_tid]);
1504
flag.release();
1505
} else {
1506
// Leaf does special release on "offset" bits of parent's b_arrived flag
1507
thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508
kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509
thr_bar->offset + 1);
1510
flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511
flag.release();
1512
}
1513
} else { // Primary thread needs to update the team's b_arrived value
1514
team->t.t_bar[bt].b_arrived = new_state;
1515
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516
"arrived(%p) = %llu\n",
1517
gtid, team->t.t_id, tid, team->t.t_id,
1518
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519
}
1520
// Is the team access below unsafe or just technically invalid?
1521
KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522
"barrier type %d\n",
1523
gtid, team->t.t_id, tid, bt));
1524
}
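// Illustrative sketch (not compiled into the runtime; a reading of the gather
// loops above, not a normative interface): skip_per_level[] encodes the tree
// shape, so a node at `tid` owns, at depth d, the ids tid + skip,
// tid + 2*skip, ... up to (but not including) tid + skip_per_level[d + 1],
// clamped to nproc. The oncore path starts at d == 1 because its leaf kids
// are handled separately through leaf_state.
#if 0
static void sketch_visit_children(int tid, int nproc, int my_level,
                                  const kmp_uint32 *skip_per_level) {
  for (int d = 0; d < my_level; ++d) { // lowest levels first
    kmp_uint32 skip = skip_per_level[d];
    kmp_uint32 last = tid + skip_per_level[d + 1];
    if (last > (kmp_uint32)nproc)
      last = nproc;
    for (int child_tid = tid + skip; child_tid < (int)last;
         child_tid += skip) {
      // gather from (or release) child_tid here
    }
  }
}
#endif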
1525
1526
static void __kmp_hierarchical_barrier_release(
1527
enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1528
int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530
kmp_team_t *team;
1531
kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532
kmp_uint32 nproc;
1533
bool team_change = false; // indicates on-core barrier shouldn't be used
1534
1535
if (KMP_MASTER_TID(tid)) {
1536
team = __kmp_threads[gtid]->th.th_team;
1537
KMP_DEBUG_ASSERT(team != NULL);
1538
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539
"entered barrier type %d\n",
1540
gtid, team->t.t_id, tid, bt));
1541
} else { // Worker threads
1542
// Wait for parent thread to release me
1543
if (!thr_bar->use_oncore_barrier ||
1544
__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1545
thr_bar->team == NULL) {
1546
// Use traditional method of waiting on my own b_go flag
1547
thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548
kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549
flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550
TCW_8(thr_bar->b_go,
1551
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552
} else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553
// infinite, not nested
1554
// Wait on my "offset" bits on parent's b_go flag
1555
thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556
kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557
thr_bar->offset + 1, bt,
1558
this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559
flag.wait(this_thr, TRUE);
1560
if (thr_bar->wait_flag ==
1561
KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562
TCW_8(thr_bar->b_go,
1563
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564
} else { // Reset my bits on parent's b_go flag
1565
(RCAST(volatile char *,
1566
&(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1567
}
1568
}
1569
thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570
// Early exit for reaping threads releasing forkjoin barrier
1571
if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572
return;
1573
// The worker thread may now assume that the team is valid.
1574
team = __kmp_threads[gtid]->th.th_team;
1575
KMP_DEBUG_ASSERT(team != NULL);
1576
tid = __kmp_tid_from_gtid(gtid);
1577
1578
KA_TRACE(
1579
20,
1580
("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582
KMP_MB(); // Flush all pending memory write invalidates.
1583
}
1584
1585
nproc = this_thr->th.th_team_nproc;
1586
int level = team->t.t_level;
1587
if (team->t.t_threads[0]
1588
->th.th_teams_microtask) { // are we inside the teams construct?
1589
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590
this_thr->th.th_teams_level == level)
1591
++level; // level was not increased in teams construct for team_of_workers
1592
if (this_thr->th.th_teams_size.nteams > 1)
1593
++level; // level was not increased in teams construct for team_of_masters
1594
}
1595
if (level == 1)
1596
thr_bar->use_oncore_barrier = 1;
1597
else
1598
thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1599
1600
// If the team size has increased, we still communicate with old leaves via
1601
// oncore barrier.
1602
unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603
kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604
team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605
tid, team);
1606
// But if the entire team changes, we won't use oncore barrier at all
1607
if (team_change)
1608
old_leaf_kids = 0;
1609
1610
#if KMP_BARRIER_ICV_PUSH
1611
if (propagate_icvs) {
1612
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1613
FALSE);
1614
if (KMP_MASTER_TID(
1615
tid)) { // primary already has copy in final destination; copy
1616
copy_icvs(&thr_bar->th_fixed_icvs,
1617
&team->t.t_implicit_task_taskdata[tid].td_icvs);
1618
} else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619
thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620
if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621
// leaves (on-core children) pull parent's fixed ICVs directly to local
1622
// ICV store
1623
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1624
&thr_bar->parent_bar->th_fixed_icvs);
1625
// non-leaves will get ICVs piggybacked with b_go via NGO store
1626
} else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627
if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so children
1628
// can access them
1629
copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630
else // leaves copy parent's fixed ICVs directly to local ICV store
1631
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1632
&thr_bar->parent_bar->th_fixed_icvs);
1633
}
1634
}
1635
#endif // KMP_BARRIER_ICV_PUSH
1636
1637
// Now, release my children
1638
if (thr_bar->my_level) { // not a leaf
1639
kmp_int32 child_tid;
1640
kmp_uint32 last;
1641
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642
thr_bar->use_oncore_barrier) {
1643
if (KMP_MASTER_TID(tid)) { // do a flat release
1644
// Set local b_go to bump children via NGO store of the cache line
1645
// containing ICVs and b_go.
1646
thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647
// Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648
// the cache line
1649
ngo_load(&thr_bar->th_fixed_icvs);
1650
// This loops over all the threads skipping only the leaf nodes in the
1651
// hierarchy
1652
for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1653
child_tid += thr_bar->skip_per_level[1]) {
1654
kmp_bstate_t *child_bar =
1655
&team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657
"releasing T#%d(%d:%d)"
1658
" go(%p): %u => %u\n",
1659
gtid, team->t.t_id, tid,
1660
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661
child_tid, &child_bar->b_go, child_bar->b_go,
1662
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663
// Use ngo store (if available) to both store ICVs and release child
1664
// via child's b_go
1665
ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666
}
1667
ngo_sync();
1668
}
1669
TCW_8(thr_bar->b_go,
1670
KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671
// Now, release leaf children
1672
if (thr_bar->leaf_kids) { // if there are any
1673
// We test team_change on the off-chance that the level 1 team changed.
1674
if (team_change ||
1675
old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676
if (old_leaf_kids) { // release old leaf kids
1677
thr_bar->b_go |= old_leaf_state;
1678
}
1679
// Release new leaf kids
1680
last = tid + thr_bar->skip_per_level[1];
1681
if (last > nproc)
1682
last = nproc;
1683
for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1684
++child_tid) { // skip_per_level[0]=1
1685
kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687
KA_TRACE(
1688
20,
1689
("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690
" T#%d(%d:%d) go(%p): %u => %u\n",
1691
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692
team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694
// Release child using child's b_go flag
1695
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696
flag.release();
1697
}
1698
} else { // Release all children at once with leaf_state bits on my own
1699
// b_go flag
1700
thr_bar->b_go |= thr_bar->leaf_state;
1701
}
1702
}
1703
} else { // Blocktime is not infinite; do a simple hierarchical release
1704
for (int d = thr_bar->my_level - 1; d >= 0;
1705
--d) { // Release highest level threads first
1706
last = tid + thr_bar->skip_per_level[d + 1];
1707
kmp_uint32 skip = thr_bar->skip_per_level[d];
1708
if (last > nproc)
1709
last = nproc;
1710
for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711
kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712
kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714
"releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715
gtid, team->t.t_id, tid,
1716
__kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717
child_tid, &child_bar->b_go, child_bar->b_go,
1718
child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719
// Release child using child's b_go flag
1720
kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721
flag.release();
1722
}
1723
}
1724
}
1725
#if KMP_BARRIER_ICV_PUSH
1726
if (propagate_icvs && !KMP_MASTER_TID(tid))
1727
// non-leaves copy ICVs from fixed ICVs to local dest
1728
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1729
&thr_bar->th_fixed_icvs);
1730
#endif // KMP_BARRIER_ICV_PUSH
1731
}
1732
KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733
"barrier type %d\n",
1734
gtid, team->t.t_id, tid, bt));
1735
}
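// Illustrative sketch (not compiled into the runtime; a reading of the oncore
// path above, not a normative interface): the oncore barrier treats the
// parent's 64-bit b_go as an array of per-leaf flag bytes, so the parent can
// release every leaf with a single update of its own b_go (the leaf_state OR
// above) while each leaf clears only its own byte, exactly as the RCAST to
// volatile char does in the release code.
#if 0
static void sketch_clear_my_oncore_byte(volatile kmp_uint64 *parent_b_go,
                                        kmp_uint32 offset) {
  // Reset only this leaf's byte; the other leaves' bytes stay set.
  (RCAST(volatile char *, parent_b_go))[offset + 1] = 0;
}
#endif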
1736
1737
// End of Barrier Algorithms
1738
1739
// type traits for cancellable value
1740
// if cancellable is true, then is_cancellable is a normal boolean variable
1741
// if cancellable is false, then is_cancellable is a compile-time constant
1742
template <bool cancellable> struct is_cancellable {};
1743
template <> struct is_cancellable<true> {
1744
bool value;
1745
is_cancellable() : value(false) {}
1746
is_cancellable(bool b) : value(b) {}
1747
is_cancellable &operator=(bool b) {
1748
value = b;
1749
return *this;
1750
}
1751
operator bool() const { return value; }
1752
};
1753
template <> struct is_cancellable<false> {
1754
is_cancellable &operator=(bool b) { return *this; }
1755
constexpr operator bool() const { return false; }
1756
};
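// Illustrative sketch (not compiled into the runtime): with cancellable ==
// false the conversion operator above is constexpr and always yields false,
// so a guard like the one below is folded away at compile time; with
// cancellable == true it simply reads the stored boolean. This is how
// __kmp_barrier_template pays for cancellation checks only in the
// cancellable instantiation.
#if 0
template <bool cancellable>
static int sketch_cancel_guard(bool step_cancelled) {
  is_cancellable<cancellable> cancelled;
  cancelled = step_cancelled; // discarded when cancellable == false
  if (cancelled)              // constant false when cancellable == false
    return 1;
  return 0;
}
#endif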
1757
1758
// Internal function to do a barrier.
1759
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1760
If reduce is non-NULL, do a split reduction barrier; otherwise, do a split
1761
barrier.
1762
When cancellable = false,
1763
returns 0 if primary thread, 1 if worker thread.
1764
When cancellable = true,
1765
returns 0 if not cancelled, 1 if cancelled. */
1766
template <bool cancellable = false>
1767
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768
size_t reduce_size, void *reduce_data,
1769
void (*reduce)(void *, void *)) {
1770
KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772
int tid = __kmp_tid_from_gtid(gtid);
1773
kmp_info_t *this_thr = __kmp_threads[gtid];
1774
kmp_team_t *team = this_thr->th.th_team;
1775
int status = 0;
1776
is_cancellable<cancellable> cancelled;
1777
#if OMPT_SUPPORT && OMPT_OPTIONAL
1778
ompt_data_t *my_task_data;
1779
ompt_data_t *my_parallel_data;
1780
void *return_address;
1781
ompt_sync_region_t barrier_kind;
1782
#endif
1783
1784
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785
__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786
1787
#if OMPT_SUPPORT
1788
if (ompt_enabled.enabled) {
1789
#if OMPT_OPTIONAL
1790
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793
barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794
if (ompt_enabled.ompt_callback_sync_region) {
1795
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796
barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797
return_address);
1798
}
1799
if (ompt_enabled.ompt_callback_sync_region_wait) {
1800
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801
barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802
return_address);
1803
}
1804
#endif
1805
// It is OK to report the barrier state after the barrier begin callback.
1806
// According to the OMPT specification, a compliant implementation may
1807
// even delay reporting this state until the barrier begins to wait.
1808
auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1809
switch (barrier_kind) {
1810
case ompt_sync_region_barrier_explicit:
1811
ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1812
break;
1813
case ompt_sync_region_barrier_implicit_workshare:
1814
ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1815
break;
1816
case ompt_sync_region_barrier_implicit_parallel:
1817
ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1818
break;
1819
case ompt_sync_region_barrier_teams:
1820
ompt_thr_info->state = ompt_state_wait_barrier_teams;
1821
break;
1822
case ompt_sync_region_barrier_implementation:
1823
[[fallthrough]];
1824
default:
1825
ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1826
}
1827
}
1828
#endif
1829
1830
if (!team->t.t_serialized) {
1831
#if USE_ITT_BUILD
1832
// This value will be used in itt notify events below.
1833
void *itt_sync_obj = NULL;
1834
#if USE_ITT_NOTIFY
1835
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836
itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1837
#endif
1838
#endif /* USE_ITT_BUILD */
1839
if (__kmp_tasking_mode == tskm_extra_barrier) {
1840
__kmp_tasking_barrier(team, this_thr, gtid);
1841
KA_TRACE(15,
1842
("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1843
__kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1844
}
1845
1846
/* Copy the blocktime info to the thread, where __kmp_wait_template() can
1847
access it when the team struct is not guaranteed to exist. */
1848
// See note about the corresponding code in __kmp_join_barrier() being
1849
// performance-critical.
1850
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1851
#if KMP_USE_MONITOR
1852
this_thr->th.th_team_bt_intervals =
1853
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1854
this_thr->th.th_team_bt_set =
1855
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1856
#else
1857
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1858
#endif
1859
}
1860
1861
#if USE_ITT_BUILD
1862
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1863
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
1864
#endif /* USE_ITT_BUILD */
1865
#if USE_DEBUGGER
1866
// Let the debugger know: the thread has arrived at the barrier and is waiting.
1867
if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1868
team->t.t_bar[bt].b_master_arrived += 1;
1869
} else {
1870
this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1871
} // if
1872
#endif /* USE_DEBUGGER */
1873
if (reduce != NULL) {
1874
// KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1875
this_thr->th.th_local.reduce_data = reduce_data;
1876
}
1877
1878
if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1879
__kmp_task_team_setup(this_thr, team);
1880
1881
if (cancellable) {
1882
cancelled = __kmp_linear_barrier_gather_cancellable(
1883
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1884
} else {
1885
switch (__kmp_barrier_gather_pattern[bt]) {
1886
case bp_dist_bar: {
1887
__kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1888
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1889
break;
1890
}
1891
case bp_hyper_bar: {
1892
// don't set branch bits to 0; use linear
1893
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1894
__kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1895
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1896
break;
1897
}
1898
case bp_hierarchical_bar: {
1899
__kmp_hierarchical_barrier_gather(
1900
bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1901
break;
1902
}
1903
case bp_tree_bar: {
1904
// don't set branch bits to 0; use linear
1905
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1906
__kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1907
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1908
break;
1909
}
1910
default: {
1911
__kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1912
reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1913
}
1914
}
1915
}
1916
1917
KMP_MB();
1918
1919
if (KMP_MASTER_TID(tid)) {
1920
status = 0;
1921
if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1922
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1923
}
1924
#if USE_DEBUGGER
1925
// Let the debugger know: all threads have arrived and are starting to leave
1926
// the barrier.
1927
team->t.t_bar[bt].b_team_arrived += 1;
1928
#endif
1929
1930
if (__kmp_omp_cancellation) {
1931
kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1932
// Reset cancellation flag for worksharing constructs
1933
if (cancel_request == cancel_loop ||
1934
cancel_request == cancel_sections) {
1935
KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1936
}
1937
}
1938
#if USE_ITT_BUILD
1939
/* TODO: In case of split reduction barrier, primary thread may send
1940
acquired event early, before the final summation into the shared
1941
variable is done (final summation can be a long operation for array
1942
reductions). */
1943
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1944
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
1945
#endif /* USE_ITT_BUILD */
1946
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1947
// Barrier - report frame end (only if active_level == 1)
1948
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1949
__kmp_forkjoin_frames_mode &&
1950
(this_thr->th.th_teams_microtask == NULL || // either not in teams
1951
this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1952
team->t.t_active_level == 1) {
1953
ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1954
kmp_uint64 cur_time = __itt_get_timestamp();
1955
kmp_info_t **other_threads = team->t.t_threads;
1956
int nproc = this_thr->th.th_team_nproc;
1957
int i;
1958
switch (__kmp_forkjoin_frames_mode) {
1959
case 1:
1960
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1961
loc, nproc);
1962
this_thr->th.th_frame_time = cur_time;
1963
break;
1964
case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1965
// be fixed)
1966
__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1967
1, loc, nproc);
1968
break;
1969
case 3:
1970
if (__itt_metadata_add_ptr) {
1971
// Initialize with primary thread's wait time
1972
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1973
// Set arrive time to zero to be able to check it in
1974
// __kmp_invoke_task(); the same is done inside the loop below
1975
this_thr->th.th_bar_arrive_time = 0;
1976
for (i = 1; i < nproc; ++i) {
1977
delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1978
other_threads[i]->th.th_bar_arrive_time = 0;
1979
}
1980
__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1981
cur_time, delta,
1982
(kmp_uint64)(reduce != NULL));
1983
}
1984
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1985
loc, nproc);
1986
this_thr->th.th_frame_time = cur_time;
1987
break;
1988
}
1989
}
1990
#endif /* USE_ITT_BUILD */
1991
} else {
1992
status = 1;
1993
#if USE_ITT_BUILD
1994
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1995
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
1996
#endif /* USE_ITT_BUILD */
1997
}
1998
if ((status == 1 || !is_split) && !cancelled) {
1999
if (cancellable) {
2000
cancelled = __kmp_linear_barrier_release_cancellable(
2001
bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2002
} else {
2003
switch (__kmp_barrier_release_pattern[bt]) {
2004
case bp_dist_bar: {
2005
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2006
__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2007
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2008
break;
2009
}
2010
case bp_hyper_bar: {
2011
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2012
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2013
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2014
break;
2015
}
2016
case bp_hierarchical_bar: {
2017
__kmp_hierarchical_barrier_release(
2018
bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2019
break;
2020
}
2021
case bp_tree_bar: {
2022
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2023
__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2024
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2025
break;
2026
}
2027
default: {
2028
__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2029
FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2030
}
2031
}
2032
}
2033
if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2034
__kmp_task_team_sync(this_thr, team);
2035
}
2036
}
2037
2038
#if USE_ITT_BUILD
2039
/* GEH: TODO: Move this under if-condition above and also include in
2040
__kmp_end_split_barrier(). This will more accurately represent the actual
2041
release time of the threads for split barriers. */
2042
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2043
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2044
#endif /* USE_ITT_BUILD */
2045
} else { // Team is serialized.
2046
status = 0;
2047
if (__kmp_tasking_mode != tskm_immediate_exec) {
2048
if (this_thr->th.th_task_team != NULL) {
2049
#if USE_ITT_NOTIFY
2050
void *itt_sync_obj = NULL;
2051
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2052
itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2053
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
2054
}
2055
#endif
2056
2057
KMP_DEBUG_ASSERT(
2058
this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2059
this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2060
TRUE);
2061
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2062
__kmp_task_team_setup(this_thr, team);
2063
2064
#if USE_ITT_BUILD
2065
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2066
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2067
#endif /* USE_ITT_BUILD */
2068
}
2069
}
2070
}
2071
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2072
gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2073
__kmp_tid_from_gtid(gtid), status));
2074
2075
#if OMPT_SUPPORT
2076
if (ompt_enabled.enabled) {
2077
#if OMPT_OPTIONAL
2078
if (ompt_enabled.ompt_callback_sync_region_wait) {
2079
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2080
barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2081
return_address);
2082
}
2083
if (ompt_enabled.ompt_callback_sync_region) {
2084
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2085
barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2086
return_address);
2087
}
2088
#endif
2089
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2090
}
2091
#endif
2092
2093
if (cancellable)
2094
return (int)cancelled;
2095
return status;
2096
}
2097
2098
// Returns 0 if primary thread, 1 if worker thread.
2099
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2100
size_t reduce_size, void *reduce_data,
2101
void (*reduce)(void *, void *)) {
2102
return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2103
reduce);
2104
}
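// Illustrative sketch (not compiled into the runtime): the simplest use of
// this entry point is a plain, non-split barrier with no reduction, in which
// case reduce_size, reduce_data and reduce are unused, mirroring the call in
// __kmp_barrier_gomp_cancel below.
#if 0
static void sketch_plain_barrier(int gtid) {
  // Returns 0 on the primary thread and 1 on workers.
  (void)__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}
#endif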
2105
2106
#if defined(KMP_GOMP_COMPAT)
2107
// Returns 1 if cancelled, 0 otherwise
2108
int __kmp_barrier_gomp_cancel(int gtid) {
2109
if (__kmp_omp_cancellation) {
2110
int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2111
0, NULL, NULL);
2112
if (cancelled) {
2113
int tid = __kmp_tid_from_gtid(gtid);
2114
kmp_info_t *this_thr = __kmp_threads[gtid];
2115
if (KMP_MASTER_TID(tid)) {
2116
// Primary thread does not need to revert anything
2117
} else {
2118
// Workers need to revert their private b_arrived flag
2119
this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2120
KMP_BARRIER_STATE_BUMP;
2121
}
2122
}
2123
return cancelled;
2124
}
2125
__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2126
return FALSE;
2127
}
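// Note on the revert above (a reading of the code, not normative): the
// cancellable gather bumps a worker's private b_arrived by
// KMP_BARRIER_STATE_BUMP as it checks in. When the barrier is cancelled, the
// matching release never consumes that bump, so the worker subtracts it again
// to keep its counter in step with what the next barrier instance will
// expect; the primary thread never advanced the team-wide counter, so it has
// nothing to undo.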
2128
#endif
2129
2130
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2131
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2132
KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2133
KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2134
int tid = __kmp_tid_from_gtid(gtid);
2135
kmp_info_t *this_thr = __kmp_threads[gtid];
2136
kmp_team_t *team = this_thr->th.th_team;
2137
2138
if (!team->t.t_serialized) {
2139
if (KMP_MASTER_GTID(gtid)) {
2140
switch (__kmp_barrier_release_pattern[bt]) {
2141
case bp_dist_bar: {
2142
__kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2143
FALSE USE_ITT_BUILD_ARG(NULL));
2144
break;
2145
}
2146
case bp_hyper_bar: {
2147
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2148
__kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2149
FALSE USE_ITT_BUILD_ARG(NULL));
2150
break;
2151
}
2152
case bp_hierarchical_bar: {
2153
__kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2154
FALSE USE_ITT_BUILD_ARG(NULL));
2155
break;
2156
}
2157
case bp_tree_bar: {
2158
KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2159
__kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2160
FALSE USE_ITT_BUILD_ARG(NULL));
2161
break;
2162
}
2163
default: {
2164
__kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2165
FALSE USE_ITT_BUILD_ARG(NULL));
2166
}
2167
}
2168
if (__kmp_tasking_mode != tskm_immediate_exec) {
2169
__kmp_task_team_sync(this_thr, team);
2170
} // if
2171
}
2172
}
2173
}
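// Note (a reading of the code, not normative): this routine performs only the
// release half of a barrier. It pairs with a split barrier whose gather half
// already completed via __kmp_barrier() with is_split = TRUE; once the caller
// has finished the work the split exposed (e.g. the final reduction), the
// primary thread releases the workers here, while workers and serialized
// teams fall through without doing anything.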
2174
2175
void __kmp_join_barrier(int gtid) {
2176
KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2177
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2178
2179
KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2180
2181
kmp_info_t *this_thr = __kmp_threads[gtid];
2182
kmp_team_t *team;
2183
int tid;
2184
#ifdef KMP_DEBUG
2185
int team_id;
2186
#endif /* KMP_DEBUG */
2187
#if USE_ITT_BUILD
2188
void *itt_sync_obj = NULL;
2189
#if USE_ITT_NOTIFY
2190
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
2191
// Get object created at fork_barrier
2192
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2193
#endif
2194
#endif /* USE_ITT_BUILD */
2195
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2196
int nproc = this_thr->th.th_team_nproc;
2197
#endif
2198
KMP_MB();
2199
2200
// Get current info
2201
team = this_thr->th.th_team;
2202
KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2203
tid = __kmp_tid_from_gtid(gtid);
2204
#ifdef KMP_DEBUG
2205
team_id = team->t.t_id;
2206
kmp_info_t *master_thread = this_thr->th.th_team_master;
2207
if (master_thread != team->t.t_threads[0]) {
2208
__kmp_print_structure();
2209
}
2210
#endif /* KMP_DEBUG */
2211
KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2212
KMP_MB();
2213
2214
// Verify state
2215
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2216
KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2217
KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2218
KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2219
gtid, team_id, tid));
2220
2221
#if OMPT_SUPPORT
2222
if (ompt_enabled.enabled) {
2223
#if OMPT_OPTIONAL
2224
ompt_data_t *my_task_data;
2225
ompt_data_t *my_parallel_data;
2226
void *codeptr = NULL;
2227
int ds_tid = this_thr->th.th_info.ds.ds_tid;
2228
if (KMP_MASTER_TID(ds_tid) &&
2229
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2230
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2231
codeptr = team->t.ompt_team_info.master_return_address;
2232
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2233
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2234
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2235
ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2236
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2237
sync_kind = ompt_sync_region_barrier_teams;
2238
ompt_state = ompt_state_wait_barrier_teams;
2239
}
2240
if (ompt_enabled.ompt_callback_sync_region) {
2241
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2242
sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2243
}
2244
if (ompt_enabled.ompt_callback_sync_region_wait) {
2245
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246
sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2247
}
2248
if (!KMP_MASTER_TID(ds_tid))
2249
this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2250
#endif
2251
this_thr->th.ompt_thread_info.state = ompt_state;
2252
}
2253
#endif
2254
2255
if (__kmp_tasking_mode == tskm_extra_barrier) {
2256
__kmp_tasking_barrier(team, this_thr, gtid);
2257
KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2258
gtid, team_id, tid));
2259
}
2260
#ifdef KMP_DEBUG
2261
if (__kmp_tasking_mode != tskm_immediate_exec) {
2262
KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2263
"%p, th_task_team = %p\n",
2264
__kmp_gtid_from_thread(this_thr), team_id,
2265
team->t.t_task_team[this_thr->th.th_task_state],
2266
this_thr->th.th_task_team));
2267
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2268
}
2269
#endif /* KMP_DEBUG */
2270
2271
/* Copy the blocktime info to the thread, where __kmp_wait_template() can
2272
access it when the team struct is not guaranteed to exist. Doing these
2273
loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
2274
we do not perform the copy if blocktime=infinite, since the values are not
2275
used by __kmp_wait_template() in that case. */
2276
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2277
#if KMP_USE_MONITOR
2278
this_thr->th.th_team_bt_intervals =
2279
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2280
this_thr->th.th_team_bt_set =
2281
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2282
#else
2283
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2284
#endif
2285
}
2286
2287
#if USE_ITT_BUILD
2288
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2289
__kmp_itt_barrier_starting(gtid, itt_sync_obj);
2290
#endif /* USE_ITT_BUILD */
2291
2292
switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2293
case bp_dist_bar: {
2294
__kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2295
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2296
break;
2297
}
2298
case bp_hyper_bar: {
2299
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2300
__kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2301
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2302
break;
2303
}
2304
case bp_hierarchical_bar: {
2305
__kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2306
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2307
break;
2308
}
2309
case bp_tree_bar: {
2310
KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2311
__kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2312
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2313
break;
2314
}
2315
default: {
2316
__kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2317
NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2318
}
2319
}
2320
2321
/* From this point on, the team data structure may be deallocated at any time
2322
by the primary thread - it is unsafe to reference it in any of the worker
2323
threads. Any per-team data items that need to be referenced before the
2324
end of the barrier should be moved to the kmp_task_team_t structs. */
2325
if (KMP_MASTER_TID(tid)) {
2326
if (__kmp_tasking_mode != tskm_immediate_exec) {
2327
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2328
}
2329
if (__kmp_display_affinity) {
2330
KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2331
}
2332
#if KMP_STATS_ENABLED
2333
// Have primary thread flag the workers to indicate they are now waiting for
2334
// next parallel region. Also wake them up so they switch their timers to
2335
// idle.
2336
for (int i = 0; i < team->t.t_nproc; ++i) {
2337
kmp_info_t *team_thread = team->t.t_threads[i];
2338
if (team_thread == this_thr)
2339
continue;
2340
team_thread->th.th_stats->setIdleFlag();
2341
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2342
team_thread->th.th_sleep_loc != NULL)
2343
__kmp_null_resume_wrapper(team_thread);
2344
}
2345
#endif
2346
#if USE_ITT_BUILD
2347
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2348
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
2349
#endif /* USE_ITT_BUILD */
2350
2351
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2352
// Join barrier - report frame end
2353
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2354
__kmp_forkjoin_frames_mode &&
2355
(this_thr->th.th_teams_microtask == NULL || // either not in teams
2356
this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2357
team->t.t_active_level == 1) {
2358
kmp_uint64 cur_time = __itt_get_timestamp();
2359
ident_t *loc = team->t.t_ident;
2360
kmp_info_t **other_threads = team->t.t_threads;
2361
switch (__kmp_forkjoin_frames_mode) {
2362
case 1:
2363
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2364
loc, nproc);
2365
break;
2366
case 2:
2367
__kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2368
loc, nproc);
2369
break;
2370
case 3:
2371
if (__itt_metadata_add_ptr) {
2372
// Initialize with primary thread's wait time
2373
kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2374
// Set arrive time to zero to be able to check it in
2375
// __kmp_invoke_task(); the same is done inside the loop below
2376
this_thr->th.th_bar_arrive_time = 0;
2377
for (int i = 1; i < nproc; ++i) {
2378
delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2379
other_threads[i]->th.th_bar_arrive_time = 0;
2380
}
2381
__kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2382
cur_time, delta, 0);
2383
}
2384
__kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2385
loc, nproc);
2386
this_thr->th.th_frame_time = cur_time;
2387
break;
2388
}
2389
}
2390
#endif /* USE_ITT_BUILD */
2391
}
2392
#if USE_ITT_BUILD
2393
else {
2394
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2395
__kmp_itt_barrier_middle(gtid, itt_sync_obj);
2396
}
2397
#endif /* USE_ITT_BUILD */
2398
2399
#if KMP_DEBUG
2400
if (KMP_MASTER_TID(tid)) {
2401
KA_TRACE(
2402
15,
2403
("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2404
gtid, team_id, tid, nproc));
2405
}
2406
#endif /* KMP_DEBUG */
2407
2408
// TODO now, mark worker threads as done so they may be disbanded
2409
KMP_MB(); // Flush all pending memory write invalidates.
2410
KA_TRACE(10,
2411
("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2412
2413
}
2414
2415
// TODO release worker threads' fork barriers as we are ready instead of all at
2416
// once
2417
void __kmp_fork_barrier(int gtid, int tid) {
2418
KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2419
KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2420
kmp_info_t *this_thr = __kmp_threads[gtid];
2421
kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2422
#if USE_ITT_BUILD
2423
void *itt_sync_obj = NULL;
2424
#endif /* USE_ITT_BUILD */
2425
#ifdef KMP_DEBUG
2426
if (team)
2427
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2428
(team != NULL) ? team->t.t_id : -1, tid));
2429
#endif
2430
// th_team pointer only valid for primary thread here
2431
if (KMP_MASTER_TID(tid)) {
2432
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2433
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2434
// Create itt barrier object
2435
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2436
__kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2437
}
2438
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2439
2440
#ifdef KMP_DEBUG
2441
KMP_DEBUG_ASSERT(team);
2442
kmp_info_t **other_threads = team->t.t_threads;
2443
int i;
2444
2445
// Verify state
2446
KMP_MB();
2447
2448
for (i = 1; i < team->t.t_nproc; ++i) {
2449
KA_TRACE(500,
2450
("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2451
"== %u.\n",
2452
gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2453
team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2454
other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2455
KMP_DEBUG_ASSERT(
2456
(TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2457
~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2458
KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2459
}
2460
#endif
2461
2462
if (__kmp_tasking_mode != tskm_immediate_exec)
2463
__kmp_task_team_setup(this_thr, team);
2464
2465
/* The primary thread may have changed its blocktime between join barrier
2466
and fork barrier. Copy the blocktime info to the thread, where
2467
__kmp_wait_template() can access it when the team struct is not
2468
guaranteed to exist. */
2469
// See note about the corresponding code in __kmp_join_barrier() being
2470
// performance-critical
2471
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2472
#if KMP_USE_MONITOR
2473
this_thr->th.th_team_bt_intervals =
2474
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2475
this_thr->th.th_team_bt_set =
2476
team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2477
#else
2478
this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2479
#endif
2480
}
2481
} // primary thread
2482
2483
switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2484
case bp_dist_bar: {
2485
__kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2486
TRUE USE_ITT_BUILD_ARG(NULL));
2487
break;
2488
}
2489
case bp_hyper_bar: {
2490
KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2491
__kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2492
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2493
break;
2494
}
2495
case bp_hierarchical_bar: {
2496
__kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2497
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2498
break;
2499
}
2500
case bp_tree_bar: {
2501
KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2502
__kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2503
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2504
break;
2505
}
2506
default: {
2507
__kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2508
TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2509
}
2510
}
2511
2512
#if OMPT_SUPPORT
2513
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2514
if (ompt_enabled.enabled &&
2515
(ompt_state == ompt_state_wait_barrier_teams ||
2516
ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2517
int ds_tid = this_thr->th.th_info.ds.ds_tid;
2518
ompt_data_t *task_data = (team)
2519
? OMPT_CUR_TASK_DATA(this_thr)
2520
: &(this_thr->th.ompt_thread_info.task_data);
2521
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2522
#if OMPT_OPTIONAL
2523
void *codeptr = NULL;
2524
if (KMP_MASTER_TID(ds_tid) &&
2525
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2526
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2527
codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2528
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2529
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2530
sync_kind = ompt_sync_region_barrier_teams;
2531
if (ompt_enabled.ompt_callback_sync_region_wait) {
2532
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2533
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2534
}
2535
if (ompt_enabled.ompt_callback_sync_region) {
2536
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2537
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2538
}
2539
#endif
2540
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2541
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2542
ompt_scope_end, NULL, task_data, 0, ds_tid,
2543
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2544
}
2545
}
2546
#endif
2547
2548
// Early exit for reaping threads releasing forkjoin barrier
2549
if (TCR_4(__kmp_global.g.g_done)) {
2550
this_thr->th.th_task_team = NULL;
2551
2552
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2553
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2554
if (!KMP_MASTER_TID(tid)) {
2555
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2556
if (itt_sync_obj)
2557
__kmp_itt_barrier_finished(gtid, itt_sync_obj);
2558
}
2559
}
2560
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2561
KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2562
return;
2563
}
2564
2565
/* We can now assume that a valid team structure has been allocated by the
2566
primary thread and propagated to all worker threads. The current thread,
2567
however, may not be part of the team, so we can't blindly assume that the
2568
team pointer is non-null. */
2569
team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2570
KMP_DEBUG_ASSERT(team != NULL);
2571
tid = __kmp_tid_from_gtid(gtid);
2572
2573
#if KMP_BARRIER_ICV_PULL
2574
/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2575
__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2576
implicit task has this data before this function is called. We cannot
2577
modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2578
thread struct, because it is not always the case that the threads arrays
2579
have been allocated when __kmp_fork_call() is executed. */
2580
{
2581
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2582
if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2583
// Copy the initial ICVs from the primary thread's thread struct to the
2584
// implicit task for this tid.
2585
KA_TRACE(10,
2586
("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2587
__kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2588
tid, FALSE);
2589
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2590
&team->t.t_threads[0]
2591
->th.th_bar[bs_forkjoin_barrier]
2592
.bb.th_fixed_icvs);
2593
}
2594
}
2595
#endif // KMP_BARRIER_ICV_PULL
2596
2597
if (__kmp_tasking_mode != tskm_immediate_exec) {
2598
__kmp_task_team_sync(this_thr, team);
2599
}
2600
2601
#if KMP_AFFINITY_SUPPORTED
2602
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2603
if (proc_bind == proc_bind_intel) {
2604
// Call dynamic affinity settings
2605
if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2606
__kmp_balanced_affinity(this_thr, team->t.t_nproc);
2607
}
2608
} else if (proc_bind != proc_bind_false) {
2609
if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2610
KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2611
__kmp_gtid_from_thread(this_thr),
2612
this_thr->th.th_current_place));
2613
} else {
2614
__kmp_affinity_bind_place(gtid);
2615
}
2616
}
2617
#endif // KMP_AFFINITY_SUPPORTED
2618
// Perform the display affinity functionality
2619
if (__kmp_display_affinity) {
2620
if (team->t.t_display_affinity
2621
#if KMP_AFFINITY_SUPPORTED
2622
|| (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2623
#endif
2624
) {
2625
// NULL means use the affinity-format-var ICV
2626
__kmp_aux_display_affinity(gtid, NULL);
2627
this_thr->th.th_prev_num_threads = team->t.t_nproc;
2628
this_thr->th.th_prev_level = team->t.t_level;
2629
}
2630
}
2631
if (!KMP_MASTER_TID(tid))
2632
KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2633
2634
#if USE_ITT_BUILD && USE_ITT_NOTIFY
2635
if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2636
if (!KMP_MASTER_TID(tid)) {
2637
// Get correct barrier object
2638
itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2639
__kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2640
} // (prepare called inside barrier_release)
2641
}
2642
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2643
KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2644
team->t.t_id, tid));
2645
}
2646
2647
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2648
kmp_internal_control_t *new_icvs, ident_t *loc) {
2649
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2650
2651
KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2652
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2653
2654
/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2655
__kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2656
implicit task has this data before this function is called. */
2657
#if KMP_BARRIER_ICV_PULL
2658
/* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2659
remains untouched), where all of the worker threads can access them and
2660
make their own copies after the barrier. */
2661
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2662
// allocated at this point
2663
copy_icvs(
2664
&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2665
new_icvs);
2666
KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2667
team->t.t_threads[0], team));
2668
#elif KMP_BARRIER_ICV_PUSH
2669
// The ICVs will be propagated in the fork barrier, so nothing needs to be
2670
// done here.
2671
KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2672
team->t.t_threads[0], team));
2673
#else
2674
// Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2675
// time.
2676
ngo_load(new_icvs);
2677
KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2678
// allocated at this point
2679
for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2680
// TODO: GEH - pass in better source location info since usually NULL here
2681
KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2682
f, team->t.t_threads[f], team));
2683
__kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2684
ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2685
KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2686
f, team->t.t_threads[f], team));
2687
}
2688
ngo_sync();
2689
#endif // KMP_BARRIER_ICV_PULL
2690
}
2691
2692