GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp
/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#ifndef KMP_USE_SHM
// Windows and WASI do not need these include files as they don't use shared
// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);

static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *new_nested_nth =
      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
          sizeof(kmp_nested_nthreads_t));
  int new_size = level + thr->th.th_set_nested_nth_sz;
  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
  for (int i = 0; i < level + 1; ++i)
    new_nested_nth->nth[i] = 0;
  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
    new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
  new_nested_nth->size = new_nested_nth->used = new_size;
  return new_nested_nth;
}

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return negative value because this
        // function can be called by thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destructed. It can happen when this function
  // is called in end library routine.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

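// [Editor's note] The block below is an illustrative sketch added for this
// annotated listing; it is not part of the upstream kmp_runtime.cpp. It
// isolates the stack-containment test the "internal algorithm" above relies
// on: stacks grow downward, so a local address belongs to a registered thread
// only if it lies within (stackbase - stacksize, stackbase]. The function name
// is hypothetical.
#if 0
static bool __kmp_sketch_addr_on_stack(const char *addr, const char *stack_base,
                                       size_t stack_size) {
  // Equivalent to the stack_addr <= stack_base && stack_diff <= stack_size
  // test performed per thread in __kmp_get_global_thread_id().
  return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
}
#endif
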
int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}

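// [Editor's note] Illustrative sketch, not part of the upstream file: the
// overlap check above treats two stacks as conflicting when either endpoint of
// one range falls strictly inside the other. A minimal standalone version of
// that predicate (hypothetical name):
#if 0
static bool __kmp_sketch_ranges_overlap(const char *beg, const char *end,
                                        const char *other_beg,
                                        const char *other_end) {
  // Same condition as the if() in __kmp_check_stack_overlap().
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif
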
/* ------------------------------------------------------------------------ */

void __kmp_infinite_loop(void) {
  static int done = FALSE;

  while (!done) {
    KMP_YIELD(TRUE);
  }
}

#define MAX_MESSAGE 512

void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows* OS by default abort() causes pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.

     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if ENABLE_LIBOMPTARGET
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
#endif

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

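// [Editor's note] Illustrative sketch, not part of the upstream file: the
// SINGLE construct above elects exactly one thread per construct instance by
// having every thread attempt a single atomic compare-and-exchange on the
// team's construct counter; only the winner sees the exchange succeed. A
// minimal standalone analogue using plain C++ atomics (hypothetical names):
#if 0
#include <atomic>
static bool sketch_enter_single(std::atomic<int> &team_construct,
                                int my_old_count) {
  int expected = my_old_count;
  // Succeeds for exactly one thread per construct; the rest observe a newer
  // counter value and skip the single block.
  return team_construct.compare_exchange_strong(expected, my_old_count + 1,
                                                std::memory_order_acquire);
}
#endif
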
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG

  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  }
  return new_nthreads;
}

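// [Editor's note] Illustrative sketch, not part of the upstream file: each of
// the limit checks in __kmp_reserve_threads() follows the same shape --
// compute how many additional threads the limit still allows, clamp the
// request to that, and never fall below one thread (a serialized region is
// always possible). A condensed standalone version of that clamp, with
// hypothetical names:
#if 0
static int sketch_clamp_to_limit(int requested, int already_active, int limit) {
  int allowed = limit - already_active;
  if (allowed < 1)
    allowed = 1; // always leave room for a 1-thread (serialized) region
  return requested < allowed ? requested : allowed;
}
#endif
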
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid,
                                    int fork_teams_workers) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    // Do not partition the places list for teams construct workers who
    // haven't actually been forked to do real work yet. This partitioning
    // will take place in the parallel region nested within the teams construct.
    if (!fork_teams_workers) {
      __kmp_partition_places(team);
    }
#endif

    if (team->t.t_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
    }
  }

  // Take care of primary thread's task state
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (use_hot_team) {
      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
      KA_TRACE(
          20,
          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
           "%p, new task_team %p / team %p\n",
           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
           team));

      // Store primary thread's current task state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);

      // Restore primary thread's task state to hot team's state
      // by using thread 1's task state
      if (team->t.t_nproc > 1) {
        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
                         team->t.t_threads[1]->th.th_task_state == 1);
        KMP_CHECK_UPDATE(master_th->th.th_task_state,
                         team->t.t_threads[1]->th.th_task_state);
      } else {
        master_th->th.th_task_state = 0;
      }
    } else {
      // Store primary thread's current task_state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);
      // Are not using hot team, so set task state to 0.
      master_th->th.th_task_state = 0;
    }
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team.
    // the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

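// [Editor's note] Illustrative sketch, not part of the upstream file: the
// KMP_CHECK_UPDATE pattern used above writes a shared team field only when the
// new value actually differs, so an unchanged cache line is never forced into
// modified state and the other team members keep reading their clean copy. A
// generic standalone equivalent of that idiom (hypothetical helper, not the
// runtime's macro):
#if 0
template <typename T> static inline void sketch_check_update(T &dst, T src) {
  if (dst != src) // skip the store when nothing changed
    dst = src;
}
#endif
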
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    if (this_thr->th.th_team->t.t_nested_nth)
      serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
    else
      serial_team->t.t_nested_nth = &__kmp_nested_nth;
    // Save previous team's task state on serial team structure
    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (this_thr->th.th_team->t.t_nested_nth)
      nested_nth = this_thr->th.th_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;
    this_thr->th.th_task_team = NULL;
    this_thr->th.th_task_state = 0;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level

    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (serial_team->t.t_nested_nth)
      nested_nth = serial_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    /* allocate/push task team stack */
    __kmp_push_task_team_node(this_thr, serial_team);

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // Don't use lw_taskteam after linking. Content was swapped.
1426
1427
/* OMPT implicit task begin */
1428
if (ompt_enabled.ompt_callback_implicit_task) {
1429
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431
OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433
OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434
__kmp_tid_from_gtid(global_tid);
1435
}
1436
1437
/* OMPT state */
1438
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440
OMPT_GET_FRAME_ADDRESS(0);
1441
}
1442
#endif
1443
}
1444
1445
// Test if this fork is for a team closely nested in a teams construct
1446
static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447
microtask_t microtask, int level,
1448
int teams_level, kmp_va_list ap) {
1449
return (master_th->th.th_teams_microtask && ap &&
1450
microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451
}
1452
1453
// Test if this fork is for the teams construct, i.e. to form the outer league
1454
// of teams
1455
static inline bool __kmp_is_entering_teams(int active_level, int level,
1456
int teams_level, kmp_va_list ap) {
1457
return ((ap == NULL && active_level == 0) ||
1458
(ap && teams_level > 0 && teams_level == level));
1459
}
1460
1461
// AC: This is start of parallel that is nested inside teams construct.
1462
// The team is actual (hot), all workers are ready at the fork barrier.
1463
// No lock needed to initialize the team a bit, then free workers.
1464
static inline int
1465
__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466
kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467
enum fork_context_e call_context, microtask_t microtask,
1468
launch_t invoker, int master_set_numthreads, int level,
1469
#if OMPT_SUPPORT
1470
ompt_data_t ompt_parallel_data, void *return_address,
1471
#endif
1472
kmp_va_list ap) {
1473
void **argv;
1474
int i;
1475
1476
parent_team->t.t_ident = loc;
1477
__kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478
parent_team->t.t_argc = argc;
1479
argv = (void **)parent_team->t.t_argv;
1480
for (i = argc - 1; i >= 0; --i) {
1481
*argv++ = va_arg(kmp_va_deref(ap), void *);
1482
}
1483
// Increment our nested depth levels, but not increase the serialization
1484
if (parent_team == master_th->th.th_serial_team) {
1485
// AC: we are in serialized parallel
1486
__kmpc_serialized_parallel(loc, gtid);
1487
KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488
1489
if (call_context == fork_context_gnu) {
1490
// AC: need to decrement t_serialized for enquiry functions to work
1491
// correctly, will restore at join time
1492
parent_team->t.t_serialized--;
1493
return TRUE;
1494
}
1495
1496
#if OMPD_SUPPORT
1497
parent_team->t.t_pkfn = microtask;
1498
#endif
1499
1500
#if OMPT_SUPPORT
1501
void *dummy;
1502
void **exit_frame_p;
1503
ompt_data_t *implicit_task_data;
1504
ompt_lw_taskteam_t lw_taskteam;
1505
1506
if (ompt_enabled.enabled) {
1507
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508
&ompt_parallel_data, return_address);
1509
exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510
1511
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512
// Don't use lw_taskteam after linking. Content was swapped.
1513
1514
/* OMPT implicit task begin */
1515
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516
if (ompt_enabled.ompt_callback_implicit_task) {
1517
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520
1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521
}
1522
1523
/* OMPT state */
1524
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525
} else {
1526
exit_frame_p = &dummy;
1527
}
1528
#endif
1529
1530
// AC: need to decrement t_serialized for enquiry functions to work
1531
// correctly, will restore at join time
1532
parent_team->t.t_serialized--;
1533
1534
{
1535
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538
#if OMPT_SUPPORT
1539
,
1540
exit_frame_p
1541
#endif
1542
);
1543
}
1544
1545
#if OMPT_SUPPORT
1546
if (ompt_enabled.enabled) {
1547
*exit_frame_p = NULL;
1548
OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549
if (ompt_enabled.ompt_callback_implicit_task) {
1550
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551
ompt_scope_end, NULL, implicit_task_data, 1,
1552
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553
}
1554
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555
__ompt_lw_taskteam_unlink(master_th);
1556
if (ompt_enabled.ompt_callback_parallel_end) {
1557
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558
&ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559
OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560
}
1561
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562
}
1563
#endif
1564
return TRUE;
1565
}
1566
1567
parent_team->t.t_pkfn = microtask;
1568
parent_team->t.t_invoke = invoker;
1569
KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570
parent_team->t.t_active_level++;
1571
parent_team->t.t_level++;
1572
parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573
1574
// If the threads allocated to the team are less than the thread limit, update
1575
// the thread limit here. th_teams_size.nth is specific to this team nested
1576
// in a teams construct, the team is fully created, and we're about to do
1577
// the actual fork. Best to do this here so that the subsequent uses below
1578
// and in the join have the correct value.
1579
master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580
1581
#if OMPT_SUPPORT
1582
if (ompt_enabled.enabled) {
1583
ompt_lw_taskteam_t lw_taskteam;
1584
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585
return_address);
1586
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587
}
1588
#endif
1589
1590
/* Change number of threads in the team if requested */
1591
if (master_set_numthreads) { // The parallel has num_threads clause
1592
if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593
// AC: can only reduce the number of threads dynamically, can't increase
1594
kmp_info_t **other_threads = parent_team->t.t_threads;
1595
// NOTE: if using distributed barrier, we need to run this code block
1596
// even when the team size appears not to have changed from the max.
1597
int old_proc = master_th->th.th_teams_size.nth;
1598
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599
__kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600
__kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601
}
1602
parent_team->t.t_nproc = master_set_numthreads;
1603
for (i = 0; i < master_set_numthreads; ++i) {
1604
other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605
}
1606
}
1607
// Keep extra threads hot in the team for possible subsequent parallel regions
1608
master_th->th.th_set_nproc = 0;
1609
}
1610
1611
#if USE_DEBUGGER
1612
if (__kmp_debugging) { // Let debugger override number of threads.
1613
int nth = __kmp_omp_num_threads(loc);
1614
if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615
master_set_numthreads = nth;
1616
}
1617
}
1618
#endif
1619
1620
// Figure out the proc_bind policy for the nested parallel within teams
1621
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622
// proc_bind_default means don't update
1623
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625
proc_bind = proc_bind_false;
1626
} else {
1627
// No proc_bind clause specified; use current proc-bind-var
1628
if (proc_bind == proc_bind_default) {
1629
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630
}
1631
/* else: The proc_bind policy was specified explicitly on the parallel clause.
1632
This overrides proc-bind-var for this parallel region, but does not
1633
change proc-bind-var. */
1634
// Figure the value of proc-bind-var for the child threads.
1635
if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636
(__kmp_nested_proc_bind.bind_types[level + 1] !=
1637
master_th->th.th_current_task->td_icvs.proc_bind)) {
1638
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639
}
1640
}
1641
KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642
// Need to change the bind-var ICV to correct value for each implicit task
1643
if (proc_bind_icv != proc_bind_default &&
1644
master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645
kmp_info_t **other_threads = parent_team->t.t_threads;
1646
for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647
other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648
}
1649
}
1650
// Reset for next parallel region
1651
master_th->th.th_set_proc_bind = proc_bind_default;
1652
1653
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1654
if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655
KMP_ITT_DEBUG) &&
1656
__kmp_forkjoin_frames_mode == 3 &&
1657
parent_team->t.t_active_level == 1 // only report frames at level 1
1658
&& master_th->th.th_teams_size.nteams == 1) {
1659
kmp_uint64 tmp_time = __itt_get_timestamp();
1660
master_th->th.th_frame_time = tmp_time;
1661
parent_team->t.t_region_time = tmp_time;
1662
}
1663
if (__itt_stack_caller_create_ptr) {
1664
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665
// create new stack stitching id before entering fork barrier
1666
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667
}
1668
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669
#if KMP_AFFINITY_SUPPORTED
1670
__kmp_partition_places(parent_team);
1671
#endif
1672
1673
KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674
"master_th=%p, gtid=%d\n",
1675
root, parent_team, master_th, gtid));
1676
__kmp_internal_fork(loc, gtid, parent_team);
1677
KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678
"master_th=%p, gtid=%d\n",
1679
root, parent_team, master_th, gtid));
1680
1681
if (call_context == fork_context_gnu)
1682
return TRUE;
1683
1684
/* Invoke microtask for PRIMARY thread */
1685
KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686
parent_team->t.t_id, parent_team->t.t_pkfn));
1687
1688
if (!parent_team->t.t_invoke(gtid)) {
1689
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690
}
1691
KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692
parent_team->t.t_id, parent_team->t.t_pkfn));
1693
KMP_MB(); /* Flush all pending memory write invalidates. */
1694
1695
KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696
1697
return TRUE;
1698
}
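/* Illustrative sketch, not part of the runtime (work() is a placeholder): the
   path above serves a parallel region closely nested inside a teams construct,
   as in

     #pragma omp teams num_teams(2) thread_limit(4)
     #pragma omp parallel num_threads(4)
     work();

   The initial thread of each team reaches __kmp_fork_call with
   th_teams_microtask set, and the fork is carried out on the existing parent
   team by __kmp_fork_in_teams rather than by allocating a new team. */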
1699
1700
// Create a serialized parallel region
1701
static inline int
1702
__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703
kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704
kmp_info_t *master_th, kmp_team_t *parent_team,
1705
#if OMPT_SUPPORT
1706
ompt_data_t *ompt_parallel_data, void **return_address,
1707
ompt_data_t **parent_task_data,
1708
#endif
1709
kmp_va_list ap) {
1710
kmp_team_t *team;
1711
int i;
1712
void **argv;
1713
1714
/* josh todo: hypothetical question: what do we do for OS X*? */
1715
#if KMP_OS_LINUX && \
1716
(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717
SimpleVLA<void *> args(argc);
1718
#else
1719
void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721
KMP_ARCH_AARCH64) */
1722
1723
KA_TRACE(
1724
20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725
1726
__kmpc_serialized_parallel(loc, gtid);
1727
1728
#if OMPD_SUPPORT
1729
master_th->th.th_serial_team->t.t_pkfn = microtask;
1730
#endif
1731
1732
if (call_context == fork_context_intel) {
1733
/* TODO this sucks, use the compiler itself to pass args! :) */
1734
master_th->th.th_serial_team->t.t_ident = loc;
1735
if (!ap) {
1736
// revert change made in __kmpc_serialized_parallel()
1737
master_th->th.th_serial_team->t.t_level--;
1738
// Get args from parent team for teams construct
1739
1740
#if OMPT_SUPPORT
1741
void *dummy;
1742
void **exit_frame_p;
1743
ompt_task_info_t *task_info;
1744
ompt_lw_taskteam_t lw_taskteam;
1745
1746
if (ompt_enabled.enabled) {
1747
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748
ompt_parallel_data, *return_address);
1749
1750
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751
// Don't use lw_taskteam after linking. Content was swapped.
1752
task_info = OMPT_CUR_TASK_INFO(master_th);
1753
exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754
if (ompt_enabled.ompt_callback_implicit_task) {
1755
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758
&(task_info->task_data), 1,
1759
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760
}
1761
1762
/* OMPT state */
1763
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764
} else {
1765
exit_frame_p = &dummy;
1766
}
1767
#endif
1768
1769
{
1770
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773
#if OMPT_SUPPORT
1774
,
1775
exit_frame_p
1776
#endif
1777
);
1778
}
1779
1780
#if OMPT_SUPPORT
1781
if (ompt_enabled.enabled) {
1782
*exit_frame_p = NULL;
1783
if (ompt_enabled.ompt_callback_implicit_task) {
1784
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785
ompt_scope_end, NULL, &(task_info->task_data), 1,
1786
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787
}
1788
*ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789
__ompt_lw_taskteam_unlink(master_th);
1790
if (ompt_enabled.ompt_callback_parallel_end) {
1791
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792
ompt_parallel_data, *parent_task_data,
1793
OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794
}
1795
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796
}
1797
#endif
1798
} else if (microtask == (microtask_t)__kmp_teams_master) {
1799
KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800
team = master_th->th.th_team;
1801
// team->t.t_pkfn = microtask;
1802
team->t.t_invoke = invoker;
1803
__kmp_alloc_argv_entries(argc, team, TRUE);
1804
team->t.t_argc = argc;
1805
argv = (void **)team->t.t_argv;
1806
for (i = argc - 1; i >= 0; --i)
1807
*argv++ = va_arg(kmp_va_deref(ap), void *);
1808
// AC: revert change made in __kmpc_serialized_parallel()
1809
// because initial code in teams should have level=0
1810
team->t.t_level--;
1811
// AC: call special invoker for outer "parallel" of teams construct
1812
invoker(gtid);
1813
#if OMPT_SUPPORT
1814
if (ompt_enabled.enabled) {
1815
ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816
if (ompt_enabled.ompt_callback_implicit_task) {
1817
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818
ompt_scope_end, NULL, &(task_info->task_data), 0,
1819
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820
}
1821
if (ompt_enabled.ompt_callback_parallel_end) {
1822
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823
ompt_parallel_data, *parent_task_data,
1824
OMPT_INVOKER(call_context) | ompt_parallel_league,
1825
*return_address);
1826
}
1827
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828
}
1829
#endif
1830
} else {
1831
argv = args;
1832
for (i = argc - 1; i >= 0; --i)
1833
*argv++ = va_arg(kmp_va_deref(ap), void *);
1834
KMP_MB();
1835
1836
#if OMPT_SUPPORT
1837
void *dummy;
1838
void **exit_frame_p;
1839
ompt_task_info_t *task_info;
1840
ompt_lw_taskteam_t lw_taskteam;
1841
ompt_data_t *implicit_task_data;
1842
1843
if (ompt_enabled.enabled) {
1844
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845
ompt_parallel_data, *return_address);
1846
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847
// Don't use lw_taskteam after linking. Content was swapped.
1848
task_info = OMPT_CUR_TASK_INFO(master_th);
1849
exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850
1851
/* OMPT implicit task begin */
1852
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853
if (ompt_enabled.ompt_callback_implicit_task) {
1854
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856
implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857
ompt_task_implicit);
1858
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859
}
1860
1861
/* OMPT state */
1862
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863
} else {
1864
exit_frame_p = &dummy;
1865
}
1866
#endif
1867
1868
{
1869
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871
__kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872
#if OMPT_SUPPORT
1873
,
1874
exit_frame_p
1875
#endif
1876
);
1877
}
1878
1879
#if OMPT_SUPPORT
1880
if (ompt_enabled.enabled) {
1881
*exit_frame_p = NULL;
1882
if (ompt_enabled.ompt_callback_implicit_task) {
1883
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884
ompt_scope_end, NULL, &(task_info->task_data), 1,
1885
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886
}
1887
1888
*ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889
__ompt_lw_taskteam_unlink(master_th);
1890
if (ompt_enabled.ompt_callback_parallel_end) {
1891
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892
ompt_parallel_data, *parent_task_data,
1893
OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894
}
1895
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896
}
1897
#endif
1898
}
1899
} else if (call_context == fork_context_gnu) {
1900
#if OMPT_SUPPORT
1901
if (ompt_enabled.enabled) {
1902
ompt_lw_taskteam_t lwt;
1903
__ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904
*return_address);
1905
1906
lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1907
__ompt_lw_taskteam_link(&lwt, master_th, 1);
1908
}
1909
// Don't use lw_taskteam after linking. Content was swapped.
1910
#endif
1911
1912
// we were called from GNU native code
1913
KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914
return FALSE;
1915
} else {
1916
KMP_ASSERT2(call_context < fork_context_last,
1917
"__kmp_serial_fork_call: unknown fork_context parameter");
1918
}
1919
1920
KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921
KMP_MB();
1922
return FALSE;
1923
}
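/* Illustrative sketch, not part of the runtime (work() is a placeholder): this
   serialized path is taken whenever __kmp_fork_call ends up with
   nthreads == 1, for example

     omp_set_num_threads(1);
     #pragma omp parallel
     work();

   The microtask then runs directly on the encountering thread via
   __kmp_invoke_microtask, and FALSE ("serialized") is reported back to the
   caller. */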
1924
1925
/* most of the work for a fork */
1926
/* return true if we really went parallel, false if serialized */
1927
int __kmp_fork_call(ident_t *loc, int gtid,
1928
enum fork_context_e call_context, // Intel, GNU, ...
1929
kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930
kmp_va_list ap) {
1931
void **argv;
1932
int i;
1933
int master_tid;
1934
int master_this_cons;
1935
kmp_team_t *team;
1936
kmp_team_t *parent_team;
1937
kmp_info_t *master_th;
1938
kmp_root_t *root;
1939
int nthreads;
1940
int master_active;
1941
int master_set_numthreads;
1942
int task_thread_limit = 0;
1943
int level;
1944
int active_level;
1945
int teams_level;
1946
#if KMP_NESTED_HOT_TEAMS
1947
kmp_hot_team_ptr_t **p_hot_teams;
1948
#endif
1949
{ // KMP_TIME_BLOCK
1950
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952
1953
KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954
if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955
/* Some systems prefer the stack for the root thread(s) to start with */
1956
/* some gap from the parent stack to prevent false sharing. */
1957
void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958
/* These 2 lines below are so this does not get optimized out */
1959
if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960
__kmp_stkpadding += (short)((kmp_int64)dummy);
1961
}
1962
1963
/* initialize if needed */
1964
KMP_DEBUG_ASSERT(
1965
__kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966
if (!TCR_4(__kmp_init_parallel))
1967
__kmp_parallel_initialize();
1968
__kmp_resume_if_soft_paused();
1969
1970
/* setup current data */
1971
// AC: potentially unsafe, not in sync with library shutdown,
1972
// __kmp_threads can be freed
1973
master_th = __kmp_threads[gtid];
1974
1975
parent_team = master_th->th.th_team;
1976
master_tid = master_th->th.th_info.ds.ds_tid;
1977
master_this_cons = master_th->th.th_local.this_construct;
1978
root = master_th->th.th_root;
1979
master_active = root->r.r_active;
1980
master_set_numthreads = master_th->th.th_set_nproc;
1981
task_thread_limit =
1982
master_th->th.th_current_task->td_icvs.task_thread_limit;
1983
1984
#if OMPT_SUPPORT
1985
ompt_data_t ompt_parallel_data = ompt_data_none;
1986
ompt_data_t *parent_task_data;
1987
ompt_frame_t *ompt_frame;
1988
void *return_address = NULL;
1989
1990
if (ompt_enabled.enabled) {
1991
__ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992
NULL, NULL);
1993
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994
}
1995
#endif
1996
1997
// Assign affinity to root thread if it hasn't happened yet
1998
__kmp_assign_root_init_mask();
1999
2000
// Nested level will be an index in the nested nthreads array
2001
level = parent_team->t.t_level;
2002
// used to launch non-serial teams even if nested is not allowed
2003
active_level = parent_team->t.t_active_level;
2004
// needed to check nesting inside the teams
2005
teams_level = master_th->th.th_teams_level;
2006
#if KMP_NESTED_HOT_TEAMS
2007
p_hot_teams = &master_th->th.th_hot_teams;
2008
if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009
*p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012
// either the actual hot team, or unused (when active_level > 0)
2013
(*p_hot_teams)[0].hot_team_nth = 1;
2014
}
2015
#endif
2016
2017
#if OMPT_SUPPORT
2018
if (ompt_enabled.enabled) {
2019
if (ompt_enabled.ompt_callback_parallel_begin) {
2020
int team_size = master_set_numthreads
2021
? master_set_numthreads
2022
: get__nproc_2(parent_team, master_tid);
2023
int flags = OMPT_INVOKER(call_context) |
2024
((microtask == (microtask_t)__kmp_teams_master)
2025
? ompt_parallel_league
2026
: ompt_parallel_team);
2027
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028
parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029
return_address);
2030
}
2031
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032
}
2033
#endif
2034
2035
master_th->th.th_ident = loc;
2036
2037
// Parallel closely nested in teams construct:
2038
if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039
return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040
call_context, microtask, invoker,
2041
master_set_numthreads, level,
2042
#if OMPT_SUPPORT
2043
ompt_parallel_data, return_address,
2044
#endif
2045
ap);
2046
} // End parallel closely nested in teams construct
2047
2048
// Need this to happen before we determine the number of threads, not while
2049
// we are allocating the team
2050
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051
2052
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053
2054
// Determine the number of threads
2055
int enter_teams =
2056
__kmp_is_entering_teams(active_level, level, teams_level, ap);
2057
if ((!enter_teams &&
2058
(parent_team->t.t_active_level >=
2059
master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060
(__kmp_library == library_serial)) {
2061
KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062
nthreads = 1;
2063
} else {
2064
nthreads = master_set_numthreads
2065
? master_set_numthreads
2066
// TODO: get nproc directly from current task
2067
: get__nproc_2(parent_team, master_tid);
2068
// Use the thread_limit set for the current target task if it exists, else go
2069
// with the deduced nthreads
2070
nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071
? task_thread_limit
2072
: nthreads;
2073
// Check if we need to take the forkjoin lock (no need for serialized
2074
// parallel out of teams construct).
2075
if (nthreads > 1) {
2076
/* determine how many new threads we can use */
2077
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078
/* AC: If we execute teams from parallel region (on host), then teams
2079
should be created but each can only have 1 thread if nesting is
2080
disabled. If teams are called from a serial region, then teams and their
2081
threads should be created regardless of the nesting setting. */
2082
nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083
nthreads, enter_teams);
2084
if (nthreads == 1) {
2085
// Free lock for single thread execution here; for multi-thread
2086
// execution it will be freed later after team of threads created
2087
// and initialized
2088
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089
}
2090
}
2091
}
2092
KMP_DEBUG_ASSERT(nthreads > 0);
2093
2094
// If we temporarily changed the set number of threads then restore it now
2095
master_th->th.th_set_nproc = 0;
2096
2097
if (nthreads == 1) {
2098
return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099
invoker, master_th, parent_team,
2100
#if OMPT_SUPPORT
2101
&ompt_parallel_data, &return_address,
2102
&parent_task_data,
2103
#endif
2104
ap);
2105
} // if (nthreads == 1)
2106
2107
// GEH: only modify the executing flag in the case when not serialized
2108
// serialized case is handled in kmpc_serialized_parallel
2109
KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110
"curtask=%p, curtask_max_aclevel=%d\n",
2111
parent_team->t.t_active_level, master_th,
2112
master_th->th.th_current_task,
2113
master_th->th.th_current_task->td_icvs.max_active_levels));
2114
// TODO: GEH - cannot do this assertion because root thread not set up as
2115
// executing
2116
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117
master_th->th.th_current_task->td_flags.executing = 0;
2118
2119
if (!master_th->th.th_teams_microtask || level > teams_level) {
2120
/* Increment our nested depth level */
2121
KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122
}
2123
2124
// See if we need to make a copy of the ICVs.
2125
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126
kmp_nested_nthreads_t *nested_nth = NULL;
2127
if (!master_th->th.th_set_nested_nth &&
2128
(level + 1 < parent_team->t.t_nested_nth->used) &&
2129
(parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130
nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131
} else if (master_th->th.th_set_nested_nth) {
2132
nested_nth = __kmp_override_nested_nth(master_th, level);
2133
if ((level + 1 < nested_nth->used) &&
2134
(nested_nth->nth[level + 1] != nthreads_icv))
2135
nthreads_icv = nested_nth->nth[level + 1];
2136
else
2137
nthreads_icv = 0; // don't update
2138
} else {
2139
nthreads_icv = 0; // don't update
2140
}
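/* Illustrative sketch, not part of the runtime (work() is a placeholder): with
   a nested number-of-threads list such as OMP_NUM_THREADS=4,2, the outer
   region runs with 4 threads and nth[level + 1] == 2 overrides nthreads_icv
   for the inner region:

     #pragma omp parallel          // 4 threads
     #pragma omp parallel          // 2 threads each, if nesting is allowed
     work();

   A value of 0 in nthreads_icv means "leave the inherited nproc ICV
   unchanged". */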
2141
2142
// Figure out the proc_bind_policy for the new team.
2143
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144
// proc_bind_default means don't update
2145
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147
proc_bind = proc_bind_false;
2148
} else {
2149
// No proc_bind clause specified; use current proc-bind-var for this
2150
// parallel region
2151
if (proc_bind == proc_bind_default) {
2152
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153
}
2154
// Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155
if (master_th->th.th_teams_microtask &&
2156
microtask == (microtask_t)__kmp_teams_master) {
2157
proc_bind = __kmp_teams_proc_bind;
2158
}
2159
/* else: The proc_bind policy was specified explicitly on the parallel clause.
2160
This overrides proc-bind-var for this parallel region, but does not
2161
change proc-bind-var. */
2162
// Figure the value of proc-bind-var for the child threads.
2163
if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164
(__kmp_nested_proc_bind.bind_types[level + 1] !=
2165
master_th->th.th_current_task->td_icvs.proc_bind)) {
2166
// Do not modify the proc bind icv for the two teams construct forks
2167
// They just let the proc bind icv pass through
2168
if (!master_th->th.th_teams_microtask ||
2169
!(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171
}
2172
}
2173
2174
// Reset for next parallel region
2175
master_th->th.th_set_proc_bind = proc_bind_default;
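/* Illustrative sketch, not part of the runtime: with OMP_PROC_BIND=spread,close
   the outer region is bound "spread" and bind_types[level + 1] ==
   proc_bind_close becomes the proc-bind-var of its implicit tasks, while an
   explicit clause such as

     #pragma omp parallel proc_bind(close)

   overrides the binding for that region only; as the comment above notes,
   proc-bind-var itself is not changed by the clause. */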
2176
2177
if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178
kmp_internal_control_t new_icvs;
2179
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180
new_icvs.next = NULL;
2181
if (nthreads_icv > 0) {
2182
new_icvs.nproc = nthreads_icv;
2183
}
2184
if (proc_bind_icv != proc_bind_default) {
2185
new_icvs.proc_bind = proc_bind_icv;
2186
}
2187
2188
/* allocate a new parallel team */
2189
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190
team = __kmp_allocate_team(root, nthreads, nthreads,
2191
#if OMPT_SUPPORT
2192
ompt_parallel_data,
2193
#endif
2194
proc_bind, &new_icvs,
2195
argc USE_NESTED_HOT_ARG(master_th));
2196
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198
} else {
2199
/* allocate a new parallel team */
2200
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201
team = __kmp_allocate_team(root, nthreads, nthreads,
2202
#if OMPT_SUPPORT
2203
ompt_parallel_data,
2204
#endif
2205
proc_bind,
2206
&master_th->th.th_current_task->td_icvs,
2207
argc USE_NESTED_HOT_ARG(master_th));
2208
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210
&master_th->th.th_current_task->td_icvs);
2211
}
2212
KF_TRACE(
2213
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214
2215
/* setup the new team */
2216
KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217
KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218
KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219
KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220
KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221
#if OMPT_SUPPORT
2222
KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223
return_address);
2224
#endif
2225
KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226
// TODO: parent_team->t.t_level == INT_MAX ???
2227
if (!master_th->th.th_teams_microtask || level > teams_level) {
2228
int new_level = parent_team->t.t_level + 1;
2229
KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230
new_level = parent_team->t.t_active_level + 1;
2231
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232
} else {
2233
// AC: Do not increase parallel level at start of the teams construct
2234
int new_level = parent_team->t.t_level;
2235
KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236
new_level = parent_team->t.t_active_level;
2237
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238
}
2239
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240
// set primary thread's schedule as new run-time schedule
2241
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242
2243
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245
2246
// Check if hot team has potentially outdated list, and if so, free it
2247
if (team->t.t_nested_nth &&
2248
team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250
KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251
team->t.t_nested_nth = NULL;
2252
}
2253
team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254
if (master_th->th.th_set_nested_nth) {
2255
if (!nested_nth)
2256
nested_nth = __kmp_override_nested_nth(master_th, level);
2257
team->t.t_nested_nth = nested_nth;
2258
KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259
master_th->th.th_set_nested_nth = NULL;
2260
master_th->th.th_set_nested_nth_sz = 0;
2261
master_th->th.th_nt_strict = false;
2262
}
2263
2264
// Update the floating point rounding in the team if required.
2265
propagateFPControl(team);
2266
#if OMPD_SUPPORT
2267
if (ompd_state & OMPD_ENABLE_BP)
2268
ompd_bp_parallel_begin();
2269
#endif
2270
2271
KA_TRACE(
2272
20,
2273
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275
team->t.t_nproc));
2276
KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277
(team->t.t_master_tid == 0 &&
2278
(team->t.t_parent == root->r.r_root_team ||
2279
team->t.t_parent->t.t_serialized)));
2280
KMP_MB();
2281
2282
/* now, setup the arguments */
2283
argv = (void **)team->t.t_argv;
2284
if (ap) {
2285
for (i = argc - 1; i >= 0; --i) {
2286
void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287
KMP_CHECK_UPDATE(*argv, new_argv);
2288
argv++;
2289
}
2290
} else {
2291
for (i = 0; i < argc; ++i) {
2292
// Get args from parent team for teams construct
2293
KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294
}
2295
}
2296
2297
/* now actually fork the threads */
2298
KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300
root->r.r_active = TRUE;
2301
2302
__kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303
__kmp_setup_icv_copy(team, nthreads,
2304
&master_th->th.th_current_task->td_icvs, loc);
2305
2306
#if OMPT_SUPPORT
2307
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308
#endif
2309
2310
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311
2312
#if USE_ITT_BUILD
2313
if (team->t.t_active_level == 1 // only report frames at level 1
2314
&& !master_th->th.th_teams_microtask) { // not in teams construct
2315
#if USE_ITT_NOTIFY
2316
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317
(__kmp_forkjoin_frames_mode == 3 ||
2318
__kmp_forkjoin_frames_mode == 1)) {
2319
kmp_uint64 tmp_time = 0;
2320
if (__itt_get_timestamp_ptr)
2321
tmp_time = __itt_get_timestamp();
2322
// Internal fork - report frame begin
2323
master_th->th.th_frame_time = tmp_time;
2324
if (__kmp_forkjoin_frames_mode == 3)
2325
team->t.t_region_time = tmp_time;
2326
} else
2327
// only one notification scheme (either "submit" or "forking/joined", not both)
2328
#endif /* USE_ITT_NOTIFY */
2329
if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330
__kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331
// Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333
}
2334
}
2335
#endif /* USE_ITT_BUILD */
2336
2337
/* now go on and do the work */
2338
KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339
KMP_MB();
2340
KF_TRACE(10,
2341
("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342
root, team, master_th, gtid));
2343
2344
#if USE_ITT_BUILD
2345
if (__itt_stack_caller_create_ptr) {
2346
// create new stack stitching id before entering fork barrier
2347
if (!enter_teams) {
2348
KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349
team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350
} else if (parent_team->t.t_serialized) {
2351
// keep stack stitching id in the serialized parent_team;
2352
// current team will be used for parallel inside the teams;
2353
// if parent_team is active, then it already keeps stack stitching id
2354
// for the league of teams
2355
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357
}
2358
}
2359
#endif /* USE_ITT_BUILD */
2360
2361
// AC: skip __kmp_internal_fork at teams construct, let only primary
2362
// threads execute
2363
if (ap) {
2364
__kmp_internal_fork(loc, gtid, team);
2365
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366
"master_th=%p, gtid=%d\n",
2367
root, team, master_th, gtid));
2368
}
2369
2370
if (call_context == fork_context_gnu) {
2371
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372
return TRUE;
2373
}
2374
2375
/* Invoke microtask for PRIMARY thread */
2376
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377
team->t.t_id, team->t.t_pkfn));
2378
} // END of timer KMP_fork_call block
2379
2380
#if KMP_STATS_ENABLED
2381
// If beginning a teams construct, then change thread state
2382
stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383
if (!ap) {
2384
KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385
}
2386
#endif
2387
2388
if (!team->t.t_invoke(gtid)) {
2389
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390
}
2391
2392
#if KMP_STATS_ENABLED
2393
// If this was the beginning of a teams construct, then reset the thread state
2394
if (!ap) {
2395
KMP_SET_THREAD_STATE(previous_state);
2396
}
2397
#endif
2398
2399
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400
team->t.t_id, team->t.t_pkfn));
2401
KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2403
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404
#if OMPT_SUPPORT
2405
if (ompt_enabled.enabled) {
2406
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407
}
2408
#endif
2409
2410
return TRUE;
2411
}
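/* Illustrative sketch, not part of the runtime (work() is a placeholder): a
   plain region such as

     #pragma omp parallel
     work();

   is lowered by the compiler into a call to __kmpc_fork_call(), which invokes
   __kmp_fork_call above with call_context == fork_context_intel and, once the
   primary thread has finished the microtask, drives the matching
   __kmp_join_call below. */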
2412
2413
#if OMPT_SUPPORT
2414
static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415
kmp_team_t *team) {
2416
// restore state outside the region
2417
thread->th.ompt_thread_info.state =
2418
((team->t.t_serialized) ? ompt_state_work_serial
2419
: ompt_state_work_parallel);
2420
}
2421
2422
static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423
kmp_team_t *team, ompt_data_t *parallel_data,
2424
int flags, void *codeptr) {
2425
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426
if (ompt_enabled.ompt_callback_parallel_end) {
2427
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428
parallel_data, &(task_info->task_data), flags, codeptr);
2429
}
2430
2431
task_info->frame.enter_frame = ompt_data_none;
2432
__kmp_join_restore_state(thread, team);
2433
}
2434
#endif
2435
2436
void __kmp_join_call(ident_t *loc, int gtid
2437
#if OMPT_SUPPORT
2438
,
2439
enum fork_context_e fork_context
2440
#endif
2441
,
2442
int exit_teams) {
2443
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444
kmp_team_t *team;
2445
kmp_team_t *parent_team;
2446
kmp_info_t *master_th;
2447
kmp_root_t *root;
2448
int master_active;
2449
2450
KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451
2452
/* setup current data */
2453
master_th = __kmp_threads[gtid];
2454
root = master_th->th.th_root;
2455
team = master_th->th.th_team;
2456
parent_team = team->t.t_parent;
2457
2458
master_th->th.th_ident = loc;
2459
2460
#if OMPT_SUPPORT
2461
void *team_microtask = (void *)team->t.t_pkfn;
2462
// For GOMP interface with serialized parallel, need the
2463
// __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464
// and end-parallel events.
2465
if (ompt_enabled.enabled &&
2466
!(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468
}
2469
#endif
2470
2471
#if KMP_DEBUG
2472
if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473
KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474
"th_task_team = %p\n",
2475
__kmp_gtid_from_thread(master_th), team,
2476
team->t.t_task_team[master_th->th.th_task_state],
2477
master_th->th.th_task_team));
2478
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479
}
2480
#endif
2481
2482
if (team->t.t_serialized) {
2483
if (master_th->th.th_teams_microtask) {
2484
// We are in teams construct
2485
int level = team->t.t_level;
2486
int tlevel = master_th->th.th_teams_level;
2487
if (level == tlevel) {
2488
// AC: we haven't incremented it earlier at start of teams construct,
2489
// so do it here - at the end of teams construct
2490
team->t.t_level++;
2491
} else if (level == tlevel + 1) {
2492
// AC: we are exiting parallel inside teams, need to increment
2493
// serialization in order to restore it in the next call to
2494
// __kmpc_end_serialized_parallel
2495
team->t.t_serialized++;
2496
}
2497
}
2498
__kmpc_end_serialized_parallel(loc, gtid);
2499
2500
#if OMPT_SUPPORT
2501
if (ompt_enabled.enabled) {
2502
if (fork_context == fork_context_gnu) {
2503
__ompt_lw_taskteam_unlink(master_th);
2504
}
2505
__kmp_join_restore_state(master_th, parent_team);
2506
}
2507
#endif
2508
2509
return;
2510
}
2511
2512
master_active = team->t.t_master_active;
2513
2514
if (!exit_teams) {
2515
// AC: No barrier for internal teams at exit from teams construct.
2516
// But there is a barrier for the external team (league).
2517
__kmp_internal_join(loc, gtid, team);
2518
#if USE_ITT_BUILD
2519
if (__itt_stack_caller_create_ptr) {
2520
KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521
// destroy the stack stitching id after join barrier
2522
__kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523
team->t.t_stack_id = NULL;
2524
}
2525
#endif
2526
} else {
2527
master_th->th.th_task_state =
2528
0; // AC: no tasking in teams (out of any parallel)
2529
#if USE_ITT_BUILD
2530
if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532
// destroy the stack stitching id on exit from the teams construct
2533
// if parent_team is active, then the id will be destroyed later on
2534
// by master of the league of teams
2535
__kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536
parent_team->t.t_stack_id = NULL;
2537
}
2538
#endif
2539
}
2540
2541
KMP_MB();
2542
2543
#if OMPT_SUPPORT
2544
ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545
void *codeptr = team->t.ompt_team_info.master_return_address;
2546
#endif
2547
2548
#if USE_ITT_BUILD
2549
// Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550
if (team->t.t_active_level == 1 &&
2551
(!master_th->th.th_teams_microtask || /* not in teams construct */
2552
master_th->th.th_teams_size.nteams == 1)) {
2553
master_th->th.th_ident = loc;
2554
// only one notification scheme (either "submit" or "forking/joined", not
2555
// both)
2556
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557
__kmp_forkjoin_frames_mode == 3)
2558
__kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559
master_th->th.th_frame_time, 0, loc,
2560
master_th->th.th_team_nproc, 1);
2561
else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562
!__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563
__kmp_itt_region_joined(gtid);
2564
} // active_level == 1
2565
#endif /* USE_ITT_BUILD */
2566
2567
#if KMP_AFFINITY_SUPPORTED
2568
if (!exit_teams) {
2569
// Restore master thread's partition.
2570
master_th->th.th_first_place = team->t.t_first_place;
2571
master_th->th.th_last_place = team->t.t_last_place;
2572
}
2573
#endif // KMP_AFFINITY_SUPPORTED
2574
2575
if (master_th->th.th_teams_microtask && !exit_teams &&
2576
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577
team->t.t_level == master_th->th.th_teams_level + 1) {
2578
// AC: We need to leave the team structure intact at the end of a parallel
2579
// inside the teams construct, so that the same (hot) team works at the next
2580
// parallel region; only adjust the nesting levels
2581
#if OMPT_SUPPORT
2582
ompt_data_t ompt_parallel_data = ompt_data_none;
2583
if (ompt_enabled.enabled) {
2584
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585
if (ompt_enabled.ompt_callback_implicit_task) {
2586
int ompt_team_size = team->t.t_nproc;
2587
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590
}
2591
task_info->frame.exit_frame = ompt_data_none;
2592
task_info->task_data = ompt_data_none;
2593
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594
__ompt_lw_taskteam_unlink(master_th);
2595
}
2596
#endif
2597
/* Decrement our nested depth level */
2598
team->t.t_level--;
2599
team->t.t_active_level--;
2600
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601
2602
// Restore number of threads in the team if needed. This code relies on
2603
// the proper adjustment of th_teams_size.nth after the fork in
2604
// __kmp_teams_master on each teams primary thread in the case that
2605
// __kmp_reserve_threads reduced it.
2606
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607
int old_num = master_th->th.th_team_nproc;
2608
int new_num = master_th->th.th_teams_size.nth;
2609
kmp_info_t **other_threads = team->t.t_threads;
2610
team->t.t_nproc = new_num;
2611
for (int i = 0; i < old_num; ++i) {
2612
other_threads[i]->th.th_team_nproc = new_num;
2613
}
2614
// Adjust the state of the unused threads of the team
2615
for (int i = old_num; i < new_num; ++i) {
2616
// Re-initialize thread's barrier data.
2617
KMP_DEBUG_ASSERT(other_threads[i]);
2618
kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619
for (int b = 0; b < bs_last_barrier; ++b) {
2620
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622
#if USE_DEBUGGER
2623
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624
#endif
2625
}
2626
if (__kmp_tasking_mode != tskm_immediate_exec) {
2627
// Synchronize thread's task state
2628
other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629
}
2630
}
2631
}
2632
2633
#if OMPT_SUPPORT
2634
if (ompt_enabled.enabled) {
2635
__kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636
OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637
}
2638
#endif
2639
2640
return;
2641
}
2642
2643
/* do cleanup and restore the parent team */
2644
master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645
master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646
2647
master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648
2649
/* jc: The following lock has instructions with REL and ACQ semantics,
2650
separating the parallel user code called in this parallel region
2651
from the serial user code called after this function returns. */
2652
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653
2654
if (!master_th->th.th_teams_microtask ||
2655
team->t.t_level > master_th->th.th_teams_level) {
2656
/* Decrement our nested depth level */
2657
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658
}
2659
KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660
2661
#if OMPT_SUPPORT
2662
if (ompt_enabled.enabled) {
2663
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664
if (ompt_enabled.ompt_callback_implicit_task) {
2665
int flags = (team_microtask == (void *)__kmp_teams_master)
2666
? ompt_task_initial
2667
: ompt_task_implicit;
2668
int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671
OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672
}
2673
task_info->frame.exit_frame = ompt_data_none;
2674
task_info->task_data = ompt_data_none;
2675
}
2676
#endif
2677
2678
KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679
master_th, team));
2680
__kmp_pop_current_task_from_thread(master_th);
2681
2682
master_th->th.th_def_allocator = team->t.t_def_allocator;
2683
2684
#if OMPD_SUPPORT
2685
if (ompd_state & OMPD_ENABLE_BP)
2686
ompd_bp_parallel_end();
2687
#endif
2688
updateHWFPControl(team);
2689
2690
if (root->r.r_active != master_active)
2691
root->r.r_active = master_active;
2692
2693
__kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694
master_th)); // this will free worker threads
2695
2696
/* This race was fun to find. Keep the following inside the critical
2697
region; otherwise assertions may occasionally fail because the old team may be
2698
reallocated and the hierarchy would appear inconsistent. It is actually safe to
2699
run and won't cause any bugs, but it will trigger those assertion failures. It's
2700
only one deref & assign, so it might as well stay in the critical region. */
2701
master_th->th.th_team = parent_team;
2702
master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703
master_th->th.th_team_master = parent_team->t.t_threads[0];
2704
master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705
2706
/* restore serialized team, if need be */
2707
if (parent_team->t.t_serialized &&
2708
parent_team != master_th->th.th_serial_team &&
2709
parent_team != root->r.r_root_team) {
2710
__kmp_free_team(root,
2711
master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712
master_th->th.th_serial_team = parent_team;
2713
}
2714
2715
if (__kmp_tasking_mode != tskm_immediate_exec) {
2716
// Restore primary thread's task state from team structure
2717
KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718
team->t.t_primary_task_state == 1);
2719
master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720
2721
// Copy the task team from the parent team to the primary thread
2722
master_th->th.th_task_team =
2723
parent_team->t.t_task_team[master_th->th.th_task_state];
2724
KA_TRACE(20,
2725
("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727
parent_team));
2728
}
2729
2730
// TODO: GEH - cannot do this assertion because root thread not set up as
2731
// executing
2732
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733
master_th->th.th_current_task->td_flags.executing = 1;
2734
2735
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736
2737
#if KMP_AFFINITY_SUPPORTED
2738
if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739
__kmp_reset_root_init_mask(gtid);
2740
}
2741
#endif
2742
#if OMPT_SUPPORT
2743
int flags =
2744
OMPT_INVOKER(fork_context) |
2745
((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746
: ompt_parallel_team);
2747
if (ompt_enabled.enabled) {
2748
__kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749
codeptr);
2750
}
2751
#endif
2752
2753
KMP_MB();
2754
KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755
}
2756
2757
/* Check whether we should push an internal control record onto the
2758
serial team stack. If so, do it. */
2759
void __kmp_save_internal_controls(kmp_info_t *thread) {
2760
2761
if (thread->th.th_team != thread->th.th_serial_team) {
2762
return;
2763
}
2764
if (thread->th.th_team->t.t_serialized > 1) {
2765
int push = 0;
2766
2767
if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768
push = 1;
2769
} else {
2770
if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771
thread->th.th_team->t.t_serialized) {
2772
push = 1;
2773
}
2774
}
2775
if (push) { /* push a record on the serial team's stack */
2776
kmp_internal_control_t *control =
2777
(kmp_internal_control_t *)__kmp_allocate(
2778
sizeof(kmp_internal_control_t));
2779
2780
copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781
2782
control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783
2784
control->next = thread->th.th_team->t.t_control_stack_top;
2785
thread->th.th_team->t.t_control_stack_top = control;
2786
}
2787
}
2788
}
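/* Illustrative sketch, not part of the runtime: the push above matters when an
   ICV is modified inside a nested, serialized region, e.g.

     #pragma omp parallel num_threads(1)
     {
       #pragma omp parallel num_threads(1)   // t_serialized > 1 here
       omp_set_num_threads(8);               // pushes a control record
     }

   so the previous ICV values can be restored when the serialized region ends
   in __kmpc_end_serialized_parallel. */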
2789
2790
/* Changes set_nproc */
2791
void __kmp_set_num_threads(int new_nth, int gtid) {
2792
kmp_info_t *thread;
2793
kmp_root_t *root;
2794
2795
KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796
KMP_DEBUG_ASSERT(__kmp_init_serial);
2797
2798
if (new_nth < 1)
2799
new_nth = 1;
2800
else if (new_nth > __kmp_max_nth)
2801
new_nth = __kmp_max_nth;
2802
2803
KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804
thread = __kmp_threads[gtid];
2805
if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806
return; // nothing to do
2807
2808
__kmp_save_internal_controls(thread);
2809
2810
set__nproc(thread, new_nth);
2811
2812
// If this omp_set_num_threads() call will cause the hot team size to be
2813
// reduced (in the absence of a num_threads clause), then reduce it now,
2814
// rather than waiting for the next parallel region.
2815
root = thread->th.th_root;
2816
if (__kmp_init_parallel && (!root->r.r_active) &&
2817
(root->r.r_hot_team->t.t_nproc > new_nth)
2818
#if KMP_NESTED_HOT_TEAMS
2819
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820
#endif
2821
) {
2822
kmp_team_t *hot_team = root->r.r_hot_team;
2823
int f;
2824
2825
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826
2827
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828
__kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829
}
2830
// Release the extra threads we don't need any more.
2831
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833
if (__kmp_tasking_mode != tskm_immediate_exec) {
2834
// When decreasing team size, threads no longer in the team should unref
2835
// task team.
2836
hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837
}
2838
__kmp_free_thread(hot_team->t.t_threads[f]);
2839
hot_team->t.t_threads[f] = NULL;
2840
}
2841
hot_team->t.t_nproc = new_nth;
2842
#if KMP_NESTED_HOT_TEAMS
2843
if (thread->th.th_hot_teams) {
2844
KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845
thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846
}
2847
#endif
2848
2849
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850
hot_team->t.b->update_num_threads(new_nth);
2851
__kmp_add_threads_to_team(hot_team, new_nth);
2852
}
2853
2854
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855
2856
// Update the t_nproc field in the threads that are still active.
2857
for (f = 0; f < new_nth; f++) {
2858
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859
hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860
}
2861
// Special flag to mark an omp_set_num_threads() call
2862
hot_team->t.t_size_changed = -1;
2863
}
2864
}
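/* Illustrative sketch, not part of the runtime (work() is a placeholder): this
   is the path behind omp_set_num_threads(). For example

     omp_set_num_threads(2);      // nproc ICV becomes 2; extra hot-team
     #pragma omp parallel         // threads are released above right away
     work();

   so the next parallel region starts out with the reduced team size instead of
   trimming the hot team lazily at fork time. */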
2865
2866
/* Changes max_active_levels */
2867
void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868
kmp_info_t *thread;
2869
2870
KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871
"%d = (%d)\n",
2872
gtid, max_active_levels));
2873
KMP_DEBUG_ASSERT(__kmp_init_serial);
2874
2875
// validate max_active_levels
2876
if (max_active_levels < 0) {
2877
KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878
// We ignore this call if the user has specified a negative value.
2879
// The current setting won't be changed. The last valid setting will be
2880
// used. A warning will be issued (if warnings are allowed as controlled by
2881
// the KMP_WARNINGS env var).
2882
KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883
"max_active_levels for thread %d = (%d)\n",
2884
gtid, max_active_levels));
2885
return;
2886
}
2887
if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888
// it's OK, the max_active_levels is within the valid range: [ 0;
2889
// KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890
// We allow a zero value. (implementation defined behavior)
2891
} else {
2892
KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893
KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895
// Current upper limit is MAX_INT. (implementation defined behavior)
2896
// If the input exceeds the upper limit, we correct the input to be the
2897
// upper limit. (implementation defined behavior)
2898
// In practice, control should never reach here while the limit is MAX_INT.
2899
}
2900
KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901
"max_active_levels for thread %d = (%d)\n",
2902
gtid, max_active_levels));
2903
2904
thread = __kmp_threads[gtid];
2905
2906
__kmp_save_internal_controls(thread);
2907
2908
set__max_active_levels(thread, max_active_levels);
2909
}
2910
2911
/* Gets max_active_levels */
2912
int __kmp_get_max_active_levels(int gtid) {
2913
kmp_info_t *thread;
2914
2915
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916
KMP_DEBUG_ASSERT(__kmp_init_serial);
2917
2918
thread = __kmp_threads[gtid];
2919
KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921
"curtask_maxaclevel=%d\n",
2922
gtid, thread->th.th_current_task,
2923
thread->th.th_current_task->td_icvs.max_active_levels));
2924
return thread->th.th_current_task->td_icvs.max_active_levels;
2925
}
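/* Illustrative sketch, not part of the runtime (work() is a placeholder):
   these two routines back omp_set_max_active_levels() and
   omp_get_max_active_levels():

     omp_set_max_active_levels(2);
     #pragma omp parallel          // active level 1: forks normally
     #pragma omp parallel          // active level 2: forks normally
     #pragma omp parallel          // would be level 3: serialized
     work();

   This is the same max_active_levels ICV consulted by __kmp_fork_call when it
   decides whether to serialize a nested region. */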
2926
2927
// nteams-var per-device ICV
2928
void __kmp_set_num_teams(int num_teams) {
2929
if (num_teams > 0)
2930
__kmp_nteams = num_teams;
2931
}
2932
int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933
// teams-thread-limit-var per-device ICV
2934
void __kmp_set_teams_thread_limit(int limit) {
2935
if (limit > 0)
2936
__kmp_teams_thread_limit = limit;
2937
}
2938
int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
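/* Illustrative sketch, not part of the runtime (work() is a placeholder):
   these per-device ICVs back omp_set_num_teams() and
   omp_set_teams_thread_limit():

     omp_set_num_teams(4);
     omp_set_teams_thread_limit(8);
     #pragma omp teams             // up to 4 teams, at most 8 threads each
     work();

   Non-positive arguments are ignored above, so the previous values remain in
   effect. */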
2939
2940
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942
2943
/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2944
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945
kmp_info_t *thread;
2946
kmp_sched_t orig_kind;
2947
// kmp_team_t *team;
2948
2949
KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950
gtid, (int)kind, chunk));
2951
KMP_DEBUG_ASSERT(__kmp_init_serial);
2952
2953
// Check if the kind parameter is valid, correct if needed.
2954
// Valid parameters should fit in one of two intervals - standard or extended:
2955
// <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956
// 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2957
orig_kind = kind;
2958
kind = __kmp_sched_without_mods(kind);
2959
2960
if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961
(kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962
// TODO: Hint needs attention in case we change the default schedule.
2963
__kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964
KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965
__kmp_msg_null);
2966
kind = kmp_sched_default;
2967
chunk = 0; // ignore chunk value in case of bad kind
2968
}
2969
2970
thread = __kmp_threads[gtid];
2971
2972
__kmp_save_internal_controls(thread);
2973
2974
if (kind < kmp_sched_upper_std) {
2975
if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976
// differentiate static chunked vs. unchunked: chunk should be invalid to
2977
// indicate unchunked schedule (which is the default)
2978
thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979
} else {
2980
thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981
__kmp_sch_map[kind - kmp_sched_lower - 1];
2982
}
2983
} else {
2984
// __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985
// kmp_sched_lower - 2 ];
2986
thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987
__kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988
kmp_sched_lower - 2];
2989
}
2990
__kmp_sched_apply_mods_intkind(
2991
orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992
if (kind == kmp_sched_auto || chunk < 1) {
2993
// ignore parameter chunk for schedule auto
2994
thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995
} else {
2996
thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997
}
2998
}
2999
3000
/* Gets def_sched_var ICV values */
3001
void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002
kmp_info_t *thread;
3003
enum sched_type th_type;
3004
3005
KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006
KMP_DEBUG_ASSERT(__kmp_init_serial);
3007
3008
thread = __kmp_threads[gtid];
3009
3010
th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011
switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012
case kmp_sch_static:
3013
case kmp_sch_static_greedy:
3014
case kmp_sch_static_balanced:
3015
*kind = kmp_sched_static;
3016
__kmp_sched_apply_mods_stdkind(kind, th_type);
3017
*chunk = 0; // chunk was not set; indicate this with a zero value
3018
return;
3019
case kmp_sch_static_chunked:
3020
*kind = kmp_sched_static;
3021
break;
3022
case kmp_sch_dynamic_chunked:
3023
*kind = kmp_sched_dynamic;
3024
break;
3025
case kmp_sch_guided_chunked:
3026
case kmp_sch_guided_iterative_chunked:
3027
case kmp_sch_guided_analytical_chunked:
3028
*kind = kmp_sched_guided;
3029
break;
3030
case kmp_sch_auto:
3031
*kind = kmp_sched_auto;
3032
break;
3033
case kmp_sch_trapezoidal:
3034
*kind = kmp_sched_trapezoidal;
3035
break;
3036
#if KMP_STATIC_STEAL_ENABLED
3037
case kmp_sch_static_steal:
3038
*kind = kmp_sched_static_steal;
3039
break;
3040
#endif
3041
default:
3042
KMP_FATAL(UnknownSchedulingType, th_type);
3043
}
3044
3045
__kmp_sched_apply_mods_stdkind(kind, th_type);
3046
*chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047
}
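/* Illustrative sketch, not part of the runtime (n and work() are
   placeholders): the pair above implements omp_set_schedule() and
   omp_get_schedule(), e.g.

     omp_set_schedule(omp_sched_dynamic, 4);     // run-time schedule ICV
     #pragma omp parallel for schedule(runtime)
     for (int i = 0; i < n; ++i)
       work(i);

   A chunk below 1 (or kind auto) falls back to KMP_DEFAULT_CHUNK on the set
   side, and the get side reports chunk 0 for an unchunked static schedule. */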
3048
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 0;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level;
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  if (ii == level)
    return __kmp_tid_from_gtid(gtid);

  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  return (dd > 1) ? (0) : (team->t.t_master_tid);
}

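// Companion to __kmp_get_ancestor_thread_num(): returns the number of threads
// in this thread's ancestor team at nesting depth `level` (1 for level 0, -1
// for invalid levels), using the same parent-team walk.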
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 1;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level;
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}

kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently. So one can get the updated schedule here.

  kmp_r_sched_t r_sched;

  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep original value, so that user can set
  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
  // different roots (even in OMP 2.5)
  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
  if (s == kmp_sch_static) {
    // replace STATIC with more detailed schedule (balanced or greedy)
    r_sched.r_sched_type = __kmp_static;
  } else if (s == kmp_sch_guided_chunked) {
    // replace GUIDED with more detailed schedule (iterative or analytical)
    r_sched.r_sched_type = __kmp_guided;
  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
    r_sched.r_sched_type = __kmp_sched;
  }
  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);

  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was not ever set)
    r_sched.chunk = KMP_DEFAULT_CHUNK;
  } else {
    r_sched.chunk = __kmp_chunk;
  }

  return r_sched;
}

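// Growth policy for the argv storage below: argc values up to
// KMP_INLINE_ARGV_ENTRIES reuse the space embedded in the team structure;
// larger requests are heap-allocated with room for at least
// KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2 * argc once argc exceeds half of
// that minimum, so repeated small growths do not force a reallocation each
// time.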
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {

  KMP_DEBUG_ASSERT(team);
  if (!realloc || argc > team->t.t_max_argc) {

    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
                   "current entries=%d\n",
                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
    /* if previously allocated heap space for args, free them */
    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
      __kmp_free((void *)team->t.t_argv);

    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
      /* use unused space in the cache line for arguments */
      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv = &team->t.t_inline_argv[0];
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
      }
    } else {
      /* allocate space for arguments in the heap */
      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
                               : 2 * argc;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv =
          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
      }
    }
  }
}

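// The next three helpers manage the per-team arrays (t_threads, t_disp_buffer,
// t_dispatch, t_implicit_task_taskdata), all sized by the team's max_nth;
// single-threaded teams only get two dispatch buffers instead of
// __kmp_dispatch_num_buffers.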
static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
  int i;
  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
  team->t.t_threads =
      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
      sizeof(dispatch_shared_info_t) * num_disp_buff);
  team->t.t_dispatch =
      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
  team->t.t_implicit_task_taskdata =
      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
  team->t.t_max_nproc = max_nth;

  /* setup dispatch buffers */
  for (i = 0; i < num_disp_buff; ++i) {
    team->t.t_disp_buffer[i].buffer_index = i;
    team->t.t_disp_buffer[i].doacross_buf_idx = i;
  }
}

static void __kmp_free_team_arrays(kmp_team_t *team) {
  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
  int i;
  for (i = 0; i < team->t.t_max_nproc; ++i) {
    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
      team->t.t_dispatch[i].th_disp_buffer = NULL;
    }
  }
#if KMP_USE_HIER_SCHED
  __kmp_dispatch_free_hierarchies(team);
#endif
  __kmp_free(team->t.t_threads);
  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  team->t.t_threads = NULL;
  team->t.t_disp_buffer = NULL;
  team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = 0;
}

static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
  kmp_info_t **oldThreads = team->t.t_threads;

  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  __kmp_allocate_team_arrays(team, max_nth);

  KMP_MEMCPY(team->t.t_threads, oldThreads,
             team->t.t_nproc * sizeof(kmp_info_t *));

  __kmp_free(oldThreads);
}

static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
      // intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
      // on task. This is used in the case of target thread_limit
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}

static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}

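// __kmp_initialize_root() builds the two teams every root owns: a serialized
// root team of size 1 for code outside parallel regions, and a hot team
// (initially size 1, capacity __kmp_dflt_team_nth_ub * 2) that is kept around
// and reused by non-nested parallel regions instead of being re-allocated for
// each region.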
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin);

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;
#if KMP_AFFINITY_SUPPORTED
  root->r.r_affinity_assigned = FALSE;
#endif

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  root_team->t.t_nested_nth = &__kmp_nested_nth;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
  hot_team->t.t_nested_nth = &__kmp_nested_nth;
}

#ifdef KMP_DEBUG

typedef struct kmp_team_list_item {
  kmp_team_p const *entry;
  struct kmp_team_list_item *next;
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;

static void __kmp_print_structure_team_accum( // Add team to list of teams.
    kmp_team_list_t list, // List of teams.
    kmp_team_p const *team // Team to add.
) {

  // List must terminate with item where both entry and next are NULL.
  // Team is added to the list only once.
  // List is sorted in ascending order by team id.
  // Team id is *not* a key.

  kmp_team_list_t l;

  KMP_DEBUG_ASSERT(list != NULL);
  if (team == NULL) {
    return;
  }

  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);

  // Search list for the team.
  l = list;
  while (l->next != NULL && l->entry != team) {
    l = l->next;
  }
  if (l->next != NULL) {
    return; // Team has been added before, exit.
  }

  // Team is not found. Search list again for insertion point.
  l = list;
  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
    l = l->next;
  }

  // Insert team.
  {
    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
        sizeof(kmp_team_list_item_t));
    *item = *l;
    l->entry = team;
    l->next = item;
  }
}

static void __kmp_print_structure_team(char const *title, kmp_team_p const *team

) {
  __kmp_printf("%s", title);
  if (team != NULL) {
    __kmp_printf("%2x %p\n", team->t.t_id, team);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

static void __kmp_print_structure_thread(char const *title,
                                         kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread != NULL) {
    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams.
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ",
                                   thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Primary: ",
                                     thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ",
                                     root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}

#endif

//---------------------------------------------------------------------------
// Stuff for per-thread fast random number generator
// Table of primes
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};

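// Each thread owns a 32-bit multiplicative linear congruential generator:
// the state advances as th_x = th_x * th_a + 1 (mod 2^32) and only the high
// 16 bits are handed out, which avoids the weak low-order bits of an LCG.
// th_a is picked per thread from __kmp_primes above, so threads with
// different tids generally use different multipliers. A rough standalone
// sketch of the same recurrence:
//   unsigned x = (seed + 1) * a + 1;                       // __kmp_init_random
//   unsigned short next() {                                // __kmp_get_random
//     unsigned short r = (unsigned short)(x >> 16);
//     x = x * a + 1;
//     return r;
//   }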
//---------------------------------------------------------------------------
// __kmp_get_random: Get a random number using a linear congruential method.
unsigned short __kmp_get_random(kmp_info_t *thread) {
  unsigned x = thread->th.th_x;
  unsigned short r = (unsigned short)(x >> 16);

  thread->th.th_x = x * thread->th.th_a + 1;

  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
                thread->th.th_info.ds.ds_tid, r));

  return r;
}
//--------------------------------------------------------
// __kmp_init_random: Initialize a random number generator
void __kmp_init_random(kmp_info_t *thread) {
  unsigned seed = thread->th.th_info.ds.ds_tid;

  thread->th.th_a =
      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
  KA_TRACE(30,
           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
}

#if KMP_OS_WINDOWS
/* reclaim array entries for root threads that are already dead, returns number
 * reclaimed */
static int __kmp_reclaim_dead_roots(void) {
  int i, r = 0;

  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i) &&
        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
        !__kmp_root[i]
             ->r.r_active) { // AC: reclaim only roots died in non-active state
      r += __kmp_unregister_root_other_thread(i);
    }
  }
  return r;
}
#endif

/* This function attempts to create free entries in __kmp_threads and
   __kmp_root, and returns the number of free entries generated.

   For Windows* OS static library, the first mechanism used is to reclaim array
   entries for root threads that are already dead.

   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
   the threadprivate cache array has been created. Synchronization with
   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.

   After any dead root reclamation, if the clipping value allows array expansion
   to result in the generation of a total of nNeed free slots, the function does
   that expansion. If not, nothing is done beyond the possible initial root
   thread reclamation.

   If any argument is negative, the behavior is undefined. */
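// Capacity growth sketch (per the loop below): starting from the current
// __kmp_threads_capacity, the capacity doubles until it covers
// capacity + nNeed, clipping at __kmp_sys_max_nth; e.g. capacity 64 with
// nNeed 100 grows 64 -> 128 -> 256. Both __kmp_threads and __kmp_root live in
// a single allocation, and the old __kmp_threads array is kept on
// __kmp_old_threads_list so in-flight readers of the old pointer stay valid
// until library shutdown.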
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
  //    __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  //    roots. This may cause a smaller # of threads to be allocated at
  //    subsequent parallel regions, but the worker threads hang around (and
  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Put old __kmp_threads array on a list. Any ongoing references to the old
  // list will be valid. This list is cleaned up at library shutdown.
  kmp_old_threads_list_t *node =
      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
  node->threads = __kmp_threads;
  node->next = __kmp_old_threads_list;
  __kmp_old_threads_list = node;

  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}

/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. The following code works around this bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may be not a real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }

  /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  // When hidden helper task is enabled, __kmp_threads is organized as follows:
  // 0: initial thread, also a regular OpenMP thread.
  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
  // regular OpenMP threads.
  if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for hidden helper thread. Slots for hidden
    // helper threads start from 1 to __kmp_hidden_helper_threads_num.
    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
                   gtid <= __kmp_hidden_helper_threads_num;
         gtid++)
      ;
    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
                 "hidden helper thread: T#%d\n",
                 gtid));
  } else {
    /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by
    // initial thread. Slots for hidden helper threads should also be skipped.
    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
      gtid = 0;
    } else {
      for (gtid = __kmp_hidden_helper_threads_num + 1;
           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
        ;
    }
    KA_TRACE(
        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
    KMP_ASSERT(gtid < __kmp_threads_capacity);
  }

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  __kmp_stats_thread_ptr->startLife();
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the primary thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {

    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

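// __kmp_free_hot_teams() recursively releases the nested hot teams hanging off
// a thread, returning the number of worker slots freed (the primary thread of
// each team is not counted); it is only compiled in when nested hot teams are
// enabled.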
#if KMP_NESTED_HOT_TEAMS
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clears its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}

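// Tears down the calling root thread at normal close: any outstanding proxy or
// hidden-helper tasks are drained first, then the root's teams are freed and
// the uber thread itself is reaped via __kmp_reset_root().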
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
                            task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}

#if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must be already held
   Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result. */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];
  int r;

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  r = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return r;
}
#endif

#if KMP_DEBUG
void __kmp_task_info() {

  kmp_int32 gtid = __kmp_entry_gtid();
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *steam = this_thr->th.th_serial_team;
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
      team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG

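// __kmp_initialize_info() binds a kmp_info_t to slot `tid` of `team`: it
// caches team-wide values (nproc, primary thread, serialization level),
// creates the implicit task, wires the thread into the primary thread's
// contention group, and sets up the per-thread dispatch buffers used by
// dynamically scheduled worksharing loops.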
/* TODO optimize with one big memclr, take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible */
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as primary thread's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}

/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool unless allocating thread is
   * the main hidden helper thread. The hidden helper team should always
   * allocate new OS threads. */
  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Make sure pool thread has transitioned to waiting on own thread struct
      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
      // Thread activated in __kmp_allocate_team when increasing team size
    }

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, we'll fork a new one */
  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread. We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the primary thread is
      // in DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and primary thread has no means to inform the monitor that
      // the library has gone, because all the memory which the monitor can
      // access is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  new_thr->th.th_nt_strict = false;
  new_thr->th.th_nt_loc = NULL;
  new_thr->th.th_nt_sev = severity_fatal;
  new_thr->th.th_nt_msg = NULL;

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's primary thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
  new_thr->th.th_sleep_loc_type = flag_unset;

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  new_thr->th.th_set_nested_nth = NULL;
  new_thr->th.th_set_nested_nth_sz = 0;

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679
if (__kmp_adjust_gtid_mode) {
4680
if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681
if (TCR_4(__kmp_gtid_mode) != 2) {
4682
TCW_4(__kmp_gtid_mode, 2);
4683
}
4684
} else {
4685
if (TCR_4(__kmp_gtid_mode) != 1) {
4686
TCW_4(__kmp_gtid_mode, 1);
4687
}
4688
}
4689
}
4690
4691
#ifdef KMP_ADJUST_BLOCKTIME
4692
/* Adjust blocktime back to zero if necessary */
4693
/* Middle initialization might not have occurred yet */
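/* When the process is oversubscribed (__kmp_nth > __kmp_avail_proc),
spin-waiting only wastes cycles, so force the effective blocktime to zero
and let idle threads go to sleep right away. */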
4694
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695
if (__kmp_nth > __kmp_avail_proc) {
4696
__kmp_zero_bt = TRUE;
4697
}
4698
}
4699
#endif /* KMP_ADJUST_BLOCKTIME */
4700
4701
#if KMP_AFFINITY_SUPPORTED
4702
// Set the affinity and topology information for new thread
4703
__kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704
#endif
4705
4706
/* actually fork it and create the new worker thread */
4707
KF_TRACE(
4708
10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709
__kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710
KF_TRACE(10,
4711
("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712
4713
KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714
new_gtid));
4715
KMP_MB();
4716
return new_thr;
4717
}
4718
4719
/* Reinitialize team for reuse.
The hot team code calls this at every fork barrier, so the EPCC barrier
tests are extremely sensitive to changes in it, especially writes to the
team struct, which cause a cache invalidation in all threads.
IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724
static void __kmp_reinitialize_team(kmp_team_t *team,
4725
kmp_internal_control_t *new_icvs,
4726
ident_t *loc) {
4727
KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728
team->t.t_threads[0], team));
4729
KMP_DEBUG_ASSERT(team && new_icvs);
4730
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731
KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732
4733
KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734
// Copy ICVs to the primary thread's implicit taskdata
4735
__kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736
copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737
4738
KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739
team->t.t_threads[0], team));
4740
}
4741
4742
/* Initialize the team data structure.
4743
This assumes the t_threads and t_max_nproc are already set.
4744
Also, we don't touch the arguments */
4745
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746
kmp_internal_control_t *new_icvs,
4747
ident_t *loc) {
4748
KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749
4750
/* verify */
4751
KMP_DEBUG_ASSERT(team);
4752
KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753
KMP_DEBUG_ASSERT(team->t.t_threads);
4754
KMP_MB();
4755
4756
team->t.t_master_tid = 0; /* not needed */
4757
/* team->t.t_master_bar; not needed */
4758
team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759
team->t.t_nproc = new_nproc;
4760
4761
/* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4762
team->t.t_next_pool = NULL;
4763
/* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764
* up hot team */
4765
4766
TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767
team->t.t_invoke = NULL; /* not needed */
4768
4769
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
4770
team->t.t_sched.sched = new_icvs->sched.sched;
4771
4772
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773
team->t.t_fp_control_saved = FALSE; /* not needed */
4774
team->t.t_x87_fpu_control_word = 0; /* not needed */
4775
team->t.t_mxcsr = 0; /* not needed */
4776
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777
4778
team->t.t_construct = 0;
4779
4780
team->t.t_ordered.dt.t_value = 0;
4781
team->t.t_master_active = FALSE;
4782
4783
#ifdef KMP_DEBUG
4784
team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785
#endif
4786
#if KMP_OS_WINDOWS
4787
team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788
#endif
4789
4790
team->t.t_control_stack_top = NULL;
4791
4792
__kmp_reinitialize_team(team, new_icvs, loc);
4793
4794
KMP_MB();
4795
KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796
}
4797
4798
#if KMP_AFFINITY_SUPPORTED
4799
static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800
int first, int last, int newp) {
4801
th->th.th_first_place = first;
4802
th->th.th_last_place = last;
4803
th->th.th_new_place = newp;
4804
if (newp != th->th.th_current_place) {
4805
if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806
team->t.t_display_affinity = 1;
4807
// Copy topology information associated with the new place
4808
th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809
th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810
}
4811
}
4812
4813
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The primary thread's partition should already include its current binding.
4817
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818
// Do not partition places for the hidden helper team
4819
if (KMP_HIDDEN_HELPER_TEAM(team))
4820
return;
4821
// Copy the primary thread's place partition to the team struct
4822
kmp_info_t *master_th = team->t.t_threads[0];
4823
KMP_DEBUG_ASSERT(master_th != NULL);
4824
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825
int first_place = master_th->th.th_first_place;
4826
int last_place = master_th->th.th_last_place;
4827
int masters_place = master_th->th.th_current_place;
4828
int num_masks = __kmp_affinity.num_masks;
4829
team->t.t_first_place = first_place;
4830
team->t.t_last_place = last_place;
4831
4832
KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833
"bound to place %d partition = [%d,%d]\n",
4834
proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835
team->t.t_id, masters_place, first_place, last_place));
4836
4837
switch (proc_bind) {
4838
4839
case proc_bind_default:
4840
// Serial teams might have the proc_bind policy set to proc_bind_default.
4841
// Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842
KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843
break;
4844
4845
case proc_bind_primary: {
4846
int f;
4847
int n_th = team->t.t_nproc;
4848
for (f = 1; f < n_th; f++) {
4849
kmp_info_t *th = team->t.t_threads[f];
4850
KMP_DEBUG_ASSERT(th != NULL);
4851
__kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852
4853
KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854
"partition = [%d,%d]\n",
4855
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856
f, masters_place, first_place, last_place));
4857
}
4858
} break;
4859
4860
case proc_bind_close: {
4861
int f;
4862
int n_th = team->t.t_nproc;
4863
int n_places;
4864
if (first_place <= last_place) {
4865
n_places = last_place - first_place + 1;
4866
} else {
4867
n_places = num_masks - first_place + last_place + 1;
4868
}
4869
if (n_th <= n_places) {
4870
int place = masters_place;
4871
for (f = 1; f < n_th; f++) {
4872
kmp_info_t *th = team->t.t_threads[f];
4873
KMP_DEBUG_ASSERT(th != NULL);
4874
4875
if (place == last_place) {
4876
place = first_place;
4877
} else if (place == (num_masks - 1)) {
4878
place = 0;
4879
} else {
4880
place++;
4881
}
4882
__kmp_set_thread_place(team, th, first_place, last_place, place);
4883
4884
KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885
"partition = [%d,%d]\n",
4886
__kmp_gtid_from_thread(team->t.t_threads[f]),
4887
team->t.t_id, f, place, first_place, last_place));
4888
}
4889
} else {
4890
int S, rem, gap, s_count;
4891
S = n_th / n_places;
4892
s_count = 0;
4893
rem = n_th - (S * n_places);
4894
gap = rem > 0 ? n_places / rem : n_places;
4895
int place = masters_place;
4896
int gap_ct = gap;
4897
for (f = 0; f < n_th; f++) {
4898
kmp_info_t *th = team->t.t_threads[f];
4899
KMP_DEBUG_ASSERT(th != NULL);
4900
4901
__kmp_set_thread_place(team, th, first_place, last_place, place);
4902
s_count++;
4903
4904
if ((s_count == S) && rem && (gap_ct == gap)) {
4905
// do nothing, add an extra thread to place on next iteration
4906
} else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907
// we added an extra thread to this place; move to next place
4908
if (place == last_place) {
4909
place = first_place;
4910
} else if (place == (num_masks - 1)) {
4911
place = 0;
4912
} else {
4913
place++;
4914
}
4915
s_count = 0;
4916
gap_ct = 1;
4917
rem--;
4918
} else if (s_count == S) { // place full; don't add extra
4919
if (place == last_place) {
4920
place = first_place;
4921
} else if (place == (num_masks - 1)) {
4922
place = 0;
4923
} else {
4924
place++;
4925
}
4926
gap_ct++;
4927
s_count = 0;
4928
}
4929
4930
KA_TRACE(100,
4931
("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932
"partition = [%d,%d]\n",
4933
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934
th->th.th_new_place, first_place, last_place));
4935
}
4936
KMP_DEBUG_ASSERT(place == masters_place);
4937
}
4938
} break;
4939
4940
case proc_bind_spread: {
4941
int f;
4942
int n_th = team->t.t_nproc;
4943
int n_places;
4944
int thidx;
4945
if (first_place <= last_place) {
4946
n_places = last_place - first_place + 1;
4947
} else {
4948
n_places = num_masks - first_place + last_place + 1;
4949
}
4950
if (n_th <= n_places) {
4951
int place = -1;
4952
4953
if (n_places != num_masks) {
4954
int S = n_places / n_th;
4955
int s_count, rem, gap, gap_ct;
4956
4957
place = masters_place;
4958
rem = n_places - n_th * S;
4959
gap = rem ? n_th / rem : 1;
4960
gap_ct = gap;
4961
thidx = n_th;
4962
if (update_master_only == 1)
4963
thidx = 1;
4964
for (f = 0; f < thidx; f++) {
4965
kmp_info_t *th = team->t.t_threads[f];
4966
KMP_DEBUG_ASSERT(th != NULL);
4967
4968
int fplace = place, nplace = place;
4969
s_count = 1;
4970
while (s_count < S) {
4971
if (place == last_place) {
4972
place = first_place;
4973
} else if (place == (num_masks - 1)) {
4974
place = 0;
4975
} else {
4976
place++;
4977
}
4978
s_count++;
4979
}
4980
if (rem && (gap_ct == gap)) {
4981
if (place == last_place) {
4982
place = first_place;
4983
} else if (place == (num_masks - 1)) {
4984
place = 0;
4985
} else {
4986
place++;
4987
}
4988
rem--;
4989
gap_ct = 0;
4990
}
4991
__kmp_set_thread_place(team, th, fplace, place, nplace);
4992
gap_ct++;
4993
4994
if (place == last_place) {
4995
place = first_place;
4996
} else if (place == (num_masks - 1)) {
4997
place = 0;
4998
} else {
4999
place++;
5000
}
5001
5002
KA_TRACE(100,
5003
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004
"partition = [%d,%d], num_masks: %u\n",
5005
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006
f, th->th.th_new_place, th->th.th_first_place,
5007
th->th.th_last_place, num_masks));
5008
}
5009
} else {
5010
/* Having a uniform space of available computation places, we can create
n_th partitions of roughly n_places / n_th places each and put each
thread into the first place of its partition. */
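// For example, with n_places == 8 and n_th == 3 and the primary thread in
// place 0, spacing == (8 + 1) / 3 == 3.0, so the partitions computed below
// are [0,2], [3,5] and [6,7] (the last one is clamped to n_places - 1).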
5013
double current = static_cast<double>(masters_place);
5014
double spacing =
5015
(static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5016
int first, last;
5017
kmp_info_t *th;
5018
5019
thidx = n_th + 1;
5020
if (update_master_only == 1)
5021
thidx = 1;
5022
for (f = 0; f < thidx; f++) {
5023
first = static_cast<int>(current);
5024
last = static_cast<int>(current + spacing) - 1;
5025
KMP_DEBUG_ASSERT(last >= first);
5026
if (first >= n_places) {
5027
if (masters_place) {
5028
first -= n_places;
5029
last -= n_places;
5030
if (first == (masters_place + 1)) {
5031
KMP_DEBUG_ASSERT(f == n_th);
5032
first--;
5033
}
5034
if (last == masters_place) {
5035
KMP_DEBUG_ASSERT(f == (n_th - 1));
5036
last--;
5037
}
5038
} else {
5039
KMP_DEBUG_ASSERT(f == n_th);
5040
first = 0;
5041
last = 0;
5042
}
5043
}
5044
if (last >= n_places) {
5045
last = (n_places - 1);
5046
}
5047
place = first;
5048
current += spacing;
5049
if (f < n_th) {
5050
KMP_DEBUG_ASSERT(0 <= first);
5051
KMP_DEBUG_ASSERT(n_places > first);
5052
KMP_DEBUG_ASSERT(0 <= last);
5053
KMP_DEBUG_ASSERT(n_places > last);
5054
KMP_DEBUG_ASSERT(last_place >= first_place);
5055
th = team->t.t_threads[f];
5056
KMP_DEBUG_ASSERT(th);
5057
__kmp_set_thread_place(team, th, first, last, place);
5058
KA_TRACE(100,
5059
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060
"partition = [%d,%d], spacing = %.4f\n",
5061
__kmp_gtid_from_thread(team->t.t_threads[f]),
5062
team->t.t_id, f, th->th.th_new_place,
5063
th->th.th_first_place, th->th.th_last_place, spacing));
5064
}
5065
}
5066
}
5067
KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068
} else {
5069
int S, rem, gap, s_count;
5070
S = n_th / n_places;
5071
s_count = 0;
5072
rem = n_th - (S * n_places);
5073
gap = rem > 0 ? n_places / rem : n_places;
5074
int place = masters_place;
5075
int gap_ct = gap;
5076
thidx = n_th;
5077
if (update_master_only == 1)
5078
thidx = 1;
5079
for (f = 0; f < thidx; f++) {
5080
kmp_info_t *th = team->t.t_threads[f];
5081
KMP_DEBUG_ASSERT(th != NULL);
5082
5083
__kmp_set_thread_place(team, th, place, place, place);
5084
s_count++;
5085
5086
if ((s_count == S) && rem && (gap_ct == gap)) {
5087
// do nothing, add an extra thread to place on next iteration
5088
} else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089
// we added an extra thread to this place; move on to next place
5090
if (place == last_place) {
5091
place = first_place;
5092
} else if (place == (num_masks - 1)) {
5093
place = 0;
5094
} else {
5095
place++;
5096
}
5097
s_count = 0;
5098
gap_ct = 1;
5099
rem--;
5100
} else if (s_count == S) { // place is full; don't add extra thread
5101
if (place == last_place) {
5102
place = first_place;
5103
} else if (place == (num_masks - 1)) {
5104
place = 0;
5105
} else {
5106
place++;
5107
}
5108
gap_ct++;
5109
s_count = 0;
5110
}
5111
5112
KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113
"partition = [%d,%d]\n",
5114
__kmp_gtid_from_thread(team->t.t_threads[f]),
5115
team->t.t_id, f, th->th.th_new_place,
5116
th->th.th_first_place, th->th.th_last_place));
5117
}
5118
KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119
}
5120
} break;
5121
5122
default:
5123
break;
5124
}
5125
5126
KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127
}
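// A minimal illustrative sketch, not part of the runtime and kept under
// "#if 0" so it does not affect the build: the wrap-around "advance one
// place" step that the proc_bind_close and proc_bind_spread cases above
// repeat inline. The helper name is hypothetical.
#if 0
static int __kmp_next_place_sketch(int place, int first_place, int last_place,
                                   int num_masks) {
  // Advance to the next place, wrapping at the end of the partition and at
  // the end of the global place list, exactly as the inline logic above.
  if (place == last_place)
    return first_place;
  if (place == num_masks - 1)
    return 0;
  return place + 1;
}
#endif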
5128
5129
#endif // KMP_AFFINITY_SUPPORTED
5130
5131
/* allocate a new team data structure to use. take one off of the free pool if
5132
available */
5133
kmp_team_t *
5134
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135
#if OMPT_SUPPORT
5136
ompt_data_t ompt_parallel_data,
5137
#endif
5138
kmp_proc_bind_t new_proc_bind,
5139
kmp_internal_control_t *new_icvs,
5140
int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142
int f;
5143
kmp_team_t *team;
5144
int use_hot_team = !root->r.r_active;
5145
int level = 0;
5146
int do_place_partition = 1;
5147
5148
KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149
KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150
KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151
KMP_MB();
5152
5153
#if KMP_NESTED_HOT_TEAMS
5154
kmp_hot_team_ptr_t *hot_teams;
5155
if (master) {
5156
team = master->th.th_team;
5157
level = team->t.t_active_level;
5158
if (master->th.th_teams_microtask) { // in teams construct?
5159
if (master->th.th_teams_size.nteams > 1 &&
5160
( // #teams > 1
5161
team->t.t_pkfn ==
5162
(microtask_t)__kmp_teams_master || // inner fork of the teams
5163
master->th.th_teams_level <
5164
team->t.t_level)) { // or nested parallel inside the teams
5165
++level; // don't increment if #teams==1 or for the outer fork of the
// teams; increment otherwise
5167
}
5168
// Do not perform the place partition if inner fork of the teams
5169
// Wait until nested parallel region encountered inside teams construct
5170
if ((master->th.th_teams_size.nteams == 1 &&
5171
master->th.th_teams_level >= team->t.t_level) ||
5172
(team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173
do_place_partition = 0;
5174
}
5175
hot_teams = master->th.th_hot_teams;
5176
if (level < __kmp_hot_teams_max_level && hot_teams &&
5177
hot_teams[level].hot_team) {
5178
// hot team has already been allocated for given level
5179
use_hot_team = 1;
5180
} else {
5181
use_hot_team = 0;
5182
}
5183
} else {
5184
// check we won't access uninitialized hot_teams, just in case
5185
KMP_DEBUG_ASSERT(new_nproc == 1);
5186
}
5187
#endif
5188
// Optimization to use a "hot" team
5189
if (use_hot_team && new_nproc > 1) {
5190
KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191
#if KMP_NESTED_HOT_TEAMS
5192
team = hot_teams[level].hot_team;
5193
#else
5194
team = root->r.r_hot_team;
5195
#endif
5196
#if KMP_DEBUG
5197
if (__kmp_tasking_mode != tskm_immediate_exec) {
5198
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199
"task_team[1] = %p before reinit\n",
5200
team->t.t_task_team[0], team->t.t_task_team[1]));
5201
}
5202
#endif
5203
5204
if (team->t.t_nproc != new_nproc &&
5205
__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206
// Distributed barrier may need a resize
5207
int old_nthr = team->t.t_nproc;
5208
__kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209
}
5210
5211
// If not doing the place partition, then reset the team's proc bind
5212
// to indicate that partitioning of all threads still needs to take place
5213
if (do_place_partition == 0)
5214
team->t.t_proc_bind = proc_bind_default;
5215
// Has the number of threads changed?
5216
/* Let's assume the most common case is that the number of threads is
5217
unchanged, and put that case first. */
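// Three cases follow: (1) t_nproc == new_nproc: reuse the hot team as-is;
// (2) t_nproc > new_nproc: release the surplus threads (or park them, in
// nested hot-team mode 1); (3) t_nproc < new_nproc: wake any reserved
// threads and allocate additional workers.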
5218
if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219
KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220
// This case can mean that omp_set_num_threads() was called and the hot
5221
// team size was already reduced, so we check the special flag
5222
if (team->t.t_size_changed == -1) {
5223
team->t.t_size_changed = 1;
5224
} else {
5225
KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226
}
5227
5228
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229
kmp_r_sched_t new_sched = new_icvs->sched;
5230
// set primary thread's schedule as new run-time schedule
5231
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232
5233
__kmp_reinitialize_team(team, new_icvs,
5234
root->r.r_uber_thread->th.th_ident);
5235
5236
KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237
team->t.t_threads[0], team));
5238
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239
5240
#if KMP_AFFINITY_SUPPORTED
5241
if ((team->t.t_size_changed == 0) &&
5242
(team->t.t_proc_bind == new_proc_bind)) {
5243
if (new_proc_bind == proc_bind_spread) {
5244
if (do_place_partition) {
5245
// add flag to update only master for spread
5246
__kmp_partition_places(team, 1);
5247
}
5248
}
5249
KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250
"proc_bind = %d, partition = [%d,%d]\n",
5251
team->t.t_id, new_proc_bind, team->t.t_first_place,
5252
team->t.t_last_place));
5253
} else {
5254
if (do_place_partition) {
5255
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256
__kmp_partition_places(team);
5257
}
5258
}
5259
#else
5260
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261
#endif /* KMP_AFFINITY_SUPPORTED */
5262
} else if (team->t.t_nproc > new_nproc) {
5263
KA_TRACE(20,
5264
("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265
new_nproc));
5266
5267
team->t.t_size_changed = 1;
5268
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269
// Barrier size already reduced earlier in this function
5270
// Activate team threads via th_used_in_team
5271
__kmp_add_threads_to_team(team, new_nproc);
5272
}
5273
// When decreasing team size, threads no longer in the team should
5274
// unref task team.
5275
if (__kmp_tasking_mode != tskm_immediate_exec) {
5276
for (f = new_nproc; f < team->t.t_nproc; f++) {
5277
kmp_info_t *th = team->t.t_threads[f];
5278
KMP_DEBUG_ASSERT(th);
5279
th->th.th_task_team = NULL;
5280
}
5281
}
5282
#if KMP_NESTED_HOT_TEAMS
5283
if (__kmp_hot_teams_mode == 0) {
5284
// AC: saved number of threads should correspond to team's value in this
5285
// mode, can be bigger in mode 1, when hot team has threads in reserve
5286
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287
hot_teams[level].hot_team_nth = new_nproc;
5288
#endif // KMP_NESTED_HOT_TEAMS
5289
/* release the extra threads we don't need any more */
5290
for (f = new_nproc; f < team->t.t_nproc; f++) {
5291
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292
__kmp_free_thread(team->t.t_threads[f]);
5293
team->t.t_threads[f] = NULL;
5294
}
5295
#if KMP_NESTED_HOT_TEAMS
5296
} // (__kmp_hot_teams_mode == 0)
5297
else {
5298
// When keeping extra threads in team, switch threads to wait on own
5299
// b_go flag
5300
for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302
kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303
for (int b = 0; b < bs_last_barrier; ++b) {
5304
if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305
balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306
}
5307
KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308
}
5309
}
5310
}
5311
#endif // KMP_NESTED_HOT_TEAMS
5312
team->t.t_nproc = new_nproc;
5313
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315
__kmp_reinitialize_team(team, new_icvs,
5316
root->r.r_uber_thread->th.th_ident);
5317
5318
// Update remaining threads
5319
for (f = 0; f < new_nproc; ++f) {
5320
team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321
}
5322
5323
// restore the current task state of the primary thread: should be the
5324
// implicit task
5325
KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326
team->t.t_threads[0], team));
5327
5328
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329
5330
#ifdef KMP_DEBUG
5331
for (f = 0; f < team->t.t_nproc; f++) {
5332
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333
team->t.t_threads[f]->th.th_team_nproc ==
5334
team->t.t_nproc);
5335
}
5336
#endif
5337
5338
if (do_place_partition) {
5339
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340
#if KMP_AFFINITY_SUPPORTED
5341
__kmp_partition_places(team);
5342
#endif
5343
}
5344
} else { // team->t.t_nproc < new_nproc
5345
5346
KA_TRACE(20,
5347
("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348
new_nproc));
5349
int old_nproc = team->t.t_nproc; // save old value; used below to update
// only the newly added threads
5350
team->t.t_size_changed = 1;
5351
5352
#if KMP_NESTED_HOT_TEAMS
5353
int avail_threads = hot_teams[level].hot_team_nth;
5354
if (new_nproc < avail_threads)
5355
avail_threads = new_nproc;
5356
kmp_info_t **other_threads = team->t.t_threads;
5357
for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358
// Adjust barrier data of reserved threads (if any) of the team
5359
// Other data will be set in __kmp_initialize_info() below.
5360
int b;
5361
kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362
for (b = 0; b < bs_last_barrier; ++b) {
5363
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365
#if USE_DEBUGGER
5366
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367
#endif
5368
}
5369
}
5370
if (hot_teams[level].hot_team_nth >= new_nproc) {
5371
// we have all needed threads in reserve, no need to allocate any
5372
// this is only possible in mode 1; we cannot have reserved threads in mode 0
5373
KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374
team->t.t_nproc = new_nproc; // just get reserved threads involved
5375
} else {
5376
// We may have some threads in reserve, but not enough;
5377
// get reserved threads involved if any.
5378
team->t.t_nproc = hot_teams[level].hot_team_nth;
5379
hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380
#endif // KMP_NESTED_HOT_TEAMS
5381
if (team->t.t_max_nproc < new_nproc) {
5382
/* reallocate larger arrays */
5383
__kmp_reallocate_team_arrays(team, new_nproc);
5384
__kmp_reinitialize_team(team, new_icvs, NULL);
5385
}
5386
5387
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5388
KMP_AFFINITY_SUPPORTED
5389
/* Temporarily set full mask for primary thread before creation of
5390
workers. The reason is that workers inherit the affinity from the
5391
primary thread, so if a lot of workers are created on a single
core quickly, they don't get a chance to set their own affinity for
5393
a long time. */
5394
kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395
#endif
5396
5397
/* allocate new threads for the hot team */
5398
for (f = team->t.t_nproc; f < new_nproc; f++) {
5399
kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400
KMP_DEBUG_ASSERT(new_worker);
5401
team->t.t_threads[f] = new_worker;
5402
5403
KA_TRACE(20,
5404
("__kmp_allocate_team: team %d init T#%d arrived: "
5405
"join=%llu, plain=%llu\n",
5406
team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408
team->t.t_bar[bs_plain_barrier].b_arrived));
5409
5410
{ // Initialize barrier data for new threads.
5411
int b;
5412
kmp_balign_t *balign = new_worker->th.th_bar;
5413
for (b = 0; b < bs_last_barrier; ++b) {
5414
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416
KMP_BARRIER_PARENT_FLAG);
5417
#if USE_DEBUGGER
5418
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419
#endif
5420
}
5421
}
5422
}
5423
5424
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5425
KMP_AFFINITY_SUPPORTED
5426
/* Restore initial primary thread's affinity mask */
5427
new_temp_affinity.restore();
5428
#endif
5429
#if KMP_NESTED_HOT_TEAMS
5430
} // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431
#endif // KMP_NESTED_HOT_TEAMS
5432
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433
// Barrier size already increased earlier in this function
5434
// Activate team threads via th_used_in_team
5435
__kmp_add_threads_to_team(team, new_nproc);
5436
}
5437
/* make sure everyone is synchronized */
// (the newly added threads are initialized below)
5439
__kmp_initialize_team(team, new_nproc, new_icvs,
5440
root->r.r_uber_thread->th.th_ident);
5441
5442
/* reinitialize the threads */
5443
KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444
for (f = 0; f < team->t.t_nproc; ++f)
5445
__kmp_initialize_info(team->t.t_threads[f], team, f,
5446
__kmp_gtid_from_tid(f, team));
5447
5448
// set th_task_state for new threads in hot team with older thread's state
5449
kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450
for (f = old_nproc; f < team->t.t_nproc; ++f)
5451
team->t.t_threads[f]->th.th_task_state = old_state;
5452
5453
#ifdef KMP_DEBUG
5454
for (f = 0; f < team->t.t_nproc; ++f) {
5455
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456
team->t.t_threads[f]->th.th_team_nproc ==
5457
team->t.t_nproc);
5458
}
5459
#endif
5460
5461
if (do_place_partition) {
5462
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463
#if KMP_AFFINITY_SUPPORTED
5464
__kmp_partition_places(team);
5465
#endif
5466
}
5467
} // Check changes in number of threads
5468
5469
if (master->th.th_teams_microtask) {
5470
for (f = 1; f < new_nproc; ++f) {
5471
// propagate teams construct specific info to workers
5472
kmp_info_t *thr = team->t.t_threads[f];
5473
thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474
thr->th.th_teams_level = master->th.th_teams_level;
5475
thr->th.th_teams_size = master->th.th_teams_size;
5476
}
5477
}
5478
#if KMP_NESTED_HOT_TEAMS
5479
if (level) {
5480
// Sync barrier state for nested hot teams, not needed for outermost hot
5481
// team.
5482
for (f = 1; f < new_nproc; ++f) {
5483
kmp_info_t *thr = team->t.t_threads[f];
5484
int b;
5485
kmp_balign_t *balign = thr->th.th_bar;
5486
for (b = 0; b < bs_last_barrier; ++b) {
5487
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489
#if USE_DEBUGGER
5490
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491
#endif
5492
}
5493
}
5494
}
5495
#endif // KMP_NESTED_HOT_TEAMS
5496
5497
/* reallocate space for arguments if necessary */
5498
__kmp_alloc_argv_entries(argc, team, TRUE);
5499
KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500
// The hot team re-uses the previous task team,
5501
// if untouched during the previous release->gather phase.
5502
5503
KF_TRACE(10, (" hot_team = %p\n", team));
5504
5505
#if KMP_DEBUG
5506
if (__kmp_tasking_mode != tskm_immediate_exec) {
5507
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508
"task_team[1] = %p after reinit\n",
5509
team->t.t_task_team[0], team->t.t_task_team[1]));
5510
}
5511
#endif
5512
5513
#if OMPT_SUPPORT
5514
__ompt_team_assign_id(team, ompt_parallel_data);
5515
#endif
5516
5517
KMP_MB();
5518
5519
return team;
5520
}
5521
5522
/* next, let's try to take one from the team pool */
5523
KMP_MB();
5524
for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525
/* TODO: consider resizing undersized teams instead of reaping them, now
5526
that we have a resizing mechanism */
5527
if (team->t.t_max_nproc >= max_nproc) {
5528
/* take this team from the team pool */
5529
__kmp_team_pool = team->t.t_next_pool;
5530
5531
if (max_nproc > 1 &&
5532
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533
if (!team->t.b) { // Allocate barrier structure
5534
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535
}
5536
}
5537
5538
/* setup the team for fresh use */
5539
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540
5541
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542
"task_team[1] %p to NULL\n",
5543
&team->t.t_task_team[0], &team->t.t_task_team[1]));
5544
team->t.t_task_team[0] = NULL;
5545
team->t.t_task_team[1] = NULL;
5546
5547
/* reallocate space for arguments if necessary */
5548
__kmp_alloc_argv_entries(argc, team, TRUE);
5549
KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550
5551
KA_TRACE(
5552
20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554
{ // Initialize barrier data.
5555
int b;
5556
for (b = 0; b < bs_last_barrier; ++b) {
5557
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558
#if USE_DEBUGGER
5559
team->t.t_bar[b].b_master_arrived = 0;
5560
team->t.t_bar[b].b_team_arrived = 0;
5561
#endif
5562
}
5563
}
5564
5565
team->t.t_proc_bind = new_proc_bind;
5566
5567
KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568
team->t.t_id));
5569
5570
#if OMPT_SUPPORT
5571
__ompt_team_assign_id(team, ompt_parallel_data);
5572
#endif
5573
5574
team->t.t_nested_nth = NULL;
5575
5576
KMP_MB();
5577
5578
return team;
5579
}
5580
5581
/* reap team if it is too small, then loop back and check the next one */
5582
// Not sure if this is wise, but it will be redone during the hot-teams
// rewrite.
5584
/* TODO: Use technique to find the right size hot-team, don't reap them */
5585
team = __kmp_reap_team(team);
5586
__kmp_team_pool = team;
5587
}
5588
5589
/* nothing available in the pool, no matter, make a new team! */
5590
KMP_MB();
5591
team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592
5593
/* and set it up */
5594
team->t.t_max_nproc = max_nproc;
5595
if (max_nproc > 1 &&
5596
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597
// Allocate barrier structure
5598
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599
}
5600
5601
/* NOTE well: for some reason, allocating one big buffer and dividing it up
seems to really hurt performance on the P4, so let's not use this. */
5603
__kmp_allocate_team_arrays(team, max_nproc);
5604
5605
KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607
5608
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609
"%p to NULL\n",
5610
&team->t.t_task_team[0], &team->t.t_task_team[1]));
5611
team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612
// memory, no need to duplicate
5613
team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614
// memory, no need to duplicate
5615
5616
if (__kmp_storage_map) {
5617
__kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618
}
5619
5620
/* allocate space for arguments */
5621
__kmp_alloc_argv_entries(argc, team, FALSE);
5622
team->t.t_argc = argc;
5623
5624
KA_TRACE(20,
5625
("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627
{ // Initialize barrier data.
5628
int b;
5629
for (b = 0; b < bs_last_barrier; ++b) {
5630
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631
#if USE_DEBUGGER
5632
team->t.t_bar[b].b_master_arrived = 0;
5633
team->t.t_bar[b].b_team_arrived = 0;
5634
#endif
5635
}
5636
}
5637
5638
team->t.t_proc_bind = new_proc_bind;
5639
5640
#if OMPT_SUPPORT
5641
__ompt_team_assign_id(team, ompt_parallel_data);
5642
team->t.ompt_serialized_team_info = NULL;
5643
#endif
5644
5645
KMP_MB();
5646
5647
team->t.t_nested_nth = NULL;
5648
5649
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650
team->t.t_id));
5651
5652
return team;
5653
}
5654
5655
/* TODO implement hot-teams at all levels */
5656
/* TODO implement lazy thread release on demand (disband request) */
5657
5658
/* free the team. return it to the team pool. release all the threads
5659
* associated with it */
5660
void __kmp_free_team(kmp_root_t *root,
5661
kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662
int f;
5663
KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664
team->t.t_id));
5665
5666
/* verify state */
5667
KMP_DEBUG_ASSERT(root);
5668
KMP_DEBUG_ASSERT(team);
5669
KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670
KMP_DEBUG_ASSERT(team->t.t_threads);
5671
5672
int use_hot_team = team == root->r.r_hot_team;
5673
#if KMP_NESTED_HOT_TEAMS
5674
int level;
5675
if (master) {
5676
level = team->t.t_active_level - 1;
5677
if (master->th.th_teams_microtask) { // in teams construct?
5678
if (master->th.th_teams_size.nteams > 1) {
5679
++level; // level was not increased in teams construct for
5680
// team_of_masters
5681
}
5682
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683
master->th.th_teams_level == team->t.t_level) {
5684
++level; // level was not increased in teams construct for
5685
// team_of_workers before the parallel
5686
} // team->t.t_level will be increased inside parallel
5687
}
5688
#if KMP_DEBUG
5689
kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690
#endif
5691
if (level < __kmp_hot_teams_max_level) {
5692
KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693
use_hot_team = 1;
5694
}
5695
}
5696
#endif // KMP_NESTED_HOT_TEAMS
5697
5698
/* team is done working */
5699
TCW_SYNC_PTR(team->t.t_pkfn,
5700
NULL); // Important for Debugging Support Library.
5701
#if KMP_OS_WINDOWS
5702
team->t.t_copyin_counter = 0; // init counter for possible reuse
5703
#endif
5704
// Do not reset pointer to parent team to NULL for hot teams.
5705
5706
/* if we are non-hot team, release our threads */
5707
if (!use_hot_team) {
5708
if (__kmp_tasking_mode != tskm_immediate_exec) {
5709
// Wait for threads to reach reapable state
5710
for (f = 1; f < team->t.t_nproc; ++f) {
5711
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712
kmp_info_t *th = team->t.t_threads[f];
5713
volatile kmp_uint32 *state = &th->th.th_reap_state;
5714
while (*state != KMP_SAFE_TO_REAP) {
5715
#if KMP_OS_WINDOWS
5716
// On Windows a thread can be killed at any time, check this
5717
DWORD ecode;
5718
if (!__kmp_is_thread_alive(th, &ecode)) {
5719
*state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720
break;
5721
}
5722
#endif
5723
// first check if thread is sleeping
5724
if (th->th.th_sleep_loc)
5725
__kmp_null_resume_wrapper(th);
5726
KMP_CPU_PAUSE();
5727
}
5728
}
5729
5730
// Delete task teams
5731
int tt_idx;
5732
for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733
kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734
if (task_team != NULL) {
5735
for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737
team->t.t_threads[f]->th.th_task_team = NULL;
5738
}
5739
KA_TRACE(
5740
20,
5741
("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742
__kmp_get_gtid(), task_team, team->t.t_id));
5743
#if KMP_NESTED_HOT_TEAMS
5744
__kmp_free_task_team(master, task_team);
5745
#endif
5746
team->t.t_task_team[tt_idx] = NULL;
5747
}
5748
}
5749
}
5750
5751
// Before clearing parent pointer, check if nested_nth list should be freed
5752
if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753
team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755
KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756
}
5757
team->t.t_nested_nth = NULL;
5758
5759
// Reset pointer to parent team only for non-hot teams.
5760
team->t.t_parent = NULL;
5761
team->t.t_level = 0;
5762
team->t.t_active_level = 0;
5763
5764
/* free the worker threads */
5765
for (f = 1; f < team->t.t_nproc; ++f) {
5766
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769
1, 2);
5770
}
5771
__kmp_free_thread(team->t.t_threads[f]);
5772
}
5773
5774
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775
if (team->t.b) {
5776
// wake up thread at old location
5777
team->t.b->go_release();
5778
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779
for (f = 1; f < team->t.t_nproc; ++f) {
5780
if (team->t.b->sleep[f].sleep) {
5781
__kmp_atomic_resume_64(
5782
team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783
(kmp_atomic_flag_64<> *)NULL);
5784
}
5785
}
5786
}
5787
// Wait for threads to be removed from team
5788
for (int f = 1; f < team->t.t_nproc; ++f) {
5789
while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790
KMP_CPU_PAUSE();
5791
}
5792
}
5793
}
5794
5795
for (f = 1; f < team->t.t_nproc; ++f) {
5796
team->t.t_threads[f] = NULL;
5797
}
5798
5799
if (team->t.t_max_nproc > 1 &&
5800
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801
distributedBarrier::deallocate(team->t.b);
5802
team->t.b = NULL;
5803
}
5804
/* put the team back in the team pool */
5805
/* TODO limit size of team pool, call reap_team if pool too large */
5806
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807
__kmp_team_pool = (volatile kmp_team_t *)team;
5808
} else { // Check if team was created for primary threads in teams construct
5809
// See if first worker is a CG root
5810
KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811
team->t.t_threads[1]->th.th_cg_roots);
5812
if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813
// Clean up the CG root nodes on workers so that this team can be re-used
5814
for (f = 1; f < team->t.t_nproc; ++f) {
5815
kmp_info_t *thr = team->t.t_threads[f];
5816
KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817
thr->th.th_cg_roots->cg_root == thr);
5818
// Pop current CG root off list
5819
kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820
thr->th.th_cg_roots = tmp->up;
5821
KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822
" up to node %p. cg_nthreads was %d\n",
5823
thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824
int i = tmp->cg_nthreads--;
5825
if (i == 1) {
5826
__kmp_free(tmp); // free CG if we are the last thread in it
5827
}
5828
// Restore current task's thread_limit from CG root
5829
if (thr->th.th_cg_roots)
5830
thr->th.th_current_task->td_icvs.thread_limit =
5831
thr->th.th_cg_roots->cg_thread_limit;
5832
}
5833
}
5834
}
5835
5836
KMP_MB();
5837
}
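// Minimal sketch, not part of the runtime and kept under "#if 0", of the
// contention-group cleanup pattern used above and in __kmp_free_thread:
// pop the thread's CG root node, free it when the last member leaves, and
// restore the inherited thread_limit. The helper name is hypothetical.
#if 0
static void __kmp_pop_cg_root_sketch(kmp_info_t *thr) {
  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
  thr->th.th_cg_roots = tmp->up; // move up one contention-group level
  if (--tmp->cg_nthreads == 0) // last thread in this group frees the node
    __kmp_free(tmp);
  if (thr->th.th_cg_roots) // restore thread_limit from the enclosing CG
    thr->th.th_current_task->td_icvs.thread_limit =
        thr->th.th_cg_roots->cg_thread_limit;
}
#endif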
5838
5839
/* reap the team. destroy it, reclaim all its resources and free its memory */
5840
kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841
kmp_team_t *next_pool = team->t.t_next_pool;
5842
5843
KMP_DEBUG_ASSERT(team);
5844
KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845
KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846
KMP_DEBUG_ASSERT(team->t.t_threads);
5847
KMP_DEBUG_ASSERT(team->t.t_argv);
5848
5849
/* TODO clean the threads that are a part of this? */
5850
5851
/* free stuff */
5852
__kmp_free_team_arrays(team);
5853
if (team->t.t_argv != &team->t.t_inline_argv[0])
5854
__kmp_free((void *)team->t.t_argv);
5855
__kmp_free(team);
5856
5857
KMP_MB();
5858
return next_pool;
5859
}
5860
5861
// Free the thread. Don't reap it, just place it on the pool of available
5862
// threads.
5863
//
5864
// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865
// binding for the affinity mechanism to be useful.
5866
//
5867
// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868
// However, we want to avoid a potential performance problem by always
5869
// scanning through the list to find the correct point at which to insert
5870
// the thread (potential N**2 behavior). To do this we keep track of the
5871
// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872
// With single-level parallelism, threads will always be added to the tail
5873
// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874
// parallelism, all bets are off and we may need to scan through the entire
5875
// free list.
5876
//
5877
// This change also has a potentially large performance benefit, for some
5878
// applications. Previously, as threads were freed from the hot team, they
5879
// would be placed back on the free list in inverse order. If the hot team
5880
// grew back to its original size, then the freed thread would be placed
// back on the hot team in reverse order. This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
5884
//
5885
// Now, for single-level parallelism, the OMP tid is always == gtid.
5886
void __kmp_free_thread(kmp_info_t *this_th) {
5887
int gtid;
5888
kmp_info_t **scan;
5889
5890
KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891
__kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892
5893
KMP_DEBUG_ASSERT(this_th);
5894
5895
// When moving thread to pool, switch thread to wait on own b_go flag, and
5896
// uninitialized (NULL team).
5897
int b;
5898
kmp_balign_t *balign = this_th->th.th_bar;
5899
for (b = 0; b < bs_last_barrier; ++b) {
5900
if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901
balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902
balign[b].bb.team = NULL;
5903
balign[b].bb.leaf_kids = 0;
5904
}
5905
this_th->th.th_task_state = 0;
5906
this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907
5908
/* put thread back on the free pool */
5909
TCW_PTR(this_th->th.th_team, NULL);
5910
TCW_PTR(this_th->th.th_root, NULL);
5911
TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912
5913
while (this_th->th.th_cg_roots) {
5914
this_th->th.th_cg_roots->cg_nthreads--;
5915
KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916
" %p of thread %p to %d\n",
5917
this_th, this_th->th.th_cg_roots,
5918
this_th->th.th_cg_roots->cg_root,
5919
this_th->th.th_cg_roots->cg_nthreads));
5920
kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921
if (tmp->cg_root == this_th) { // Thread is a cg_root
5922
KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923
KA_TRACE(
5924
5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925
this_th->th.th_cg_roots = tmp->up;
5926
__kmp_free(tmp);
5927
} else { // Worker thread
5928
if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929
__kmp_free(tmp);
5930
}
5931
this_th->th.th_cg_roots = NULL;
5932
break;
5933
}
5934
}
5935
5936
/* If the implicit task assigned to this thread can be used by other threads
* -> multiple threads can share the data and try to free the task at
* __kmp_reap_thread at exit. This duplicate use of the task data can happen
* with higher probability when the hot team is disabled, but can occur even
* when the hot team is enabled. */
5941
__kmp_free_implicit_task(this_th);
5942
this_th->th.th_current_task = NULL;
5943
5944
// If the __kmp_thread_pool_insert_pt is already past the new insert
5945
// point, then we need to re-scan the entire list.
5946
gtid = this_th->th.th_info.ds.ds_gtid;
5947
if (__kmp_thread_pool_insert_pt != NULL) {
5948
KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949
if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950
__kmp_thread_pool_insert_pt = NULL;
5951
}
5952
}
5953
5954
// Scan down the list to find the place to insert the thread.
5955
// scan is the address of a link in the list, possibly the address of
5956
// __kmp_thread_pool itself.
5957
//
5958
// In the absence of nested parallelism, the for loop will have 0 iterations.
5959
if (__kmp_thread_pool_insert_pt != NULL) {
5960
scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961
} else {
5962
scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963
}
5964
for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965
scan = &((*scan)->th.th_next_pool))
5966
;
5967
5968
// Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969
// to its address.
5970
TCW_PTR(this_th->th.th_next_pool, *scan);
5971
__kmp_thread_pool_insert_pt = *scan = this_th;
5972
KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973
(this_th->th.th_info.ds.ds_gtid <
5974
this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975
TCW_4(this_th->th.th_in_pool, TRUE);
5976
__kmp_suspend_initialize_thread(this_th);
5977
__kmp_lock_suspend_mx(this_th);
5978
if (this_th->th.th_active == TRUE) {
5979
KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980
this_th->th.th_active_in_pool = TRUE;
5981
}
5982
#if KMP_DEBUG
5983
else {
5984
KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985
}
5986
#endif
5987
__kmp_unlock_suspend_mx(this_th);
5988
5989
TCW_4(__kmp_nth, __kmp_nth - 1);
5990
5991
#ifdef KMP_ADJUST_BLOCKTIME
5992
/* Adjust blocktime back to user setting or default if necessary */
5993
/* Middle initialization might never have occurred */
5994
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996
if (__kmp_nth <= __kmp_avail_proc) {
5997
__kmp_zero_bt = FALSE;
5998
}
5999
}
6000
#endif /* KMP_ADJUST_BLOCKTIME */
6001
6002
KMP_MB();
6003
}
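// Minimal sketch, not part of the runtime and kept under "#if 0", of the
// sorted-insert discipline __kmp_free_thread uses above: the pool is a
// singly linked list ordered by gtid, and a cached insertion point makes
// the common "append at the tail" case cheap. All names here are
// hypothetical stand-ins for __kmp_thread_pool and
// __kmp_thread_pool_insert_pt.
#if 0
struct pool_node_sketch {
  int gtid;
  struct pool_node_sketch *next;
};
static struct pool_node_sketch *pool_sketch = NULL;
static struct pool_node_sketch *insert_pt_sketch = NULL;

static void pool_insert_sketch(struct pool_node_sketch *n) {
  struct pool_node_sketch **scan = &pool_sketch;
  // Resume from the cached point only if it precedes the new gtid;
  // otherwise rescan from the head, as __kmp_free_thread does.
  if (insert_pt_sketch != NULL && insert_pt_sketch->gtid < n->gtid)
    scan = &insert_pt_sketch->next;
  while (*scan != NULL && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  *scan = n; // list stays sorted by gtid
  insert_pt_sketch = n; // remember the last insertion point
}
#endif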
6004
6005
/* ------------------------------------------------------------------------ */
6006
6007
void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008
#if OMP_PROFILING_SUPPORT
6009
ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010
// TODO: add a configuration option for time granularity
6011
if (ProfileTraceFile)
6012
llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013
#endif
6014
6015
int gtid = this_thr->th.th_info.ds.ds_gtid;
6016
/* void *stack_data;*/
6017
kmp_team_t **volatile pteam;
6018
6019
KMP_MB();
6020
KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021
6022
if (__kmp_env_consistency_check) {
6023
this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024
}
6025
6026
#if OMPD_SUPPORT
6027
if (ompd_state & OMPD_ENABLE_BP)
6028
ompd_bp_thread_begin();
6029
#endif
6030
6031
#if OMPT_SUPPORT
6032
ompt_data_t *thread_data = nullptr;
6033
if (ompt_enabled.enabled) {
6034
thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035
*thread_data = ompt_data_none;
6036
6037
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038
this_thr->th.ompt_thread_info.wait_id = 0;
6039
this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040
this_thr->th.ompt_thread_info.parallel_flags = 0;
6041
if (ompt_enabled.ompt_callback_thread_begin) {
6042
ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043
ompt_thread_worker, thread_data);
6044
}
6045
this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046
}
6047
#endif
6048
6049
/* This is the place where threads wait for work */
6050
while (!TCR_4(__kmp_global.g.g_done)) {
6051
KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052
KMP_MB();
6053
6054
/* wait for work to do */
6055
KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056
6057
/* No tid yet since not part of a team */
6058
__kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059
6060
#if OMPT_SUPPORT
6061
if (ompt_enabled.enabled) {
6062
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063
}
6064
#endif
6065
6066
pteam = &this_thr->th.th_team;
6067
6068
/* have we been allocated? */
6069
if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070
/* we were just woken up, so run our new task */
6071
if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072
int rc;
6073
KA_TRACE(20,
6074
("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075
gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076
(*pteam)->t.t_pkfn));
6077
6078
updateHWFPControl(*pteam);
6079
6080
#if OMPT_SUPPORT
6081
if (ompt_enabled.enabled) {
6082
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083
}
6084
#endif
6085
6086
rc = (*pteam)->t.t_invoke(gtid);
6087
KMP_ASSERT(rc);
6088
6089
KMP_MB();
6090
KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091
gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092
(*pteam)->t.t_pkfn));
6093
}
6094
#if OMPT_SUPPORT
6095
if (ompt_enabled.enabled) {
6096
/* no frame set while outside task */
6097
__ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098
6099
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100
}
6101
#endif
6102
/* join barrier after parallel region */
6103
__kmp_join_barrier(gtid);
6104
}
6105
}
6106
6107
#if OMPD_SUPPORT
6108
if (ompd_state & OMPD_ENABLE_BP)
6109
ompd_bp_thread_end();
6110
#endif
6111
6112
#if OMPT_SUPPORT
6113
if (ompt_enabled.ompt_callback_thread_end) {
6114
ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115
}
6116
#endif
6117
6118
this_thr->th.th_task_team = NULL;
6119
/* run the destructors for the threadprivate data for this thread */
6120
__kmp_common_destroy_gtid(gtid);
6121
6122
KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123
KMP_MB();
6124
6125
#if OMP_PROFILING_SUPPORT
6126
llvm::timeTraceProfilerFinishThread();
6127
#endif
6128
return this_thr;
6129
}
6130
6131
/* ------------------------------------------------------------------------ */
6132
6133
void __kmp_internal_end_dest(void *specific_gtid) {
6134
// Make sure no significant bits are lost
6135
int gtid;
6136
__kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137
6138
KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139
/* NOTE: the gtid is stored as gtid+1 in the thread-local storage
* because 0 is reserved for the nothing-stored case */
6141
6142
__kmp_internal_end_thread(gtid);
6143
}
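// Illustrative sketch, kept under "#if 0" and using hypothetical helper
// names, of the gtid+1 bias noted above: the stored value is offset by one
// so that 0 keeps its usual "nothing stored" meaning.
#if 0
static void *__kmp_encode_gtid_sketch(int gtid) {
  return (void *)(kmp_intptr_t)(gtid + 1);
}
static int __kmp_decode_gtid_sketch(void *stored) {
  int gtid;
  __kmp_type_convert((kmp_intptr_t)stored - 1, &gtid);
  return gtid;
}
#endif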
6144
6145
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146
6147
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148
__kmp_internal_end_atexit();
6149
}
6150
6151
#endif
6152
6153
/* [Windows] josh: when the atexit handler is called, there may still be more
6154
than one thread alive */
6155
void __kmp_internal_end_atexit(void) {
6156
KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157
/* [Windows]
6158
josh: ideally, we want to completely shutdown the library in this atexit
6159
handler, but stat code that depends on thread specific data for gtid fails
6160
because that data becomes unavailable at some point during the shutdown, so
6161
we call __kmp_internal_end_thread instead. We should eventually remove the
6162
dependency on __kmp_get_specific_gtid in the stat code and use
6163
__kmp_internal_end_library to cleanly shutdown the library.
6164
6165
// TODO: Can some of this comment about GVS be removed?
6166
I suspect that the offending stat code is executed when the calling thread
6167
tries to clean up a dead root thread's data structures, resulting in GVS
6168
code trying to close the GVS structures for that thread, but since the stat
6169
code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170
the calling thread is cleaning up itself instead of another thread, it gets
confused. This happens because allowing a thread to unregister and clean up
another thread is a recent modification for addressing an issue.
6173
Based on the current design (20050722), a thread may end up
6174
trying to unregister another thread only if thread death does not trigger
6175
the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6176
thread specific data destructor function to detect thread death. For
6177
Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178
is nothing. Thus, the workaround is applicable only for Windows static
6179
stat library. */
6180
__kmp_internal_end_library(-1);
6181
#if KMP_OS_WINDOWS
6182
__kmp_close_console();
6183
#endif
6184
}
6185
6186
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187
// It is assumed __kmp_forkjoin_lock is acquired.
6188
6189
int gtid;
6190
6191
KMP_DEBUG_ASSERT(thread != NULL);
6192
6193
gtid = thread->th.th_info.ds.ds_gtid;
6194
6195
if (!is_root) {
6196
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197
/* Assume the threads are at the fork barrier here */
6198
KA_TRACE(
6199
20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200
gtid));
6201
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202
while (
6203
!KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204
KMP_CPU_PAUSE();
6205
__kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206
} else {
6207
/* Need release fence here to prevent seg faults for tree forkjoin
6208
barrier (GEH) */
6209
kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210
thread);
6211
__kmp_release_64(&flag);
6212
}
6213
}
6214
6215
// Terminate OS thread.
6216
__kmp_reap_worker(thread);
6217
6218
// The thread was killed asynchronously. If it was actively
6219
// spinning in the thread pool, decrement the global count.
6220
//
6221
// There is a small timing hole here - if the worker thread was just waking
// up after sleeping in the pool, had reset its th_active_in_pool flag but
// had not yet decremented the global counter __kmp_thread_pool_active_nth,
// then the global counter might not get updated.
6225
//
6226
// Currently, this can only happen as the library is unloaded,
6227
// so there are no harmful side effects.
6228
if (thread->th.th_active_in_pool) {
6229
thread->th.th_active_in_pool = FALSE;
6230
KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231
KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232
}
6233
}
6234
6235
__kmp_free_implicit_task(thread);
6236
6237
// Free the fast memory for tasking
6238
#if USE_FAST_MEMORY
6239
__kmp_free_fast_memory(thread);
6240
#endif /* USE_FAST_MEMORY */
6241
6242
__kmp_suspend_uninitialize_thread(thread);
6243
6244
KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245
TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246
6247
--__kmp_all_nth;
6248
// __kmp_nth was decremented when thread is added to the pool.
6249
6250
#ifdef KMP_ADJUST_BLOCKTIME
6251
/* Adjust blocktime back to user setting or default if necessary */
6252
/* Middle initialization might never have occurred */
6253
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255
if (__kmp_nth <= __kmp_avail_proc) {
6256
__kmp_zero_bt = FALSE;
6257
}
6258
}
6259
#endif /* KMP_ADJUST_BLOCKTIME */
6260
6261
/* free the memory being used */
6262
if (__kmp_env_consistency_check) {
6263
if (thread->th.th_cons) {
6264
__kmp_free_cons_stack(thread->th.th_cons);
6265
thread->th.th_cons = NULL;
6266
}
6267
}
6268
6269
if (thread->th.th_pri_common != NULL) {
6270
__kmp_free(thread->th.th_pri_common);
6271
thread->th.th_pri_common = NULL;
6272
}
6273
6274
#if KMP_USE_BGET
6275
if (thread->th.th_local.bget_data != NULL) {
6276
__kmp_finalize_bget(thread);
6277
}
6278
#endif
6279
6280
#if KMP_AFFINITY_SUPPORTED
6281
if (thread->th.th_affin_mask != NULL) {
6282
KMP_CPU_FREE(thread->th.th_affin_mask);
6283
thread->th.th_affin_mask = NULL;
6284
}
6285
#endif /* KMP_AFFINITY_SUPPORTED */
6286
6287
#if KMP_USE_HIER_SCHED
6288
if (thread->th.th_hier_bar_data != NULL) {
6289
__kmp_free(thread->th.th_hier_bar_data);
6290
thread->th.th_hier_bar_data = NULL;
6291
}
6292
#endif
6293
6294
__kmp_reap_team(thread->th.th_serial_team);
6295
thread->th.th_serial_team = NULL;
6296
__kmp_free(thread);
6297
6298
KMP_MB();
6299
6300
} // __kmp_reap_thread
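// A hedged illustrative sketch (not the runtime's own primitives) of the
// wake-then-join shape used by __kmp_reap_thread above: flip a flag the
// worker spins on (cf. the 0 -> 3 compare-and-swap on th_used_in_team), join
// the OS thread, then keep the pool-activity counter consistent. Written with
// C++11 atomics and std::thread; all names are hypothetical.
#if 0
#include <atomic>
#include <cstdio>
#include <thread>

struct ExampleWorker {            // hypothetical stand-in for kmp_info_t
  std::atomic<int> used_in_team{0};
  std::atomic<bool> active_in_pool{true};
  std::thread os_thread;
};

static std::atomic<int> example_pool_active_nth{1};

static void example_worker_loop(ExampleWorker *w) {
  // Park until the reaper flips used_in_team from 0 to 3.
  while (w->used_in_team.load(std::memory_order_acquire) != 3)
    std::this_thread::yield();
}

static void example_reap_worker(ExampleWorker *w) {
  int expected = 0;
  while (!w->used_in_team.compare_exchange_weak(expected, 3,
                                                std::memory_order_acq_rel)) {
    expected = 0;                 // retry, like the CAS loop above
    std::this_thread::yield();
  }
  w->os_thread.join();            // terminate/join the OS thread
  if (w->active_in_pool.exchange(false))
    --example_pool_active_nth;    // keep the global pool count consistent
}

int main() {
  ExampleWorker w;
  w.os_thread = std::thread(example_worker_loop, &w);
  example_reap_worker(&w);
  std::printf("active in pool: %d\n", example_pool_active_nth.load());
}
#endif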
6301
6302
static void __kmp_itthash_clean(kmp_info_t *th) {
6303
#if USE_ITT_NOTIFY
6304
if (__kmp_itt_region_domains.count > 0) {
6305
for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306
kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307
while (bucket) {
6308
kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309
__kmp_thread_free(th, bucket);
6310
bucket = next;
6311
}
6312
}
6313
}
6314
if (__kmp_itt_barrier_domains.count > 0) {
6315
for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316
kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317
while (bucket) {
6318
kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319
__kmp_thread_free(th, bucket);
6320
bucket = next;
6321
}
6322
}
6323
}
6324
#endif
6325
}
6326
6327
static void __kmp_internal_end(void) {
6328
int i;
6329
6330
/* First, unregister the library */
6331
__kmp_unregister_library();
6332
6333
#if KMP_OS_WINDOWS
6334
/* In Win static library, we can't tell when a root actually dies, so we
6335
reclaim the data structures for any root threads that have died but not
6336
unregistered themselves, in order to shut down cleanly.
6337
In Win dynamic library we also can't tell when a thread dies. */
6338
__kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339
// dead roots
6340
#endif
6341
6342
for (i = 0; i < __kmp_threads_capacity; i++)
6343
if (__kmp_root[i])
6344
if (__kmp_root[i]->r.r_active)
6345
break;
6346
KMP_MB(); /* Flush all pending memory write invalidates. */
6347
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348
6349
if (i < __kmp_threads_capacity) {
6350
#if KMP_USE_MONITOR
6351
// 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352
KMP_MB(); /* Flush all pending memory write invalidates. */
6353
6354
// Need to check that monitor was initialized before reaping it. If we are
6355
// called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356
// __kmp_monitor will appear to contain valid data, but it is only valid in
6357
// the parent process, not the child.
6358
// New behavior (201008): instead of keying off of the flag
6359
// __kmp_init_parallel, the monitor thread creation is keyed off
6360
// of the new flag __kmp_init_monitor.
6361
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362
if (TCR_4(__kmp_init_monitor)) {
6363
__kmp_reap_monitor(&__kmp_monitor);
6364
TCW_4(__kmp_init_monitor, 0);
6365
}
6366
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368
#endif // KMP_USE_MONITOR
6369
} else {
6370
/* TODO move this to cleanup code */
6371
#ifdef KMP_DEBUG
6372
/* make sure that everything has properly ended */
6373
for (i = 0; i < __kmp_threads_capacity; i++) {
6374
if (__kmp_root[i]) {
6375
// KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6376
// there can be uber threads alive here
6377
KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378
}
6379
}
6380
#endif
6381
6382
KMP_MB();
6383
6384
// Reap the worker threads.
6385
// This is valid for now, but be careful if threads are reaped sooner.
6386
while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6387
// Get the next thread from the pool.
6388
kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389
__kmp_thread_pool = thread->th.th_next_pool;
6390
// Reap it.
6391
KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392
thread->th.th_next_pool = NULL;
6393
thread->th.th_in_pool = FALSE;
6394
__kmp_reap_thread(thread, 0);
6395
}
6396
__kmp_thread_pool_insert_pt = NULL;
6397
6398
// Reap teams.
6399
while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6400
// Get the next team from the pool.
6401
kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402
__kmp_team_pool = team->t.t_next_pool;
6403
// Reap it.
6404
team->t.t_next_pool = NULL;
6405
__kmp_reap_team(team);
6406
}
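// A minimal illustrative sketch (not part of the runtime) of the pattern used
// by the two loops above: draining an intrusive singly-linked free pool that
// is threaded through a next-pool link (th_next_pool / t_next_pool). The node
// type and helper names are hypothetical.
#if 0
#include <cstdio>
#include <cstdlib>

struct ExampleNode {
  int id;
  ExampleNode *next_pool; // intrusive link, like th_next_pool / t_next_pool
};

static ExampleNode *example_pool = nullptr;

static void example_reap(ExampleNode *n) {
  std::printf("reaping node %d\n", n->id);
  std::free(n);
}

static void example_drain_pool(void) {
  while (example_pool != nullptr) {
    ExampleNode *n = example_pool; // take the head
    example_pool = n->next_pool;   // advance before freeing
    n->next_pool = nullptr;
    example_reap(n);
  }
}

int main() {
  for (int i = 0; i < 3; ++i) {
    ExampleNode *n = static_cast<ExampleNode *>(std::malloc(sizeof(ExampleNode)));
    n->id = i;
    n->next_pool = example_pool;
    example_pool = n;
  }
  example_drain_pool();
}
#endif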
6407
6408
__kmp_reap_task_teams();
6409
6410
#if KMP_OS_UNIX
6411
// Threads that are not reaped should not access any resources since they
6412
// are going to be deallocated soon, so the shutdown sequence should wait
6413
// until all threads either exit the final spin-waiting loop or begin
6414
// sleeping after the given blocktime.
6415
for (i = 0; i < __kmp_threads_capacity; i++) {
6416
kmp_info_t *thr = __kmp_threads[i];
6417
while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418
KMP_CPU_PAUSE();
6419
}
6420
#endif
6421
6422
for (i = 0; i < __kmp_threads_capacity; ++i) {
6423
// TBD: Add some checking...
6424
// Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425
}
6426
6427
/* Make sure all threadprivate destructors get run by joining with all
6428
worker threads before resetting this flag */
6429
TCW_SYNC_4(__kmp_init_common, FALSE);
6430
6431
KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432
KMP_MB();
6433
6434
#if KMP_USE_MONITOR
6435
// See note above: One of the possible fixes for CQ138434 / CQ140126
6436
//
6437
// FIXME: push both code fragments down and CSE them?
6438
// push them into __kmp_cleanup() ?
6439
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440
if (TCR_4(__kmp_init_monitor)) {
6441
__kmp_reap_monitor(&__kmp_monitor);
6442
TCW_4(__kmp_init_monitor, 0);
6443
}
6444
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446
#endif
6447
} /* else !__kmp_global.t_active */
6448
TCW_4(__kmp_init_gtid, FALSE);
6449
KMP_MB(); /* Flush all pending memory write invalidates. */
6450
6451
__kmp_cleanup();
6452
#if OMPT_SUPPORT
6453
ompt_fini();
6454
#endif
6455
}
6456
6457
void __kmp_internal_end_library(int gtid_req) {
6458
/* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459
/* this shouldn't be a race condition because __kmp_internal_end() is the
6460
only place to clear __kmp_serial_init */
6461
/* we'll check this later too, after we get the lock */
6462
// 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463
// redundant, because the next check will work in any case.
6464
if (__kmp_global.g.g_abort) {
6465
KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466
/* TODO abort? */
6467
return;
6468
}
6469
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470
KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471
return;
6472
}
6473
6474
// If hidden helper team has been initialized, we need to deinit it
6475
if (TCR_4(__kmp_init_hidden_helper) &&
6476
!TCR_4(__kmp_hidden_helper_team_done)) {
6477
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478
// First release the main thread to let it continue its work
6479
__kmp_hidden_helper_main_thread_release();
6480
// Wait until the hidden helper team has been destroyed
6481
__kmp_hidden_helper_threads_deinitz_wait();
6482
}
6483
6484
KMP_MB(); /* Flush all pending memory write invalidates. */
6485
/* find out who we are and what we should do */
6486
{
6487
int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488
KA_TRACE(
6489
10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6490
if (gtid == KMP_GTID_SHUTDOWN) {
6491
KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492
"already shutdown\n"));
6493
return;
6494
} else if (gtid == KMP_GTID_MONITOR) {
6495
KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496
"registered, or system shutdown\n"));
6497
return;
6498
} else if (gtid == KMP_GTID_DNE) {
6499
KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500
"shutdown\n"));
6501
/* we don't know who we are, but we may still shutdown the library */
6502
} else if (KMP_UBER_GTID(gtid)) {
6503
/* unregister ourselves as an uber thread. gtid is no longer valid */
6504
if (__kmp_root[gtid]->r.r_active) {
6505
__kmp_global.g.g_abort = -1;
6506
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507
__kmp_unregister_library();
6508
KA_TRACE(10,
6509
("__kmp_internal_end_library: root still active, abort T#%d\n",
6510
gtid));
6511
return;
6512
} else {
6513
__kmp_itthash_clean(__kmp_threads[gtid]);
6514
KA_TRACE(
6515
10,
6516
("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517
__kmp_unregister_root_current_thread(gtid);
6518
}
6519
} else {
6520
/* worker threads may call this function through the atexit handler, if they
6521
* call exit() */
6522
/* For now, skip the usual subsequent processing and just dump the debug buffer.
6523
TODO: do a thorough shutdown instead */
6524
#ifdef DUMP_DEBUG_ON_EXIT
6525
if (__kmp_debug_buf)
6526
__kmp_dump_debug_buffer();
6527
#endif
6528
// An unregister-library call was added here for the shm-based Linux path;
// without it, lots of files would be left behind in /dev/shm.
// Clean up the shared memory file before exiting.
6531
__kmp_unregister_library();
6532
return;
6533
}
6534
}
6535
/* synchronize the termination process */
6536
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537
6538
/* have we already finished */
6539
if (__kmp_global.g.g_abort) {
6540
KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541
/* TODO abort? */
6542
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543
return;
6544
}
6545
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547
return;
6548
}
6549
6550
/* We need this lock to enforce mutex between this reading of
6551
__kmp_threads_capacity and the writing by __kmp_register_root.
6552
Alternatively, we can use a counter of roots that is atomically updated by
6553
__kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554
__kmp_internal_end_*. */
6555
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556
6557
/* now we can safely conduct the actual termination */
6558
__kmp_internal_end();
6559
6560
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562
6563
KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564
6565
#ifdef DUMP_DEBUG_ON_EXIT
6566
if (__kmp_debug_buf)
6567
__kmp_dump_debug_buffer();
6568
#endif
6569
6570
#if KMP_OS_WINDOWS
6571
__kmp_close_console();
6572
#endif
6573
6574
__kmp_fini_allocator();
6575
6576
} // __kmp_internal_end_library
6577
6578
void __kmp_internal_end_thread(int gtid_req) {
6579
int i;
6580
6581
/* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582
/* this shouldn't be a race condition because __kmp_internal_end() is the
6583
* only place to clear __kmp_serial_init */
6584
/* we'll check this later too, after we get the lock */
6585
// 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586
// redundant, because the next check will work in any case.
6587
if (__kmp_global.g.g_abort) {
6588
KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589
/* TODO abort? */
6590
return;
6591
}
6592
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593
KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594
return;
6595
}
6596
6597
// If hidden helper team has been initialized, we need to deinit it
6598
if (TCR_4(__kmp_init_hidden_helper) &&
6599
!TCR_4(__kmp_hidden_helper_team_done)) {
6600
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601
// First release the main thread to let it continue its work
6602
__kmp_hidden_helper_main_thread_release();
6603
// Wait until the hidden helper team has been destroyed
6604
__kmp_hidden_helper_threads_deinitz_wait();
6605
}
6606
6607
KMP_MB(); /* Flush all pending memory write invalidates. */
6608
6609
/* find out who we are and what we should do */
6610
{
6611
int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612
KA_TRACE(10,
6613
("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6614
if (gtid == KMP_GTID_SHUTDOWN) {
6615
KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616
"already shutdown\n"));
6617
return;
6618
} else if (gtid == KMP_GTID_MONITOR) {
6619
KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620
"registered, or system shutdown\n"));
6621
return;
6622
} else if (gtid == KMP_GTID_DNE) {
6623
KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624
"shutdown\n"));
6625
return;
6626
/* we don't know who we are */
6627
} else if (KMP_UBER_GTID(gtid)) {
6628
/* unregister ourselves as an uber thread. gtid is no longer valid */
6629
if (__kmp_root[gtid]->r.r_active) {
6630
__kmp_global.g.g_abort = -1;
6631
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632
KA_TRACE(10,
6633
("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634
gtid));
6635
return;
6636
} else {
6637
KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638
gtid));
6639
__kmp_unregister_root_current_thread(gtid);
6640
}
6641
} else {
6642
/* just a worker thread, let's leave */
6643
KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644
6645
if (gtid >= 0) {
6646
__kmp_threads[gtid]->th.th_task_team = NULL;
6647
}
6648
6649
KA_TRACE(10,
6650
("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651
gtid));
6652
return;
6653
}
6654
}
6655
#if KMP_DYNAMIC_LIB
6656
if (__kmp_pause_status != kmp_hard_paused)
6657
// AC: let's not shut down the dynamic library at the exit of an uber thread,
// because it is better to shut down later in the library destructor.
6659
{
6660
KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661
return;
6662
}
6663
#endif
6664
/* synchronize the termination process */
6665
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666
6667
/* have we already finished */
6668
if (__kmp_global.g.g_abort) {
6669
KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670
/* TODO abort? */
6671
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672
return;
6673
}
6674
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676
return;
6677
}
6678
6679
/* We need this lock to enforce mutex between this reading of
6680
__kmp_threads_capacity and the writing by __kmp_register_root.
6681
Alternatively, we can use a counter of roots that is atomically updated by
6682
__kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683
__kmp_internal_end_*. */
6684
6685
/* should we finish the run-time? are all siblings done? */
6686
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687
6688
for (i = 0; i < __kmp_threads_capacity; ++i) {
6689
if (KMP_UBER_GTID(i)) {
6690
KA_TRACE(
6691
10,
6692
("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695
return;
6696
}
6697
}
6698
6699
/* now we can safely conduct the actual termination */
6700
6701
__kmp_internal_end();
6702
6703
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705
6706
KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707
6708
#ifdef DUMP_DEBUG_ON_EXIT
6709
if (__kmp_debug_buf)
6710
__kmp_dump_debug_buffer();
6711
#endif
6712
} // __kmp_internal_end_thread
6713
6714
// -----------------------------------------------------------------------------
6715
// Library registration stuff.
6716
6717
static long __kmp_registration_flag = 0;
6718
// Random value used to indicate library initialization.
6719
static char *__kmp_registration_str = NULL;
6720
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6721
6722
static inline char *__kmp_reg_status_name() {
6723
/* On RHEL 3u5, if linked statically, getpid() returns different values in
each thread. If registration and unregistration happen in different threads
(omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
cannot be found, because its name will contain a different pid. */
6727
// macOS* complains about name being too long with additional getuid()
6728
#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729
return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730
(int)getuid());
6731
#else
6732
return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733
#endif
6734
} // __kmp_reg_status_name
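// A minimal illustrative sketch (not part of the runtime): building the
// registration key name with plain snprintf. Only the Unix dynamic-library
// form (pid plus uid) from the #if above is shown; buffer size and helper
// names are arbitrary choices for the example.
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static void example_reg_status_name(char *buf, size_t len) {
  snprintf(buf, len, "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), (int)getuid());
}

int main(void) {
  char name[64];
  example_reg_status_name(name, sizeof(name));
  printf("%s\n", name);
  return 0;
}
#endif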
6735
6736
#if defined(KMP_USE_SHM)
6737
bool __kmp_shm_available = false;
6738
bool __kmp_tmp_available = false;
6739
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6740
char *temp_reg_status_file_name = nullptr;
6741
#endif
6742
6743
void __kmp_register_library_startup(void) {
6744
6745
char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746
int done = 0;
6747
union {
6748
double dtime;
6749
long ltime;
6750
} time;
6751
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752
__kmp_initialize_system_tick();
6753
#endif
6754
__kmp_read_system_time(&time.dtime);
6755
__kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756
__kmp_registration_str =
6757
__kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758
__kmp_registration_flag, KMP_LIBRARY_FILE);
6759
6760
KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761
__kmp_registration_str));
6762
6763
while (!done) {
6764
6765
char *value = NULL; // Actual value of the environment variable.
6766
6767
#if defined(KMP_USE_SHM)
6768
char *shm_name = nullptr;
6769
char *data1 = nullptr;
6770
__kmp_shm_available = __kmp_detect_shm();
6771
if (__kmp_shm_available) {
6772
int fd1 = -1;
6773
shm_name = __kmp_str_format("/%s", name);
6774
int shm_preexist = 0;
6775
fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776
if ((fd1 == -1) && (errno == EEXIST)) {
6777
// file didn't open because it already exists.
6778
// try opening existing file
6779
fd1 = shm_open(shm_name, O_RDWR, 0600);
6780
if (fd1 == -1) { // file didn't open
6781
KMP_WARNING(FunctionError, "Can't open SHM");
6782
__kmp_shm_available = false;
6783
} else { // able to open existing file
6784
shm_preexist = 1;
6785
}
6786
}
6787
if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788
if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
6789
KMP_WARNING(FunctionError, "Can't set size of SHM");
6790
__kmp_shm_available = false;
6791
}
6792
}
6793
if (__kmp_shm_available) { // SHM exists, now map it
6794
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795
fd1, 0);
6796
if (data1 == MAP_FAILED) { // failed to map shared memory
6797
KMP_WARNING(FunctionError, "Can't map SHM");
6798
__kmp_shm_available = false;
6799
}
6800
}
6801
if (__kmp_shm_available) { // SHM mapped
6802
if (shm_preexist == 0) { // set data to SHM, set value
6803
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804
}
6805
// Read value from either what we just wrote or existing file.
6806
value = __kmp_str_format("%s", data1); // read value from SHM
6807
munmap(data1, SHM_SIZE);
6808
}
6809
if (fd1 != -1)
6810
close(fd1);
6811
}
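// A hedged, standalone sketch (kept inert in #if 0) of the /dev/shm claim
// protocol used just above: exclusive create with O_CREAT|O_EXCL, fall back
// to opening the existing object on EEXIST, size it, map it, then either
// publish our string or read the current owner's. The object name, value,
// and 1024-byte size are local choices for the example.
#if 0
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define EXAMPLE_SHM_SIZE 1024

int main(void) {
  const char *shm_name = "/example_registration"; // hypothetical name
  const char *my_value = "example-registration-string";
  int preexist = 0;

  int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
  if (fd == -1 && errno == EEXIST) { // somebody else created it first
    preexist = 1;
    fd = shm_open(shm_name, O_RDWR, 0600);
  }
  if (fd == -1) { perror("shm_open"); return 1; }

  if (!preexist && ftruncate(fd, EXAMPLE_SHM_SIZE) == -1) {
    perror("ftruncate"); close(fd); return 1;
  }

  char *data = (char *)mmap(0, EXAMPLE_SHM_SIZE, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

  if (!preexist)
    strncpy(data, my_value, EXAMPLE_SHM_SIZE - 1); // publish our value
  printf("registered value: %s\n", data);          // ours or the owner's

  munmap(data, EXAMPLE_SHM_SIZE);
  close(fd);
  if (!preexist)
    shm_unlink(shm_name); // clean up only if we created it
  return 0;
}
#endif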
6812
if (!__kmp_shm_available)
6813
__kmp_tmp_available = __kmp_detect_tmp();
6814
if (!__kmp_shm_available && __kmp_tmp_available) {
6815
// SHM failed to work due to an error other than that the file already
6816
// exists. Try to create a temp file under /tmp.
6817
// If /tmp isn't accessible, fall back to using environment variable.
6818
// TODO: /tmp might not always be the temporary directory. For now we will
6819
// not consider TMPDIR.
6820
int fd1 = -1;
6821
temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822
int tmp_preexist = 0;
6823
fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824
if ((fd1 == -1) && (errno == EEXIST)) {
6825
// file didn't open because it already exists.
6826
// try opening existing file
6827
fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828
if (fd1 == -1) { // file didn't open
6829
KMP_WARNING(FunctionError, "Can't open TEMP");
6830
__kmp_tmp_available = false;
6831
} else {
6832
tmp_preexist = 1;
6833
}
6834
}
6835
if (__kmp_tmp_available && tmp_preexist == 0) {
6836
// we created the /tmp file; now set its size
if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
6838
KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839
__kmp_tmp_available = false;
6840
}
6841
}
6842
if (__kmp_tmp_available) {
6843
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844
fd1, 0);
6845
if (data1 == MAP_FAILED) { // failed to map /tmp
6846
KMP_WARNING(FunctionError, "Can't map /tmp");
6847
__kmp_tmp_available = false;
6848
}
6849
}
6850
if (__kmp_tmp_available) {
6851
if (tmp_preexist == 0) { // set data to TMP, set value
6852
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853
}
6854
// Read value from either what we just wrote or existing file.
6855
value = __kmp_str_format("%s", data1); // read value from SHM
6856
munmap(data1, SHM_SIZE);
6857
}
6858
if (fd1 != -1)
6859
close(fd1);
6860
}
6861
if (!__kmp_shm_available && !__kmp_tmp_available) {
6862
// no /dev/shm and no /tmp -- fall back to environment variable
6863
// Set environment variable, but do not overwrite if it exists.
6864
__kmp_env_set(name, __kmp_registration_str, 0);
6865
// read value to see if it got set
6866
value = __kmp_env_get(name);
6867
}
6868
#else // Windows and unix with static library
6869
// Set environment variable, but do not overwrite if it exists.
6870
__kmp_env_set(name, __kmp_registration_str, 0);
6871
// read value to see if it got set
6872
value = __kmp_env_get(name);
6873
#endif
6874
6875
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876
done = 1; // Ok, environment variable set successfully, exit the loop.
6877
} else {
6878
// Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879
// Check whether it is alive or dead.
6880
int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881
char *tail = value;
6882
char *flag_addr_str = NULL;
6883
char *flag_val_str = NULL;
6884
char const *file_name = NULL;
6885
__kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886
__kmp_str_split(tail, '-', &flag_val_str, &tail);
6887
file_name = tail;
6888
if (tail != NULL) {
6889
unsigned long *flag_addr = 0;
6890
unsigned long flag_val = 0;
6891
KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892
KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893
if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894
// First, check whether environment-encoded address is mapped into
6895
// addr space.
6896
// If so, dereference it to see if it still has the right value.
6897
if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898
neighbor = 1;
6899
} else {
6900
// If not, then we know the other copy of the library is no longer
6901
// running.
6902
neighbor = 2;
6903
}
6904
}
6905
}
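// A minimal illustrative sketch (not part of the runtime) of the registration
// string round trip handled above: format "<flag address>-<flag value in
// hex>-<library file>", then split and sscanf it back and compare. The real
// code additionally probes whether the address is still mapped
// (__kmp_is_address_mapped), which is platform-specific and not reproduced
// here; names and values are hypothetical.
#if 0
#include <stdio.h>
#include <string.h>

static unsigned long example_flag = 0xCAFE1234UL;

int main(void) {
  char value[256];
  snprintf(value, sizeof(value), "%p-%lx-%s", (void *)&example_flag,
           example_flag, "libexample.so");

  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  char file_name[128] = "";
  if (sscanf(value, "%p-%lx-%127[^\n]", &flag_addr, &flag_val, file_name) == 3 &&
      flag_addr == (void *)&example_flag && flag_val == example_flag) {
    printf("neighbor looks alive: %s\n", file_name);
  } else {
    printf("stale or unparsable registration\n");
  }
  return 0;
}
#endif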
6906
switch (neighbor) {
6907
case 0: // Cannot parse environment variable -- neighbor status unknown.
6908
// Assume it is the incompatible format of a future version of the
// library. Assume the other library is alive.
6910
// WARN( ... ); // TODO: Issue a warning.
6911
file_name = "unknown library";
6912
KMP_FALLTHROUGH();
6913
// Attention! Falling through to the next case. That's intentional.
6914
case 1: { // Neighbor is alive.
6915
// Check it is allowed.
6916
char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917
if (!__kmp_str_match_true(duplicate_ok)) {
6918
// That's not allowed. Issue fatal error.
6919
__kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920
KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921
}
6922
KMP_INTERNAL_FREE(duplicate_ok);
6923
__kmp_duplicate_library_ok = 1;
6924
done = 1; // Exit the loop.
6925
} break;
6926
case 2: { // Neighbor is dead.
6927
6928
#if defined(KMP_USE_SHM)
6929
if (__kmp_shm_available) { // close shared memory.
6930
shm_unlink(shm_name); // this removes file in /dev/shm
6931
} else if (__kmp_tmp_available) {
6932
unlink(temp_reg_status_file_name); // this removes the temp file
6933
} else {
6934
// Clear the variable and try to register library again.
6935
__kmp_env_unset(name);
6936
}
6937
#else
6938
// Clear the variable and try to register library again.
6939
__kmp_env_unset(name);
6940
#endif
6941
} break;
6942
default: {
6943
KMP_DEBUG_ASSERT(0);
6944
} break;
6945
}
6946
}
6947
KMP_INTERNAL_FREE((void *)value);
6948
#if defined(KMP_USE_SHM)
6949
if (shm_name)
6950
KMP_INTERNAL_FREE((void *)shm_name);
6951
#endif
6952
} // while
6953
KMP_INTERNAL_FREE((void *)name);
6954
6955
} // func __kmp_register_library_startup
6956
6957
void __kmp_unregister_library(void) {
6958
6959
char *name = __kmp_reg_status_name();
6960
char *value = NULL;
6961
6962
#if defined(KMP_USE_SHM)
6963
char *shm_name = nullptr;
6964
int fd1;
6965
if (__kmp_shm_available) {
6966
shm_name = __kmp_str_format("/%s", name);
6967
fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968
if (fd1 != -1) { // File opened successfully
6969
char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970
if (data1 != MAP_FAILED) {
6971
value = __kmp_str_format("%s", data1); // read value from SHM
6972
munmap(data1, SHM_SIZE);
6973
}
6974
close(fd1);
6975
}
6976
} else if (__kmp_tmp_available) { // try /tmp
6977
fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978
if (fd1 != -1) { // File opened successfully
6979
char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980
if (data1 != MAP_FAILED) {
6981
value = __kmp_str_format("%s", data1); // read value from /tmp
6982
munmap(data1, SHM_SIZE);
6983
}
6984
close(fd1);
6985
}
6986
} else { // fall back to environment variable
6987
value = __kmp_env_get(name);
6988
}
6989
#else
6990
value = __kmp_env_get(name);
6991
#endif
6992
6993
KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994
KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996
// Ok, this is our variable. Delete it.
6997
#if defined(KMP_USE_SHM)
6998
if (__kmp_shm_available) {
6999
shm_unlink(shm_name); // this removes file in /dev/shm
7000
} else if (__kmp_tmp_available) {
7001
unlink(temp_reg_status_file_name); // this removes the temp file
7002
} else {
7003
__kmp_env_unset(name);
7004
}
7005
#else
7006
__kmp_env_unset(name);
7007
#endif
7008
}
7009
7010
#if defined(KMP_USE_SHM)
7011
if (shm_name)
7012
KMP_INTERNAL_FREE(shm_name);
7013
if (temp_reg_status_file_name)
7014
KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015
#endif
7016
7017
KMP_INTERNAL_FREE(__kmp_registration_str);
7018
KMP_INTERNAL_FREE(value);
7019
KMP_INTERNAL_FREE(name);
7020
7021
__kmp_registration_flag = 0;
7022
__kmp_registration_str = NULL;
7023
7024
} // __kmp_unregister_library
7025
7026
// End of Library registration stuff.
7027
// -----------------------------------------------------------------------------
7028
7029
#if KMP_MIC_SUPPORTED
7030
7031
static void __kmp_check_mic_type() {
7032
kmp_cpuid_t cpuid_state = {0};
7033
kmp_cpuid_t *cs_p = &cpuid_state;
7034
__kmp_x86_cpuid(1, 0, cs_p);
7035
// We don't support mic1 at the moment
7036
if ((cs_p->eax & 0xff0) == 0xB10) {
7037
__kmp_mic_type = mic2;
7038
} else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039
__kmp_mic_type = mic3;
7040
} else {
7041
__kmp_mic_type = non_mic;
7042
}
7043
}
7044
7045
#endif /* KMP_MIC_SUPPORTED */
7046
7047
#if KMP_HAVE_UMWAIT
7048
static void __kmp_user_level_mwait_init() {
7049
struct kmp_cpuid buf;
7050
__kmp_x86_cpuid(7, 0, &buf);
7051
__kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052
__kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053
__kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055
__kmp_umwait_enabled));
7056
}
7057
#elif KMP_HAVE_MWAIT
7058
#ifndef AT_INTELPHIUSERMWAIT
7059
// Spurious, non-existent value that should always fail to return anything.
7060
// Will be replaced with the correct value when we know that.
7061
#define AT_INTELPHIUSERMWAIT 10000
7062
#endif
7063
// getauxval() function is available in RHEL7 and SLES12. If a system with an
7064
// earlier OS is used to build the RTL, we'll use the following internal
7065
// function when the entry is not found.
7066
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7067
unsigned long getauxval(unsigned long) { return 0; }
7068
7069
static void __kmp_user_level_mwait_init() {
7070
// When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071
// use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072
// set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073
// KMP_USER_LEVEL_MWAIT was set to TRUE.
7074
if (__kmp_mic_type == mic3) {
7075
unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076
if ((res & 0x1) || __kmp_user_level_mwait) {
7077
__kmp_mwait_enabled = TRUE;
7078
if (__kmp_user_level_mwait) {
7079
KMP_INFORM(EnvMwaitWarn);
7080
}
7081
} else {
7082
__kmp_mwait_enabled = FALSE;
7083
}
7084
}
7085
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086
"__kmp_mwait_enabled = %d\n",
7087
__kmp_mic_type, __kmp_mwait_enabled));
7088
}
7089
#endif /* KMP_HAVE_UMWAIT */
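// A minimal illustrative sketch (not part of the runtime): detecting the
// WAITPKG (umwait/tpause) feature the same way as the waitpkg variant of
// __kmp_user_level_mwait_init above -- CPUID leaf 7, sub-leaf 0, ECX bit 5 --
// using GCC/Clang's <cpuid.h> helper instead of __kmp_x86_cpuid. Assumes an
// x86 target; names are hypothetical.
#if 0
#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  int waitpkg = 0;
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    waitpkg = (ecx >> 5) & 1; // ECX bit 5 = WAITPKG
  printf("waitpkg supported: %d\n", waitpkg);
  return 0;
}
#endif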
7090
7091
static void __kmp_do_serial_initialize(void) {
7092
int i, gtid;
7093
size_t size;
7094
7095
KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7096
7097
KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7098
KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7099
KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7100
KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7101
KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7102
7103
#if OMPT_SUPPORT
7104
ompt_pre_init();
7105
#endif
7106
#if OMPD_SUPPORT
7107
__kmp_env_dump();
7108
ompd_init();
7109
#endif
7110
7111
__kmp_validate_locks();
7112
7113
#if ENABLE_LIBOMPTARGET
7114
/* Initialize functions from libomptarget */
7115
__kmp_init_omptarget();
7116
#endif
7117
7118
/* Initialize internal memory allocator */
7119
__kmp_init_allocator();
7120
7121
/* Register the library startup via an environment variable or via mapped
7122
shared memory file and check to see whether another copy of the library is
7123
already registered. Since a forked child process is often terminated, we
postpone the registration until middle initialization in the child. */
7125
if (__kmp_need_register_serial)
7126
__kmp_register_library_startup();
7127
7128
/* TODO reinitialization of library */
7129
if (TCR_4(__kmp_global.g.g_done)) {
7130
KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7131
}
7132
7133
__kmp_global.g.g_abort = 0;
7134
TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7135
7136
/* initialize the locks */
7137
#if KMP_USE_ADAPTIVE_LOCKS
7138
#if KMP_DEBUG_ADAPTIVE_LOCKS
7139
__kmp_init_speculative_stats();
7140
#endif
7141
#endif
7142
#if KMP_STATS_ENABLED
7143
__kmp_stats_init();
7144
#endif
7145
__kmp_init_lock(&__kmp_global_lock);
7146
__kmp_init_queuing_lock(&__kmp_dispatch_lock);
7147
__kmp_init_lock(&__kmp_debug_lock);
7148
__kmp_init_atomic_lock(&__kmp_atomic_lock);
7149
__kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7150
__kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7151
__kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7152
__kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7153
__kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7154
__kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7155
__kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7156
__kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7157
__kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7158
__kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7159
__kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7160
__kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7161
__kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7162
__kmp_init_bootstrap_lock(&__kmp_exit_lock);
7163
#if KMP_USE_MONITOR
7164
__kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7165
#endif
7166
__kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7167
7168
/* conduct initialization and initial setup of configuration */
7169
7170
__kmp_runtime_initialize();
7171
7172
#if KMP_MIC_SUPPORTED
7173
__kmp_check_mic_type();
7174
#endif
7175
7176
// Some global variable initialization moved here from kmp_env_initialize()
7177
#ifdef KMP_DEBUG
7178
kmp_diag = 0;
7179
#endif
7180
__kmp_abort_delay = 0;
7181
7182
// From __kmp_init_dflt_team_nth()
7183
/* assume the entire machine will be used */
7184
__kmp_dflt_team_nth_ub = __kmp_xproc;
7185
if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7186
__kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7187
}
7188
if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7189
__kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7190
}
7191
__kmp_max_nth = __kmp_sys_max_nth;
7192
__kmp_cg_max_nth = __kmp_sys_max_nth;
7193
__kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7194
if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7195
__kmp_teams_max_nth = __kmp_sys_max_nth;
7196
}
7197
7198
// Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7199
// part
7200
__kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7201
#if KMP_USE_MONITOR
7202
__kmp_monitor_wakeups =
7203
KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7204
__kmp_bt_intervals =
7205
KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7206
#endif
7207
// From "KMP_LIBRARY" part of __kmp_env_initialize()
7208
__kmp_library = library_throughput;
7209
// From KMP_SCHEDULE initialization
7210
__kmp_static = kmp_sch_static_balanced;
7211
// AC: do not use analytical here, because it is non-monotonous
7212
//__kmp_guided = kmp_sch_guided_iterative_chunked;
7213
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7214
// need to repeat assignment
7215
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7216
// bit control and barrier method control parts
7217
#if KMP_FAST_REDUCTION_BARRIER
7218
#define kmp_reduction_barrier_gather_bb ((int)1)
7219
#define kmp_reduction_barrier_release_bb ((int)1)
7220
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7221
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7222
#endif // KMP_FAST_REDUCTION_BARRIER
7223
for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7224
__kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7225
__kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7226
__kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7227
__kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7228
#if KMP_FAST_REDUCTION_BARRIER
7229
if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7230
// lin_64 ): hyper,1
7231
__kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7232
__kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7233
__kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7234
__kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7235
}
7236
#endif // KMP_FAST_REDUCTION_BARRIER
7237
}
7238
#if KMP_FAST_REDUCTION_BARRIER
7239
#undef kmp_reduction_barrier_release_pat
7240
#undef kmp_reduction_barrier_gather_pat
7241
#undef kmp_reduction_barrier_release_bb
7242
#undef kmp_reduction_barrier_gather_bb
7243
#endif // KMP_FAST_REDUCTION_BARRIER
7244
#if KMP_MIC_SUPPORTED
7245
if (__kmp_mic_type == mic2) { // KNC
7246
// AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7247
__kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7248
__kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7249
1; // forkjoin release
7250
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7251
__kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252
}
7253
#if KMP_FAST_REDUCTION_BARRIER
7254
if (__kmp_mic_type == mic2) { // KNC
7255
__kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7256
__kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257
}
7258
#endif // KMP_FAST_REDUCTION_BARRIER
7259
#endif // KMP_MIC_SUPPORTED
7260
7261
// From KMP_CHECKS initialization
7262
#ifdef KMP_DEBUG
7263
__kmp_env_checks = TRUE; /* development versions have the extra checks */
7264
#else
7265
__kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7266
#endif
7267
7268
// From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7269
__kmp_foreign_tp = TRUE;
7270
7271
__kmp_global.g.g_dynamic = FALSE;
7272
__kmp_global.g.g_dynamic_mode = dynamic_default;
7273
7274
__kmp_init_nesting_mode();
7275
7276
__kmp_env_initialize(NULL);
7277
7278
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7279
__kmp_user_level_mwait_init();
7280
#endif
7281
// Print all messages in message catalog for testing purposes.
7282
#ifdef KMP_DEBUG
7283
char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7284
if (__kmp_str_match_true(val)) {
7285
kmp_str_buf_t buffer;
7286
__kmp_str_buf_init(&buffer);
7287
__kmp_i18n_dump_catalog(&buffer);
7288
__kmp_printf("%s", buffer.str);
7289
__kmp_str_buf_free(&buffer);
7290
}
7291
__kmp_env_free(&val);
7292
#endif
7293
7294
__kmp_threads_capacity =
7295
__kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7296
// Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7297
__kmp_tp_capacity = __kmp_default_tp_capacity(
7298
__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7299
7300
// If the library is shut down properly, both pools must be NULL. Just in
7301
// case, set them to NULL -- some memory may leak, but subsequent code will
7302
// work even if pools are not freed.
7303
KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7304
KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7305
KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7306
__kmp_thread_pool = NULL;
7307
__kmp_thread_pool_insert_pt = NULL;
7308
__kmp_team_pool = NULL;
7309
7310
/* Allocate all of the variable sized records */
7311
/* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7312
* expandable */
7313
/* Since allocation is cache-aligned, just add extra padding at the end */
7314
size =
7315
(sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7316
CACHE_LINE;
7317
__kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7318
__kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7319
sizeof(kmp_info_t *) * __kmp_threads_capacity);
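// A minimal illustrative sketch (not part of the runtime) of the layout set
// up just above: two pointer arrays carved out of one padded allocation, with
// the root array starting right after the threads array. The padding constant
// and type names are hypothetical stand-ins for CACHE_LINE, kmp_info_t and
// kmp_root_t.
#if 0
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct ExampleThread; // stand-in for kmp_info_t
struct ExampleRoot;   // stand-in for kmp_root_t

int main() {
  const int capacity = 8;
  const size_t pad = 64; // stand-in for CACHE_LINE
  size_t size =
      (sizeof(ExampleThread *) + sizeof(ExampleRoot *)) * capacity + pad;

  ExampleThread **threads = static_cast<ExampleThread **>(std::malloc(size));
  std::memset(threads, 0, size);
  // The root-pointer array lives immediately after the thread-pointer array.
  ExampleRoot **roots = reinterpret_cast<ExampleRoot **>(
      reinterpret_cast<char *>(threads) + sizeof(ExampleThread *) * capacity);

  std::printf("threads at %p, roots at %p\n", (void *)threads, (void *)roots);
  std::free(threads);
}
#endif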
7320
7321
/* init thread counts */
7322
KMP_DEBUG_ASSERT(__kmp_all_nth ==
7323
0); // Asserts fail if the library is reinitializing and
7324
KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7325
__kmp_all_nth = 0;
7326
__kmp_nth = 0;
7327
7328
/* setup the uber master thread and hierarchy */
7329
gtid = __kmp_register_root(TRUE);
7330
KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7331
KMP_ASSERT(KMP_UBER_GTID(gtid));
7332
KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7333
7334
KMP_MB(); /* Flush all pending memory write invalidates. */
7335
7336
__kmp_common_initialize();
7337
7338
#if KMP_OS_UNIX
7339
/* invoke the child fork handler */
7340
__kmp_register_atfork();
7341
#endif
7342
7343
#if !KMP_DYNAMIC_LIB || \
7344
((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7345
{
7346
/* Invoke the exit handler when the program finishes, only for static
7347
library and macOS* dynamic. For other dynamic libraries, we already
7348
have _fini and DllMain. */
7349
int rc = atexit(__kmp_internal_end_atexit);
7350
if (rc != 0) {
7351
__kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7352
__kmp_msg_null);
7353
}
7354
}
7355
#endif
7356
7357
#if KMP_HANDLE_SIGNALS
7358
#if KMP_OS_UNIX
7359
/* NOTE: make sure that this is called before the user installs their own
7360
signal handlers so that the user handlers are called first. this way they
7361
can return false, not call our handler, avoid terminating the library, and
7362
continue execution where they left off. */
7363
__kmp_install_signals(FALSE);
7364
#endif /* KMP_OS_UNIX */
7365
#if KMP_OS_WINDOWS
7366
__kmp_install_signals(TRUE);
7367
#endif /* KMP_OS_WINDOWS */
7368
#endif
7369
7370
/* we have finished the serial initialization */
7371
__kmp_init_counter++;
7372
7373
__kmp_init_serial = TRUE;
7374
7375
if (__kmp_version) {
7376
__kmp_print_version_1();
7377
}
7378
7379
if (__kmp_settings) {
7380
__kmp_env_print();
7381
}
7382
7383
if (__kmp_display_env || __kmp_display_env_verbose) {
7384
__kmp_env_print_2();
7385
}
7386
7387
#if OMPT_SUPPORT
7388
ompt_post_init();
7389
#endif
7390
7391
KMP_MB();
7392
7393
KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7394
}
7395
7396
void __kmp_serial_initialize(void) {
7397
if (__kmp_init_serial) {
7398
return;
7399
}
7400
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401
if (__kmp_init_serial) {
7402
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403
return;
7404
}
7405
__kmp_do_serial_initialize();
7406
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407
}
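// A hedged illustrative sketch (not part of the runtime) of the
// double-checked initialization idiom used by __kmp_serial_initialize and the
// middle/parallel variants below: an unlocked fast-path check, then a
// re-check under the lock before doing the one-time work. Written with
// standard C++11 primitives instead of the runtime's bootstrap locks and
// TCR/TCW macros; all names are hypothetical.
#if 0
#include <atomic>
#include <cstdio>
#include <mutex>

static std::atomic<bool> example_init_done{false};
static std::mutex example_init_lock;

static void example_do_initialize() { std::puts("one-time initialization"); }

static void example_initialize() {
  if (example_init_done.load(std::memory_order_acquire))
    return; // fast path, no lock taken
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (example_init_done.load(std::memory_order_relaxed))
    return; // someone else won the race
  example_do_initialize();
  example_init_done.store(true, std::memory_order_release);
}

int main() {
  example_initialize();
  example_initialize(); // second call returns immediately
}
#endif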
7408
7409
static void __kmp_do_middle_initialize(void) {
7410
int i, j;
7411
int prev_dflt_team_nth;
7412
7413
if (!__kmp_init_serial) {
7414
__kmp_do_serial_initialize();
7415
}
7416
7417
KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418
7419
if (UNLIKELY(!__kmp_need_register_serial)) {
7420
// We are in a forked child process. The registration was skipped during
7421
// serial initialization in __kmp_atfork_child handler. Do it here.
7422
__kmp_register_library_startup();
7423
}
7424
7425
// Save the previous value for the __kmp_dflt_team_nth so that
7426
// we can avoid some reinitialization if it hasn't changed.
7427
prev_dflt_team_nth = __kmp_dflt_team_nth;
7428
7429
#if KMP_AFFINITY_SUPPORTED
7430
// __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431
// number of cores on the machine.
7432
__kmp_affinity_initialize(__kmp_affinity);
7433
7434
#endif /* KMP_AFFINITY_SUPPORTED */
7435
7436
KMP_ASSERT(__kmp_xproc > 0);
7437
if (__kmp_avail_proc == 0) {
7438
__kmp_avail_proc = __kmp_xproc;
7439
}
7440
7441
// If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442
// correct them now
7443
j = 0;
7444
while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445
__kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446
__kmp_avail_proc;
7447
j++;
7448
}
7449
7450
if (__kmp_dflt_team_nth == 0) {
7451
#ifdef KMP_DFLT_NTH_CORES
7452
// Default #threads = #cores
7453
__kmp_dflt_team_nth = __kmp_ncores;
7454
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455
"__kmp_ncores (%d)\n",
7456
__kmp_dflt_team_nth));
7457
#else
7458
// Default #threads = #available OS procs
7459
__kmp_dflt_team_nth = __kmp_avail_proc;
7460
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461
"__kmp_avail_proc(%d)\n",
7462
__kmp_dflt_team_nth));
7463
#endif /* KMP_DFLT_NTH_CORES */
7464
}
7465
7466
if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467
__kmp_dflt_team_nth = KMP_MIN_NTH;
7468
}
7469
if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470
__kmp_dflt_team_nth = __kmp_sys_max_nth;
7471
}
7472
7473
if (__kmp_nesting_mode > 0)
7474
__kmp_set_nesting_mode_threads();
7475
7476
// There's no harm in continuing if the following check fails,
7477
// but it indicates an error in the previous logic.
7478
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479
7480
if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481
// Run through the __kmp_threads array and set the num threads icv for each
7482
// root thread that is currently registered with the RTL (which has not
7483
// already explicitly set its nthreads-var with a call to
7484
// omp_set_num_threads()).
7485
for (i = 0; i < __kmp_threads_capacity; i++) {
7486
kmp_info_t *thread = __kmp_threads[i];
7487
if (thread == NULL)
7488
continue;
7489
if (thread->th.th_current_task->td_icvs.nproc != 0)
7490
continue;
7491
7492
set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493
}
7494
}
7495
KA_TRACE(
7496
20,
7497
("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498
__kmp_dflt_team_nth));
7499
7500
#ifdef KMP_ADJUST_BLOCKTIME
7501
/* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7502
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504
if (__kmp_nth > __kmp_avail_proc) {
7505
__kmp_zero_bt = TRUE;
7506
}
7507
}
7508
#endif /* KMP_ADJUST_BLOCKTIME */
7509
7510
/* we have finished middle initialization */
7511
TCW_SYNC_4(__kmp_init_middle, TRUE);
7512
7513
KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514
}
7515
7516
void __kmp_middle_initialize(void) {
7517
if (__kmp_init_middle) {
7518
return;
7519
}
7520
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521
if (__kmp_init_middle) {
7522
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523
return;
7524
}
7525
__kmp_do_middle_initialize();
7526
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527
}
7528
7529
void __kmp_parallel_initialize(void) {
7530
int gtid = __kmp_entry_gtid(); // this might be a new root
7531
7532
/* synchronize parallel initialization (for sibling) */
7533
if (TCR_4(__kmp_init_parallel))
7534
return;
7535
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536
if (TCR_4(__kmp_init_parallel)) {
7537
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538
return;
7539
}
7540
7541
/* TODO reinitialization after we have already shut down */
7542
if (TCR_4(__kmp_global.g.g_done)) {
7543
KA_TRACE(
7544
10,
7545
("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546
__kmp_infinite_loop();
7547
}
7548
7549
/* jc: The lock __kmp_initz_lock is already held, so calling
7550
__kmp_serial_initialize would cause a deadlock. So we call
7551
__kmp_do_serial_initialize directly. */
7552
if (!__kmp_init_middle) {
7553
__kmp_do_middle_initialize();
7554
}
7555
__kmp_assign_root_init_mask();
7556
__kmp_resume_if_hard_paused();
7557
7558
/* begin initialization */
7559
KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560
KMP_ASSERT(KMP_UBER_GTID(gtid));
7561
7562
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563
// Save the FP control regs.
7564
// Worker threads will set theirs to these values at thread startup.
7565
__kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566
__kmp_store_mxcsr(&__kmp_init_mxcsr);
7567
__kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569
7570
#if KMP_OS_UNIX
7571
#if KMP_HANDLE_SIGNALS
7572
/* must be after __kmp_serial_initialize */
7573
__kmp_install_signals(TRUE);
7574
#endif
7575
#endif
7576
7577
__kmp_suspend_initialize();
7578
7579
#if defined(USE_LOAD_BALANCE)
7580
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581
__kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582
}
7583
#else
7584
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585
__kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586
}
7587
#endif
7588
7589
if (__kmp_version) {
7590
__kmp_print_version_2();
7591
}
7592
7593
/* we have finished parallel initialization */
7594
TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595
7596
KMP_MB();
7597
KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598
7599
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600
}
7601
7602
void __kmp_hidden_helper_initialize() {
7603
if (TCR_4(__kmp_init_hidden_helper))
7604
return;
7605
7606
// __kmp_parallel_initialize is required before we initialize hidden helper
7607
if (!TCR_4(__kmp_init_parallel))
7608
__kmp_parallel_initialize();
7609
7610
// Double check. Note that this double check should not be placed before
7611
// __kmp_parallel_initialize as it will cause a deadlock.
7612
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613
if (TCR_4(__kmp_init_hidden_helper)) {
7614
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615
return;
7616
}
7617
7618
#if KMP_AFFINITY_SUPPORTED
7619
// Initialize hidden helper affinity settings.
7620
// The above __kmp_parallel_initialize() will initialize
7621
// regular affinity (and topology) if not already done.
7622
if (!__kmp_hh_affinity.flags.initialized)
7623
__kmp_affinity_initialize(__kmp_hh_affinity);
7624
#endif
7625
7626
// Set the count of hidden helper tasks to be executed to zero
7627
KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628
7629
// Set the global variable indicating that we're initializing hidden helper
7630
// team/threads
7631
TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632
7633
// Platform independent initialization
7634
__kmp_do_initialize_hidden_helper_threads();
7635
7636
// Wait here for the finish of initialization of hidden helper teams
7637
__kmp_hidden_helper_threads_initz_wait();
7638
7639
// We have finished hidden helper initialization
7640
TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641
7642
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643
}
7644
7645
/* ------------------------------------------------------------------------ */
7646
7647
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648
kmp_team_t *team) {
7649
kmp_disp_t *dispatch;
7650
7651
KMP_MB();
7652
7653
/* none of the threads have encountered any constructs, yet. */
7654
this_thr->th.th_local.this_construct = 0;
7655
#if KMP_CACHE_MANAGE
7656
KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657
#endif /* KMP_CACHE_MANAGE */
7658
dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659
KMP_DEBUG_ASSERT(dispatch);
7660
KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661
// KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662
// this_thr->th.th_info.ds.ds_tid ] );
7663
7664
dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665
dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666
if (__kmp_env_consistency_check)
7667
__kmp_push_parallel(gtid, team->t.t_ident);
7668
7669
KMP_MB(); /* Flush all pending memory write invalidates. */
7670
}
7671
7672
void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673
kmp_team_t *team) {
7674
if (__kmp_env_consistency_check)
7675
__kmp_pop_parallel(gtid, team->t.t_ident);
7676
7677
__kmp_finish_implicit_task(this_thr);
7678
}
7679
7680
int __kmp_invoke_task_func(int gtid) {
7681
int rc;
7682
int tid = __kmp_tid_from_gtid(gtid);
7683
kmp_info_t *this_thr = __kmp_threads[gtid];
7684
kmp_team_t *team = this_thr->th.th_team;
7685
7686
__kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687
#if USE_ITT_BUILD
7688
if (__itt_stack_caller_create_ptr) {
7689
// inform ittnotify about entering user's code
7690
if (team->t.t_stack_id != NULL) {
7691
__kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692
} else {
7693
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694
__kmp_itt_stack_callee_enter(
7695
(__itt_caller)team->t.t_parent->t.t_stack_id);
7696
}
7697
}
7698
#endif /* USE_ITT_BUILD */
7699
#if INCLUDE_SSC_MARKS
7700
SSC_MARK_INVOKING();
7701
#endif
7702
7703
#if OMPT_SUPPORT
7704
void *dummy;
7705
void **exit_frame_p;
7706
ompt_data_t *my_task_data;
7707
ompt_data_t *my_parallel_data;
7708
int ompt_team_size;
7709
7710
if (ompt_enabled.enabled) {
7711
exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712
.ompt_task_info.frame.exit_frame.ptr);
7713
} else {
7714
exit_frame_p = &dummy;
7715
}
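// A minimal illustrative sketch (not part of the runtime) of the pattern used
// for exit_frame_p just above: point at either the real tool-visible slot or
// a local dummy, so the later writes around the microtask invocation need no
// "is the tool enabled?" branch. All names are hypothetical.
#if 0
#include <cstdio>

struct ExampleFrame { void *exit_frame; };

static void example_invoke(bool tool_enabled, ExampleFrame *real_frame) {
  void *dummy;
  // Real slot when a tool is watching, otherwise a throwaway local.
  void **exit_frame_p = tool_enabled ? &real_frame->exit_frame : &dummy;

  *exit_frame_p = (void *)&dummy; // non-NULL marker: "entered user code"
  if (tool_enabled)
    std::printf("tool sees exit frame %p\n", real_frame->exit_frame);
  *exit_frame_p = nullptr;        // cleared on the way out, also unconditional
}

int main() {
  ExampleFrame f = {nullptr};
  example_invoke(true, &f);
  example_invoke(false, &f);
}
#endif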
7716
7717
my_task_data =
7718
&(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719
my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720
if (ompt_enabled.ompt_callback_implicit_task) {
7721
ompt_team_size = team->t.t_nproc;
7722
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723
ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724
__kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725
OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726
}
7727
#endif
7728
7729
#if KMP_STATS_ENABLED
7730
stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731
if (previous_state == stats_state_e::TEAMS_REGION) {
7732
KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733
} else {
7734
KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735
}
7736
KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737
#endif
7738
7739
rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740
tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741
#if OMPT_SUPPORT
7742
,
7743
exit_frame_p
7744
#endif
7745
);
7746
#if OMPT_SUPPORT
7747
*exit_frame_p = NULL;
7748
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749
#endif
7750
7751
#if KMP_STATS_ENABLED
7752
if (previous_state == stats_state_e::TEAMS_REGION) {
7753
KMP_SET_THREAD_STATE(previous_state);
7754
}
7755
KMP_POP_PARTITIONED_TIMER();
7756
#endif
7757
7758
#if USE_ITT_BUILD
7759
if (__itt_stack_caller_create_ptr) {
7760
// inform ittnotify about leaving user's code
7761
if (team->t.t_stack_id != NULL) {
7762
__kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763
} else {
7764
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765
__kmp_itt_stack_callee_leave(
7766
(__itt_caller)team->t.t_parent->t.t_stack_id);
7767
}
7768
}
7769
#endif /* USE_ITT_BUILD */
7770
__kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771
7772
return rc;
7773
}
7774
7775
void __kmp_teams_master(int gtid) {
7776
// This routine is called by all primary threads in teams construct
7777
kmp_info_t *thr = __kmp_threads[gtid];
7778
kmp_team_t *team = thr->th.th_team;
7779
ident_t *loc = team->t.t_ident;
7780
thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781
KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782
KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783
KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784
__kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785
7786
// This thread is a new CG root. Set up the proper variables.
7787
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788
tmp->cg_root = thr; // Make thr the CG root
7789
// Init to thread limit stored when league primary threads were forked
7790
tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791
tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792
KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793
" cg_nthreads to 1\n",
7794
thr, tmp));
7795
tmp->up = thr->th.th_cg_roots;
7796
thr->th.th_cg_roots = tmp;
7797
7798
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7800
#if INCLUDE_SSC_MARKS
7801
SSC_MARK_FORKING();
7802
#endif
7803
__kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804
(microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805
VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806
#if INCLUDE_SSC_MARKS
7807
SSC_MARK_JOINING();
7808
#endif
7809
// If the team size was reduced from the limit, set it to the new size
7810
if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811
thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812
// AC: last parameter "1" eliminates join barrier which won't work because
7813
// worker threads are in a fork barrier waiting for more parallel regions
7814
__kmp_join_call(loc, gtid
7815
#if OMPT_SUPPORT
7816
,
7817
fork_context_intel
7818
#endif
7819
,
7820
1);
7821
}
7822
7823
int __kmp_invoke_teams_master(int gtid) {
7824
kmp_info_t *this_thr = __kmp_threads[gtid];
7825
kmp_team_t *team = this_thr->th.th_team;
7826
#if KMP_DEBUG
7827
if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828
KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829
(void *)__kmp_teams_master);
7830
#endif
7831
__kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832
#if OMPT_SUPPORT
7833
int tid = __kmp_tid_from_gtid(gtid);
7834
ompt_data_t *task_data =
7835
&team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836
ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837
if (ompt_enabled.ompt_callback_implicit_task) {
7838
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839
ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840
ompt_task_initial);
7841
OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842
}
7843
#endif
7844
__kmp_teams_master(gtid);
7845
#if OMPT_SUPPORT
7846
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847
#endif
7848
__kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849
return 1;
7850
}
7851
7852
/* this sets the requested number of threads for the next parallel region
7853
encountered by this team. since this should be enclosed in the forkjoin
7854
critical section it should avoid race conditions with asymmetrical nested
7855
parallelism */
7856
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857
kmp_info_t *thr = __kmp_threads[gtid];
7858
7859
if (num_threads > 0)
7860
thr->th.th_set_nproc = num_threads;
7861
}
7862
7863
void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864
int *num_threads_list) {
7865
kmp_info_t *thr = __kmp_threads[gtid];
7866
7867
KMP_DEBUG_ASSERT(list_length > 1);
7868
7869
if (num_threads_list[0] > 0)
7870
thr->th.th_set_nproc = num_threads_list[0];
7871
thr->th.th_set_nested_nth =
7872
(int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873
for (kmp_uint32 i = 0; i < list_length; ++i)
7874
thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875
thr->th.th_set_nested_nth_sz = list_length;
7876
}
7877
7878
void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879
const char *msg) {
7880
kmp_info_t *thr = __kmp_threads[gtid];
7881
thr->th.th_nt_strict = true;
7882
thr->th.th_nt_loc = loc;
7883
// if sev is unset make fatal
7884
if (sev == severity_warning)
7885
thr->th.th_nt_sev = sev;
7886
else
7887
thr->th.th_nt_sev = severity_fatal;
7888
// if msg is unset, use an appropriate message
7889
if (msg)
7890
thr->th.th_nt_msg = msg;
7891
else
7892
thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893
"strict num_threads clause.";
7894
}
7895
7896
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897
int num_threads) {
7898
KMP_DEBUG_ASSERT(thr);
7899
// Remember the number of threads for inner parallel regions
7900
if (!TCR_4(__kmp_init_middle))
7901
__kmp_middle_initialize(); // get internal globals calculated
7902
__kmp_assign_root_init_mask();
7903
KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905
7906
if (num_threads == 0) {
7907
if (__kmp_teams_thread_limit > 0) {
7908
num_threads = __kmp_teams_thread_limit;
7909
} else {
7910
num_threads = __kmp_avail_proc / num_teams;
7911
}
7912
// adjust num_threads w/o warning as it is not a user setting
7913
// num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914
// no thread_limit clause specified - do not change thread-limit-var ICV
7915
if (num_threads > __kmp_dflt_team_nth) {
7916
num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917
}
7918
if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919
num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920
} // prevent team size from exceeding thread-limit-var
7921
if (num_teams * num_threads > __kmp_teams_max_nth) {
7922
num_threads = __kmp_teams_max_nth / num_teams;
7923
}
7924
if (num_threads == 0) {
7925
num_threads = 1;
7926
}
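// Illustrative example (hypothetical values, not from the original source):
// with __kmp_avail_proc = 64, num_teams = 4, no KMP_TEAMS_THREAD_LIMIT,
// __kmp_dflt_team_nth = 12, thread-limit-var = 8, and __kmp_teams_max_nth
// large enough, the steps above give 64 / 4 = 16 -> 12 -> 8 threads per team.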
7927
} else {
7928
if (num_threads < 0) {
7929
__kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930
__kmp_msg_null);
7931
num_threads = 1;
7932
}
7933
// This thread will be the primary thread of the league primary threads
7934
// Store new thread limit; old limit is saved in th_cg_roots list
7935
thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936
// num_threads = min(num_threads, nthreads-var)
7937
if (num_threads > __kmp_dflt_team_nth) {
7938
num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939
}
7940
if (num_teams * num_threads > __kmp_teams_max_nth) {
7941
int new_threads = __kmp_teams_max_nth / num_teams;
7942
if (new_threads == 0) {
7943
new_threads = 1;
7944
}
7945
if (new_threads != num_threads) {
7946
if (!__kmp_reserve_warn) { // user asked for too many threads
7947
__kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948
__kmp_msg(kmp_ms_warning,
7949
KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951
}
7952
}
7953
num_threads = new_threads;
7954
}
7955
}
7956
thr->th.th_teams_size.nth = num_threads;
7957
}
7958
7959
/* this sets the requested number of teams for the teams region and/or
7960
the number of threads for the next parallel region encountered */
7961
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962
int num_threads) {
7963
kmp_info_t *thr = __kmp_threads[gtid];
7964
if (num_teams < 0) {
7965
// OpenMP specification requires requested values to be positive,
7966
// but people can send us any value, so we'd better check
7967
__kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968
__kmp_msg_null);
7969
num_teams = 1;
7970
}
7971
if (num_teams == 0) {
7972
if (__kmp_nteams > 0) {
7973
num_teams = __kmp_nteams;
7974
} else {
7975
num_teams = 1; // default number of teams is 1.
7976
}
7977
}
7978
if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7979
if (!__kmp_reserve_warn) {
7980
__kmp_reserve_warn = 1;
7981
__kmp_msg(kmp_ms_warning,
7982
KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984
}
7985
num_teams = __kmp_teams_max_nth;
7986
}
7987
// Set number of teams (number of threads in the outer "parallel" of the
7988
// teams)
7989
thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990
7991
__kmp_push_thread_limit(thr, num_teams, num_threads);
7992
}
7993
7994
/* This sets the requested number of teams for the teams region and/or
7995
the number of threads for the next parallel region encountered */
7996
void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997
int num_teams_ub, int num_threads) {
7998
kmp_info_t *thr = __kmp_threads[gtid];
7999
KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000
KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001
KMP_DEBUG_ASSERT(num_threads >= 0);
8002
8003
if (num_teams_lb > num_teams_ub) {
8004
__kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005
KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006
}
8007
8008
int num_teams = 1; // default number of teams is 1.
8009
8010
if (num_teams_lb == 0 && num_teams_ub > 0)
8011
num_teams_lb = num_teams_ub;
8012
8013
if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014
num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015
if (num_teams > __kmp_teams_max_nth) {
8016
if (!__kmp_reserve_warn) {
8017
__kmp_reserve_warn = 1;
8018
__kmp_msg(kmp_ms_warning,
8019
KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021
}
8022
num_teams = __kmp_teams_max_nth;
8023
}
8024
} else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025
num_teams = num_teams_ub;
8026
} else { // num_teams_lb <= num_teams <= num_teams_ub
8027
if (num_threads <= 0) {
8028
if (num_teams_ub > __kmp_teams_max_nth) {
8029
num_teams = num_teams_lb;
8030
} else {
8031
num_teams = num_teams_ub;
8032
}
8033
} else {
8034
num_teams = (num_threads > __kmp_teams_max_nth)
8035
? num_teams
8036
: __kmp_teams_max_nth / num_threads;
8037
if (num_teams < num_teams_lb) {
8038
num_teams = num_teams_lb;
8039
} else if (num_teams > num_teams_ub) {
8040
num_teams = num_teams_ub;
8041
}
8042
}
8043
}
8044
// Set number of teams (number of threads in the outer "parallel" of the
8045
// teams)
8046
thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047
8048
__kmp_push_thread_limit(thr, num_teams, num_threads);
8049
}
8050
8051
// Set the proc_bind var to use in the following parallel region.
8052
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053
kmp_info_t *thr = __kmp_threads[gtid];
8054
thr->th.th_set_proc_bind = proc_bind;
8055
}
8056
8057
/* Launch the worker threads into the microtask. */
8058
8059
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060
kmp_info_t *this_thr = __kmp_threads[gtid];
8061
8062
#ifdef KMP_DEBUG
8063
int f;
8064
#endif /* KMP_DEBUG */
8065
8066
KMP_DEBUG_ASSERT(team);
8067
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068
KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069
KMP_MB(); /* Flush all pending memory write invalidates. */
8070
8071
team->t.t_construct = 0; /* no single directives seen yet */
8072
team->t.t_ordered.dt.t_value =
8073
0; /* thread 0 enters the ordered section first */
8074
8075
/* Reset the identifiers on the dispatch buffer */
8076
KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077
if (team->t.t_max_nproc > 1) {
8078
int i;
8079
for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080
team->t.t_disp_buffer[i].buffer_index = i;
8081
team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082
}
8083
} else {
8084
team->t.t_disp_buffer[0].buffer_index = 0;
8085
team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086
}
8087
8088
KMP_MB(); /* Flush all pending memory write invalidates. */
8089
KMP_ASSERT(this_thr->th.th_team == team);
8090
8091
#ifdef KMP_DEBUG
8092
for (f = 0; f < team->t.t_nproc; f++) {
8093
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094
team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095
}
8096
#endif /* KMP_DEBUG */
8097
8098
/* release the worker threads so they may begin working */
8099
__kmp_fork_barrier(gtid, 0);
8100
}
8101
8102
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103
kmp_info_t *this_thr = __kmp_threads[gtid];
8104
8105
KMP_DEBUG_ASSERT(team);
8106
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107
KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108
KMP_MB(); /* Flush all pending memory write invalidates. */
8109
8110
/* Join barrier after fork */
8111
8112
#ifdef KMP_DEBUG
8113
if (__kmp_threads[gtid] &&
8114
__kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115
__kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116
__kmp_threads[gtid]);
8117
__kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118
"team->t.t_nproc=%d\n",
8119
gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120
team->t.t_nproc);
8121
__kmp_print_structure();
8122
}
8123
KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124
__kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125
#endif /* KMP_DEBUG */
8126
8127
__kmp_join_barrier(gtid); /* wait for everyone */
8128
#if OMPT_SUPPORT
8129
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130
if (ompt_enabled.enabled &&
8131
(ompt_state == ompt_state_wait_barrier_teams ||
8132
ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133
int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134
ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136
#if OMPT_OPTIONAL
8137
void *codeptr = NULL;
8138
if (KMP_MASTER_TID(ds_tid) &&
8139
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141
codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142
8143
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145
sync_kind = ompt_sync_region_barrier_teams;
8146
if (ompt_enabled.ompt_callback_sync_region_wait) {
8147
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149
}
8150
if (ompt_enabled.ompt_callback_sync_region) {
8151
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153
}
8154
#endif
8155
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157
ompt_scope_end, NULL, task_data, 0, ds_tid,
8158
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159
}
8160
}
8161
#endif
8162
8163
KMP_MB(); /* Flush all pending memory write invalidates. */
8164
KMP_ASSERT(this_thr->th.th_team == team);
8165
}
8166
8167
/* ------------------------------------------------------------------------ */
8168
8169
#ifdef USE_LOAD_BALANCE
8170
8171
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
8173
static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174
int i;
8175
int retval;
8176
kmp_team_t *hot_team;
8177
8178
if (root->r.r_active) {
8179
return 0;
8180
}
8181
hot_team = root->r.r_hot_team;
8182
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183
return hot_team->t.t_nproc - 1; // Don't count primary thread
8184
}
8185
8186
// Skip the primary thread - it is accounted for elsewhere.
8187
retval = 0;
8188
for (i = 1; i < hot_team->t.t_nproc; i++) {
8189
if (hot_team->t.t_threads[i]->th.th_active) {
8190
retval++;
8191
}
8192
}
8193
return retval;
8194
}
8195
8196
// Perform an automatic adjustment to the number of
8197
// threads used by the next parallel region.
8198
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199
int retval;
8200
int pool_active;
8201
int hot_team_active;
8202
int team_curr_active;
8203
int system_active;
8204
8205
KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206
set_nproc));
8207
KMP_DEBUG_ASSERT(root);
8208
KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209
->th.th_current_task->td_icvs.dynamic == TRUE);
8210
KMP_DEBUG_ASSERT(set_nproc > 1);
8211
8212
if (set_nproc == 1) {
8213
KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214
return 1;
8215
}
8216
8217
// Threads that are active in the thread pool, active in the hot team for this
8218
// particular root (if we are at the outer par level), and the currently
8219
// executing thread (to become the primary thread) are available to add to the
8220
// new team, but are currently contributing to the system load, and must be
8221
// accounted for.
8222
pool_active = __kmp_thread_pool_active_nth;
8223
hot_team_active = __kmp_active_hot_team_nproc(root);
8224
team_curr_active = pool_active + hot_team_active + 1;
8225
8226
// Check the system load.
8227
system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228
KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229
"hot team active = %d\n",
8230
system_active, pool_active, hot_team_active));
8231
8232
if (system_active < 0) {
8233
// There was an error reading the necessary info from /proc, so use the
8234
// thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235
// = dynamic_thread_limit, we shouldn't wind up getting back here.
8236
__kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237
KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238
8239
// Make this call behave like the thread limit algorithm.
8240
retval = __kmp_avail_proc - __kmp_nth +
8241
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242
if (retval > set_nproc) {
8243
retval = set_nproc;
8244
}
8245
if (retval < KMP_MIN_NTH) {
8246
retval = KMP_MIN_NTH;
8247
}
8248
8249
KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250
retval));
8251
return retval;
8252
}
8253
8254
// There is a slight delay in the load balance algorithm in detecting new
// running procs. The real system load at this instant should be at least as
// large as the #active omp threads that are available to add to the team.
8257
if (system_active < team_curr_active) {
8258
system_active = team_curr_active;
8259
}
8260
retval = __kmp_avail_proc - system_active + team_curr_active;
8261
if (retval > set_nproc) {
8262
retval = set_nproc;
8263
}
8264
if (retval < KMP_MIN_NTH) {
8265
retval = KMP_MIN_NTH;
8266
}
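// Illustrative example (hypothetical numbers, not from the original source):
// with __kmp_avail_proc = 8, system_active = 6 and team_curr_active = 3,
// retval = 8 - 6 + 3 = 5; the clamps above then bound the result to
// [KMP_MIN_NTH, set_nproc].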
8267
8268
KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269
return retval;
8270
} // __kmp_load_balance_nproc()
8271
8272
#endif /* USE_LOAD_BALANCE */
8273
8274
/* ------------------------------------------------------------------------ */
8275
8276
/* NOTE: this is called with the __kmp_init_lock held */
8277
void __kmp_cleanup(void) {
8278
int f;
8279
8280
KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281
8282
if (TCR_4(__kmp_init_parallel)) {
8283
#if KMP_HANDLE_SIGNALS
8284
__kmp_remove_signals();
8285
#endif
8286
TCW_4(__kmp_init_parallel, FALSE);
8287
}
8288
8289
if (TCR_4(__kmp_init_middle)) {
8290
#if KMP_AFFINITY_SUPPORTED
8291
__kmp_affinity_uninitialize();
8292
#endif /* KMP_AFFINITY_SUPPORTED */
8293
__kmp_cleanup_hierarchy();
8294
TCW_4(__kmp_init_middle, FALSE);
8295
}
8296
8297
KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298
8299
if (__kmp_init_serial) {
8300
__kmp_runtime_destroy();
8301
__kmp_init_serial = FALSE;
8302
}
8303
8304
__kmp_cleanup_threadprivate_caches();
8305
8306
for (f = 0; f < __kmp_threads_capacity; f++) {
8307
if (__kmp_root[f] != NULL) {
8308
__kmp_free(__kmp_root[f]);
8309
__kmp_root[f] = NULL;
8310
}
8311
}
8312
__kmp_free(__kmp_threads);
8313
// __kmp_threads and __kmp_root were allocated at once, as a single block, so
// there is no need to free __kmp_root separately.
8315
__kmp_threads = NULL;
8316
__kmp_root = NULL;
8317
__kmp_threads_capacity = 0;
8318
8319
// Free old __kmp_threads arrays if they exist.
8320
kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321
while (ptr) {
8322
kmp_old_threads_list_t *next = ptr->next;
8323
__kmp_free(ptr->threads);
8324
__kmp_free(ptr);
8325
ptr = next;
8326
}
8327
8328
#if KMP_USE_DYNAMIC_LOCK
8329
__kmp_cleanup_indirect_user_locks();
8330
#else
8331
__kmp_cleanup_user_locks();
8332
#endif
8333
#if OMPD_SUPPORT
8334
if (ompd_state) {
8335
__kmp_free(ompd_env_block);
8336
ompd_env_block = NULL;
8337
ompd_env_block_size = 0;
8338
}
8339
#endif
8340
8341
#if KMP_AFFINITY_SUPPORTED
8342
KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343
__kmp_cpuinfo_file = NULL;
8344
#endif /* KMP_AFFINITY_SUPPORTED */
8345
8346
#if KMP_USE_ADAPTIVE_LOCKS
8347
#if KMP_DEBUG_ADAPTIVE_LOCKS
8348
__kmp_print_speculative_stats();
8349
#endif
8350
#endif
8351
KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352
__kmp_nested_nth.nth = NULL;
8353
__kmp_nested_nth.size = 0;
8354
__kmp_nested_nth.used = 0;
8355
8356
KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357
__kmp_nested_proc_bind.bind_types = NULL;
8358
__kmp_nested_proc_bind.size = 0;
8359
__kmp_nested_proc_bind.used = 0;
8360
if (__kmp_affinity_format) {
8361
KMP_INTERNAL_FREE(__kmp_affinity_format);
8362
__kmp_affinity_format = NULL;
8363
}
8364
8365
__kmp_i18n_catclose();
8366
8367
#if KMP_USE_HIER_SCHED
8368
__kmp_hier_scheds.deallocate();
8369
#endif
8370
8371
#if KMP_STATS_ENABLED
8372
__kmp_stats_fini();
8373
#endif
8374
8375
KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376
}
8377
8378
/* ------------------------------------------------------------------------ */
8379
8380
int __kmp_ignore_mppbeg(void) {
8381
char *env;
8382
8383
if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384
if (__kmp_str_match_false(env))
8385
return FALSE;
8386
}
8387
// By default __kmpc_begin() is no-op.
8388
return TRUE;
8389
}
8390
8391
int __kmp_ignore_mppend(void) {
8392
char *env;
8393
8394
if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395
if (__kmp_str_match_false(env))
8396
return FALSE;
8397
}
8398
// By default __kmpc_end() is no-op.
8399
return TRUE;
8400
}
8401
8402
void __kmp_internal_begin(void) {
8403
int gtid;
8404
kmp_root_t *root;
8405
8406
/* this is a very important step as it will register new sibling threads
8407
and assign these new uber threads a new gtid */
8408
gtid = __kmp_entry_gtid();
8409
root = __kmp_threads[gtid]->th.th_root;
8410
KMP_ASSERT(KMP_UBER_GTID(gtid));
8411
8412
if (root->r.r_begin)
8413
return;
8414
__kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415
if (root->r.r_begin) {
8416
__kmp_release_lock(&root->r.r_begin_lock, gtid);
8417
return;
8418
}
8419
8420
root->r.r_begin = TRUE;
8421
8422
__kmp_release_lock(&root->r.r_begin_lock, gtid);
8423
}
8424
8425
/* ------------------------------------------------------------------------ */
8426
8427
void __kmp_user_set_library(enum library_type arg) {
8428
int gtid;
8429
kmp_root_t *root;
8430
kmp_info_t *thread;
8431
8432
/* first, make sure we are initialized so we can get our gtid */
8433
8434
gtid = __kmp_entry_gtid();
8435
thread = __kmp_threads[gtid];
8436
8437
root = thread->th.th_root;
8438
8439
KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440
library_serial));
8441
if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442
thread */
8443
KMP_WARNING(SetLibraryIncorrectCall);
8444
return;
8445
}
8446
8447
switch (arg) {
8448
case library_serial:
8449
thread->th.th_set_nproc = 0;
8450
set__nproc(thread, 1);
8451
break;
8452
case library_turnaround:
8453
thread->th.th_set_nproc = 0;
8454
set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455
: __kmp_dflt_team_nth_ub);
8456
break;
8457
case library_throughput:
8458
thread->th.th_set_nproc = 0;
8459
set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460
: __kmp_dflt_team_nth_ub);
8461
break;
8462
default:
8463
KMP_FATAL(UnknownLibraryType, arg);
8464
}
8465
8466
__kmp_aux_set_library(arg);
8467
}
8468
8469
void __kmp_aux_set_stacksize(size_t arg) {
8470
if (!__kmp_init_serial)
8471
__kmp_serial_initialize();
8472
8473
#if KMP_OS_DARWIN
8474
if (arg & (0x1000 - 1)) {
8475
arg &= ~(0x1000 - 1);
8476
if (arg + 0x1000) /* check for overflow if we round up */
8477
arg += 0x1000;
8478
}
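// For example (illustrative, not from the original source): on Darwin a
// request of 0x1801 bytes is rounded up to the next 0x1000-byte page
// boundary, i.e. 0x2000 bytes.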
8479
#endif
8480
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481
8482
/* only change the default stacksize before the first parallel region */
8483
if (!TCR_4(__kmp_init_parallel)) {
8484
size_t value = arg; /* argument is in bytes */
8485
8486
if (value < __kmp_sys_min_stksize)
8487
value = __kmp_sys_min_stksize;
8488
else if (value > KMP_MAX_STKSIZE)
8489
value = KMP_MAX_STKSIZE;
8490
8491
__kmp_stksize = value;
8492
8493
__kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494
}
8495
8496
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497
}
8498
8499
/* set the behaviour of the runtime library */
8500
/* TODO this can cause some odd behaviour with sibling parallelism... */
8501
void __kmp_aux_set_library(enum library_type arg) {
8502
__kmp_library = arg;
8503
8504
switch (__kmp_library) {
8505
case library_serial: {
8506
KMP_INFORM(LibraryIsSerial);
8507
} break;
8508
case library_turnaround:
8509
if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510
__kmp_use_yield = 2; // only yield when oversubscribed
8511
break;
8512
case library_throughput:
8513
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514
__kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515
break;
8516
default:
8517
KMP_FATAL(UnknownLibraryType, arg);
8518
}
8519
}
8520
8521
/* Getting team information common for all team API */
8522
// Returns NULL if not in teams construct
8523
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524
kmp_info_t *thr = __kmp_entry_thread();
8525
teams_serialized = 0;
8526
if (thr->th.th_teams_microtask) {
8527
kmp_team_t *team = thr->th.th_team;
8528
int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529
int ii = team->t.t_level;
8530
teams_serialized = team->t.t_serialized;
8531
int level = tlevel + 1;
8532
KMP_DEBUG_ASSERT(ii >= tlevel);
8533
while (ii > level) {
8534
for (teams_serialized = team->t.t_serialized;
8535
(teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536
}
8537
if (team->t.t_serialized && (!teams_serialized)) {
8538
team = team->t.t_parent;
8539
continue;
8540
}
8541
if (ii > level) {
8542
team = team->t.t_parent;
8543
ii--;
8544
}
8545
}
8546
return team;
8547
}
8548
return NULL;
8549
}
8550
8551
int __kmp_aux_get_team_num() {
8552
int serialized;
8553
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554
if (team) {
8555
if (serialized > 1) {
8556
return 0; // teams region is serialized ( 1 team of 1 thread ).
8557
} else {
8558
return team->t.t_master_tid;
8559
}
8560
}
8561
return 0;
8562
}
8563
8564
int __kmp_aux_get_num_teams() {
8565
int serialized;
8566
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567
if (team) {
8568
if (serialized > 1) {
8569
return 1;
8570
} else {
8571
return team->t.t_parent->t.t_nproc;
8572
}
8573
}
8574
return 1;
8575
}
8576
8577
/* ------------------------------------------------------------------------ */
8578
8579
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types:
 * L {thread_level} - omp_get_level()
 * n {thread_num} - omp_get_thread_num()
 * h {host} - name of host machine
 * P {process_id} - process id (integer)
 * T {thread_identifier} - native thread identifier (integer)
 * N {num_threads} - omp_get_num_threads()
 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
 * a {thread_affinity} - comma separated list of integers or integer ranges
 * (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
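// Illustrative example (hypothetical values, not from the original source):
// with the format "host=%H pid=%P thread=%0.4n of %N", a thread might print
// something like "host=node01 pid=1234 thread=0002 of 8".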
8606
8607
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the full set of valid keyword
// field types.
8610
typedef struct kmp_affinity_format_field_t {
8611
char short_name; // from spec e.g., L -> thread level
8612
const char *long_name; // from spec thread_level -> thread level
8613
char field_format; // data type for snprintf (typically 'd' or 's'
8614
// for integer or string)
8615
} kmp_affinity_format_field_t;
8616
8617
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8618
#if KMP_AFFINITY_SUPPORTED
8619
{'A', "thread_affinity", 's'},
8620
#endif
8621
{'t', "team_num", 'd'},
8622
{'T', "num_teams", 'd'},
8623
{'L', "nesting_level", 'd'},
8624
{'n', "thread_num", 'd'},
8625
{'N', "num_threads", 'd'},
8626
{'a', "ancestor_tnum", 'd'},
8627
{'H', "host", 's'},
8628
{'P', "process_id", 'd'},
8629
{'i', "native_thread_id", 'd'}};
8630
8631
// Return the number of characters it takes to hold the field
8632
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633
const char **ptr,
8634
kmp_str_buf_t *field_buffer) {
8635
int rc, format_index, field_value;
8636
const char *width_left, *width_right;
8637
bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638
static const int FORMAT_SIZE = 20;
8639
char format[FORMAT_SIZE] = {0};
8640
char absolute_short_name = 0;
8641
8642
KMP_DEBUG_ASSERT(gtid >= 0);
8643
KMP_DEBUG_ASSERT(th);
8644
KMP_DEBUG_ASSERT(**ptr == '%');
8645
KMP_DEBUG_ASSERT(field_buffer);
8646
8647
__kmp_str_buf_clear(field_buffer);
8648
8649
// Skip the initial %
8650
(*ptr)++;
8651
8652
// Check for %% first
8653
if (**ptr == '%') {
8654
__kmp_str_buf_cat(field_buffer, "%", 1);
8655
(*ptr)++; // skip over the second %
8656
return 1;
8657
}
8658
8659
// Parse field modifiers if they are present
8660
pad_zeros = false;
8661
if (**ptr == '0') {
8662
pad_zeros = true;
8663
(*ptr)++; // skip over 0
8664
}
8665
right_justify = false;
8666
if (**ptr == '.') {
8667
right_justify = true;
8668
(*ptr)++; // skip over .
8669
}
8670
// Parse width of field: [width_left, width_right)
8671
width_left = width_right = NULL;
8672
if (**ptr >= '0' && **ptr <= '9') {
8673
width_left = *ptr;
8674
SKIP_DIGITS(*ptr);
8675
width_right = *ptr;
8676
}
8677
8678
// Create the format for KMP_SNPRINTF based on flags parsed above
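// For example (illustrative, not from the original source), an input field of
// "%.8P" (right justified, width 8, integer type) yields the snprintf format
// "%8d", while "%0.8P" (zero padded as well) yields "%08d".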
8679
format_index = 0;
8680
format[format_index++] = '%';
8681
if (!right_justify)
8682
format[format_index++] = '-';
8683
if (pad_zeros)
8684
format[format_index++] = '0';
8685
if (width_left && width_right) {
8686
int i = 0;
8687
// Only allow 8 digit number widths.
8688
// This also prevents overflowing format variable
8689
while (i < 8 && width_left < width_right) {
8690
format[format_index++] = *width_left;
8691
width_left++;
8692
i++;
8693
}
8694
}
8695
8696
// Parse a name (long or short)
8697
// Canonicalize the name into absolute_short_name
8698
found_valid_name = false;
8699
parse_long_name = (**ptr == '{');
8700
if (parse_long_name)
8701
(*ptr)++; // skip initial left brace
8702
for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703
sizeof(__kmp_affinity_format_table[0]);
8704
++i) {
8705
char short_name = __kmp_affinity_format_table[i].short_name;
8706
const char *long_name = __kmp_affinity_format_table[i].long_name;
8707
char field_format = __kmp_affinity_format_table[i].field_format;
8708
if (parse_long_name) {
8709
size_t length = KMP_STRLEN(long_name);
8710
if (strncmp(*ptr, long_name, length) == 0) {
8711
found_valid_name = true;
8712
(*ptr) += length; // skip the long name
8713
}
8714
} else if (**ptr == short_name) {
8715
found_valid_name = true;
8716
(*ptr)++; // skip the short name
8717
}
8718
if (found_valid_name) {
8719
format[format_index++] = field_format;
8720
format[format_index++] = '\0';
8721
absolute_short_name = short_name;
8722
break;
8723
}
8724
}
8725
if (parse_long_name) {
8726
if (**ptr != '}') {
8727
absolute_short_name = 0;
8728
} else {
8729
(*ptr)++; // skip over the right brace
8730
}
8731
}
8732
8733
// Attempt to fill the buffer with the requested
8734
// value using snprintf within __kmp_str_buf_print()
8735
switch (absolute_short_name) {
8736
case 't':
8737
rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738
break;
8739
case 'T':
8740
rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741
break;
8742
case 'L':
8743
rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744
break;
8745
case 'n':
8746
rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747
break;
8748
case 'H': {
8749
static const int BUFFER_SIZE = 256;
8750
char buf[BUFFER_SIZE];
8751
__kmp_expand_host_name(buf, BUFFER_SIZE);
8752
rc = __kmp_str_buf_print(field_buffer, format, buf);
8753
} break;
8754
case 'P':
8755
rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756
break;
8757
case 'i':
8758
rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759
break;
8760
case 'N':
8761
rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762
break;
8763
case 'a':
8764
field_value =
8765
__kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766
rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767
break;
8768
#if KMP_AFFINITY_SUPPORTED
8769
case 'A': {
8770
kmp_str_buf_t buf;
8771
__kmp_str_buf_init(&buf);
8772
__kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773
rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774
__kmp_str_buf_free(&buf);
8775
} break;
8776
#endif
8777
default:
8778
// According to the spec, if an implementation does not have info for a field
// type, then "undefined" is printed.
8780
rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781
// Skip the field
8782
if (parse_long_name) {
8783
SKIP_TOKEN(*ptr);
8784
if (**ptr == '}')
8785
(*ptr)++;
8786
} else {
8787
(*ptr)++;
8788
}
8789
}
8790
8791
KMP_ASSERT(format_index <= FORMAT_SIZE);
8792
return rc;
8793
}
8794
8795
/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards.
 */
8801
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802
kmp_str_buf_t *buffer) {
8803
const char *parse_ptr;
8804
size_t retval;
8805
const kmp_info_t *th;
8806
kmp_str_buf_t field;
8807
8808
KMP_DEBUG_ASSERT(buffer);
8809
KMP_DEBUG_ASSERT(gtid >= 0);
8810
8811
__kmp_str_buf_init(&field);
8812
__kmp_str_buf_clear(buffer);
8813
8814
th = __kmp_threads[gtid];
8815
retval = 0;
8816
8817
// If format is NULL or zero-length string, then we use
8818
// affinity-format-var ICV
8819
parse_ptr = format;
8820
if (parse_ptr == NULL || *parse_ptr == '\0') {
8821
parse_ptr = __kmp_affinity_format;
8822
}
8823
KMP_DEBUG_ASSERT(parse_ptr);
8824
8825
while (*parse_ptr != '\0') {
8826
// Parse a field
8827
if (*parse_ptr == '%') {
8828
// Put field in the buffer
8829
int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830
__kmp_str_buf_catbuf(buffer, &field);
8831
retval += rc;
8832
} else {
8833
// Put literal character in buffer
8834
__kmp_str_buf_cat(buffer, parse_ptr, 1);
8835
retval++;
8836
parse_ptr++;
8837
}
8838
}
8839
__kmp_str_buf_free(&field);
8840
return retval;
8841
}
8842
8843
// Displays the affinity string to stdout
8844
void __kmp_aux_display_affinity(int gtid, const char *format) {
8845
kmp_str_buf_t buf;
8846
__kmp_str_buf_init(&buf);
8847
__kmp_aux_capture_affinity(gtid, format, &buf);
8848
__kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849
__kmp_str_buf_free(&buf);
8850
}
8851
8852
/* ------------------------------------------------------------------------ */
8853
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854
int blocktime = arg; /* argument is in microseconds */
8855
#if KMP_USE_MONITOR
8856
int bt_intervals;
8857
#endif
8858
kmp_int8 bt_set;
8859
8860
__kmp_save_internal_controls(thread);
8861
8862
/* Normalize and set blocktime for the teams */
8863
if (blocktime < KMP_MIN_BLOCKTIME)
8864
blocktime = KMP_MIN_BLOCKTIME;
8865
else if (blocktime > KMP_MAX_BLOCKTIME)
8866
blocktime = KMP_MAX_BLOCKTIME;
8867
8868
set__blocktime_team(thread->th.th_team, tid, blocktime);
8869
set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870
8871
#if KMP_USE_MONITOR
8872
/* Calculate and set blocktime intervals for the teams */
8873
bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874
8875
set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876
set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877
#endif
8878
8879
/* Set whether blocktime has been set to "TRUE" */
8880
bt_set = TRUE;
8881
8882
set__bt_set_team(thread->th.th_team, tid, bt_set);
8883
set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884
#if KMP_USE_MONITOR
8885
KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886
"bt_intervals=%d, monitor_updates=%d\n",
8887
__kmp_gtid_from_tid(tid, thread->th.th_team),
8888
thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889
__kmp_monitor_wakeups));
8890
#else
8891
KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892
__kmp_gtid_from_tid(tid, thread->th.th_team),
8893
thread->th.th_team->t.t_id, tid, blocktime));
8894
#endif
8895
}
8896
8897
void __kmp_aux_set_defaults(char const *str, size_t len) {
8898
if (!__kmp_init_serial) {
8899
__kmp_serial_initialize();
8900
}
8901
__kmp_env_initialize(str);
8902
8903
if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904
__kmp_env_print();
8905
}
8906
} // __kmp_aux_set_defaults
8907
8908
/* ------------------------------------------------------------------------ */
8909
/* internal fast reduction routines */
8910
8911
PACKED_REDUCTION_METHOD_T
8912
__kmp_determine_reduction_method(
8913
ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914
void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915
kmp_critical_name *lck) {
8916
8917
// Default reduction method: critical construct ( lck != NULL, like in current
// PAROPT )
// If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
// can be selected by RTL
// If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
// can be selected by RTL
// Finally, it's up to the OpenMP RTL to make a decision on which method to
// select among those generated by PAROPT.
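// Illustrative example (hypothetical, not from the original source): on an
// x86_64 Linux build with a team of 4 threads, tree and atomic reductions both
// available, and teamsize_cutoff == 4, the logic below selects
// atomic_reduce_block; with a team of 16 threads it would select
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER.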
8925
8926
PACKED_REDUCTION_METHOD_T retval;
8927
8928
int team_size;
8929
8930
KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931
8932
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8933
(loc && \
8934
((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936
8937
retval = critical_reduce_block;
8938
8939
// another way of getting the team size (with 1 dynamic dereference) is slower
8940
team_size = __kmp_get_team_num_threads(global_tid);
8941
if (team_size == 1) {
8942
8943
retval = empty_reduce_block;
8944
8945
} else {
8946
8947
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948
8949
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8950
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8951
KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952
8953
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8954
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8955
KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956
8957
int teamsize_cutoff = 4;
8958
8959
#if KMP_MIC_SUPPORTED
8960
if (__kmp_mic_type != non_mic) {
8961
teamsize_cutoff = 8;
8962
}
8963
#endif
8964
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965
if (tree_available) {
8966
if (team_size <= teamsize_cutoff) {
8967
if (atomic_available) {
8968
retval = atomic_reduce_block;
8969
}
8970
} else {
8971
retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972
}
8973
} else if (atomic_available) {
8974
retval = atomic_reduce_block;
8975
}
8976
#else
8977
#error "Unknown or unsupported OS"
8978
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979
// KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980
// KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981
8982
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8983
KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984
8985
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8986
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8987
KMP_OS_WASI || KMP_OS_AIX
8988
8989
// basic tuning
8990
8991
if (atomic_available) {
8992
if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993
retval = atomic_reduce_block;
8994
}
8995
} // otherwise: use critical section
8996
8997
#elif KMP_OS_DARWIN
8998
8999
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000
if (atomic_available && (num_vars <= 3)) {
9001
retval = atomic_reduce_block;
9002
} else if (tree_available) {
9003
if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004
(reduce_size < (2000 * sizeof(kmp_real64)))) {
9005
retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006
}
9007
} // otherwise: use critical section
9008
9009
#else
9010
#error "Unknown or unsupported OS"
9011
#endif
9012
9013
#else
9014
#error "Unknown or unsupported architecture"
9015
#endif
9016
}
9017
9018
// KMP_FORCE_REDUCTION
9019
9020
// If the team is serialized (team_size == 1), ignore the forced reduction
9021
// method and stay with the unsynchronized method (empty_reduce_block)
9022
if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023
team_size != 1) {
9024
9025
PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026
9027
int atomic_available, tree_available;
9028
9029
switch ((forced_retval = __kmp_force_reduction_method)) {
9030
case critical_reduce_block:
9031
KMP_ASSERT(lck); // lck should be != 0
9032
break;
9033
9034
case atomic_reduce_block:
9035
atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036
if (!atomic_available) {
9037
KMP_WARNING(RedMethodNotSupported, "atomic");
9038
forced_retval = critical_reduce_block;
9039
}
9040
break;
9041
9042
case tree_reduce_block:
9043
tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044
if (!tree_available) {
9045
KMP_WARNING(RedMethodNotSupported, "tree");
9046
forced_retval = critical_reduce_block;
9047
} else {
9048
#if KMP_FAST_REDUCTION_BARRIER
9049
forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050
#endif
9051
}
9052
break;
9053
9054
default:
9055
KMP_ASSERT(0); // "unsupported method specified"
9056
}
9057
9058
retval = forced_retval;
9059
}
9060
9061
KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062
9063
#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065
9066
return (retval);
9067
}
9068
// this function is for testing set/get/determine reduce method
9069
kmp_int32 __kmp_get_reduce_method(void) {
9070
return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071
}
9072
9073
// Soft pause sets up threads to ignore blocktime and just go to sleep.
9074
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076
9077
// Hard pause shuts down the runtime completely. Resume happens naturally when
9078
// OpenMP is used subsequently.
9079
void __kmp_hard_pause() {
9080
__kmp_pause_status = kmp_hard_paused;
9081
__kmp_internal_end_thread(-1);
9082
}
9083
9084
// Soft resume sets __kmp_pause_status, and wakes up all threads.
9085
void __kmp_resume_if_soft_paused() {
9086
if (__kmp_pause_status == kmp_soft_paused) {
9087
__kmp_pause_status = kmp_not_paused;
9088
9089
for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090
kmp_info_t *thread = __kmp_threads[gtid];
9091
if (thread) { // Wake it if sleeping
9092
kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093
thread);
9094
if (fl.is_sleeping())
9095
fl.resume(gtid);
9096
else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097
__kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098
} else { // thread holds the lock and may sleep soon
9099
do { // until either the thread sleeps, or we can get the lock
9100
if (fl.is_sleeping()) {
9101
fl.resume(gtid);
9102
break;
9103
} else if (__kmp_try_suspend_mx(thread)) {
9104
__kmp_unlock_suspend_mx(thread);
9105
break;
9106
}
9107
} while (1);
9108
}
9109
}
9110
}
9111
}
9112
}
9113
9114
// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115
// TODO: add warning messages
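// For example (derived from the logic below): requesting kmp_soft_paused while
// the runtime is not paused calls __kmp_soft_pause() and returns 0; a second
// identical request returns 1 because the runtime is already paused.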
9116
int __kmp_pause_resource(kmp_pause_status_t level) {
9117
if (level == kmp_not_paused) { // requesting resume
9118
if (__kmp_pause_status == kmp_not_paused) {
9119
// error message about runtime not being paused, so can't resume
9120
return 1;
9121
} else {
9122
KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123
__kmp_pause_status == kmp_hard_paused);
9124
__kmp_pause_status = kmp_not_paused;
9125
return 0;
9126
}
9127
} else if (level == kmp_soft_paused) { // requesting soft pause
9128
if (__kmp_pause_status != kmp_not_paused) {
9129
// error message about already being paused
9130
return 1;
9131
} else {
9132
__kmp_soft_pause();
9133
return 0;
9134
}
9135
} else if (level == kmp_hard_paused) { // requesting hard pause
9136
if (__kmp_pause_status != kmp_not_paused) {
9137
// error message about already being paused
9138
return 1;
9139
} else {
9140
__kmp_hard_pause();
9141
return 0;
9142
}
9143
} else {
9144
// error message about invalid level
9145
return 1;
9146
}
9147
}
9148
9149
void __kmp_omp_display_env(int verbose) {
9150
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151
if (__kmp_init_serial == 0)
9152
__kmp_do_serial_initialize();
9153
__kmp_display_env_impl(!verbose, verbose);
9154
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155
}
9156
9157
// The team size is changing, so distributed barrier must be modified
9158
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9159
int new_nthreads) {
9160
KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9161
bp_dist_bar);
9162
kmp_info_t **other_threads = team->t.t_threads;
9163
9164
// We want all the workers to stop waiting on the barrier while we adjust the
9165
// size of the team.
9166
for (int f = 1; f < old_nthreads; ++f) {
9167
KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9168
// Ignore threads that are already inactive or not present in the team
9169
if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9170
// teams construct causes thread_limit to get passed in, and some of
9171
// those could be inactive; just ignore them
9172
continue;
9173
}
9174
// If thread is transitioning still to in_use state, wait for it
9175
if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9176
while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9177
KMP_CPU_PAUSE();
9178
}
9179
// The thread should be in_use now
9180
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9181
// Transition to unused state
9182
team->t.t_threads[f]->th.th_used_in_team.store(2);
9183
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9184
}
9185
// Release all the workers
9186
team->t.b->go_release();
9187
9188
KMP_MFENCE();
9189
9190
// Workers should see transition status 2 and move to 0; but may need to be
9191
// woken up first
9192
int count = old_nthreads - 1;
9193
while (count > 0) {
9194
count = old_nthreads - 1;
9195
for (int f = 1; f < old_nthreads; ++f) {
9196
if (other_threads[f]->th.th_used_in_team.load() != 0) {
9197
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9198
kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9199
void *, other_threads[f]->th.th_sleep_loc);
9200
__kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9201
}
9202
} else {
9203
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9204
count--;
9205
}
9206
}
9207
}
9208
// Now update the barrier size
9209
team->t.b->update_num_threads(new_nthreads);
9210
team->t.b->go_reset();
9211
}
9212
9213
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9214
// Add the threads back to the team
9215
KMP_DEBUG_ASSERT(team);
9216
// Threads were paused and pointed at th_used_in_team temporarily during a
9217
// resize of the team. We're going to set th_used_in_team to 3 to indicate to
9218
// the thread that it should transition itself back into the team. Then, if
9219
// blocktime isn't infinite, the thread could be sleeping, so we send a resume
9220
// to wake it up.
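// Summary of the th_used_in_team states (as used here and in
// __kmp_resize_dist_barrier): 0 = not in the team, 1 = in the team,
// 2 = being removed (worker moves itself to 0), 3 = being added (worker moves
// itself to 1).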
9221
for (int f = 1; f < new_nthreads; ++f) {
9222
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9223
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9224
3);
9225
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9226
__kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9227
(kmp_flag_32<false, false> *)NULL);
9228
}
9229
}
9230
// The threads should be transitioning to the team; when they are done, they
// should have set th_used_in_team to 1. This loop forces the primary thread to
// wait until all threads have moved into the team and are waiting in the
// barrier.
9233
int count = new_nthreads - 1;
9234
while (count > 0) {
9235
count = new_nthreads - 1;
9236
for (int f = 1; f < new_nthreads; ++f) {
9237
if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9238
count--;
9239
}
9240
}
9241
}
9242
}
9243
9244
// Globals and functions for hidden helper task
9245
kmp_info_t **__kmp_hidden_helper_threads;
9246
kmp_info_t *__kmp_hidden_helper_main_thread;
9247
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9248
#if KMP_OS_LINUX
9249
kmp_int32 __kmp_hidden_helper_threads_num = 8;
9250
kmp_int32 __kmp_enable_hidden_helper = TRUE;
9251
#else
9252
kmp_int32 __kmp_hidden_helper_threads_num = 0;
9253
kmp_int32 __kmp_enable_hidden_helper = FALSE;
9254
#endif
9255
9256
namespace {
9257
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9258
9259
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9260
// This is an explicit synchronization on all hidden helper threads, in case
// a regular thread pushes a hidden helper task to a hidden helper thread that
// has not yet been woken up since being released by the main thread after
// creating the team.
9264
KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9265
while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9266
__kmp_hidden_helper_threads_num)
9267
;
9268
9269
// If main thread, then wait for signal
9270
if (__kmpc_master(nullptr, *gtid)) {
9271
// First, unset the initial state and release the initial thread
9272
TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9273
__kmp_hidden_helper_initz_release();
9274
__kmp_hidden_helper_main_thread_wait();
9275
// Now wake up all worker threads
9276
for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9277
__kmp_hidden_helper_worker_thread_signal();
9278
}
9279
}
9280
}
9281
} // namespace
9282
9283
void __kmp_hidden_helper_threads_initz_routine() {
9284
// Create a new root for hidden helper team/threads
9285
const int gtid = __kmp_register_root(TRUE);
9286
__kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9287
__kmp_hidden_helper_threads = &__kmp_threads[gtid];
9288
__kmp_hidden_helper_main_thread->th.th_set_nproc =
9289
__kmp_hidden_helper_threads_num;
9290
9291
KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9292
9293
__kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9294
9295
// Set the initialization flag to FALSE
9296
TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9297
9298
__kmp_hidden_helper_threads_deinitz_release();
9299
}
9300
9301
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/
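// Illustrative example (hypothetical machine, not from the original source):
// on a 2-socket system with 8 cores per socket and 2 HW threads per core,
// KMP_NESTING_MODE=1 might produce nesting levels with 2, 8, and 2 threads
// respectively, subject to the duplicate/single-entity level skipping
// described above.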
9316
9317
// Allocate space to store nesting levels
9318
void __kmp_init_nesting_mode() {
9319
int levels = KMP_HW_LAST;
9320
__kmp_nesting_mode_nlevels = levels;
9321
__kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9322
for (int i = 0; i < levels; ++i)
9323
__kmp_nesting_nth_level[i] = 0;
9324
if (__kmp_nested_nth.size < levels) {
9325
__kmp_nested_nth.nth =
9326
(int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9327
__kmp_nested_nth.size = levels;
9328
}
9329
}
9330
9331
// Set # threads for top levels of nesting; must be called after topology set
9332
void __kmp_set_nesting_mode_threads() {
9333
kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9334
9335
if (__kmp_nesting_mode == 1)
9336
__kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9337
else if (__kmp_nesting_mode > 1)
9338
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9339
9340
if (__kmp_topology) { // use topology info
9341
int loc, hw_level;
9342
for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9343
loc < __kmp_nesting_mode_nlevels;
9344
loc++, hw_level++) {
9345
__kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9346
if (__kmp_nesting_nth_level[loc] == 1)
9347
loc--;
9348
}
9349
// Make sure all cores are used
9350
if (__kmp_nesting_mode > 1 && loc > 1) {
9351
int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9352
int num_cores = __kmp_topology->get_count(core_level);
9353
int upper_levels = 1;
9354
for (int level = 0; level < loc - 1; ++level)
9355
upper_levels *= __kmp_nesting_nth_level[level];
9356
if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9357
__kmp_nesting_nth_level[loc - 1] =
9358
num_cores / __kmp_nesting_nth_level[loc - 2];
9359
}
9360
__kmp_nesting_mode_nlevels = loc;
9361
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9362
} else { // no topology info available; provide a reasonable guesstimation
9363
if (__kmp_avail_proc >= 4) {
9364
__kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9365
__kmp_nesting_nth_level[1] = 2;
9366
__kmp_nesting_mode_nlevels = 2;
9367
} else {
9368
__kmp_nesting_nth_level[0] = __kmp_avail_proc;
9369
__kmp_nesting_mode_nlevels = 1;
9370
}
9371
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9372
}
9373
for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9374
__kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9375
}
9376
set__nproc(thread, __kmp_nesting_nth_level[0]);
9377
if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9378
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9379
if (get__max_active_levels(thread) > 1) {
9380
// if max levels was set, set nesting mode levels to same
9381
__kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9382
}
9383
if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9384
set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9385
}
9386
9387
// Empty symbols to export (see exports_so.txt) when feature is disabled
9388
extern "C" {
9389
#if !KMP_STATS_ENABLED
9390
void __kmp_reset_stats() {}
9391
#endif
9392
#if !USE_DEBUGGER
9393
int __kmp_omp_debug_struct_info = FALSE;
9394
int __kmp_debugging = FALSE;
9395
#endif
9396
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9397
void __kmp_itt_fini_ittlib() {}
9398
void __kmp_itt_init_ittlib() {}
9399
#endif
9400
}
9401
9402
// end of file