GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp
/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#ifndef KMP_USE_SHM
// Windows and WASI do not need these include files as they don't use shared
// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);

static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *new_nested_nth =
      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
          sizeof(kmp_nested_nthreads_t));
  int new_size = level + thr->th.th_set_nested_nth_sz;
  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
  for (int i = 0; i < level + 1; ++i)
    new_nested_nth->nth[i] = 0;
  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
    new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
  new_nested_nth->size = new_nested_nth->used = new_size;
  return new_nested_nth;
}

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return negative value because this
        // function can be called by thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destructed. It can happen when this function
  // is called in end library routine.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

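// [Editor's note] The block below is an illustrative sketch added for this
// annotated listing; it is not part of the upstream kmp_runtime.cpp. It
// isolates the stack-containment test the "internal algorithm" above relies
// on: stacks grow downward, so a local address belongs to a registered thread
// only if it lies within (stackbase - stacksize, stackbase]. The function name
// is hypothetical.
#if 0
static bool __kmp_sketch_addr_on_stack(const char *addr, const char *stack_base,
                                       size_t stack_size) {
  // Equivalent to the stack_addr <= stack_base && stack_diff <= stack_size
  // test performed per thread in __kmp_get_global_thread_id().
  return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
}
#endif
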
int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}

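// [Editor's note] Illustrative sketch, not part of the upstream file: the
// overlap check above treats two stacks as conflicting when either endpoint of
// one range falls strictly inside the other. A minimal standalone version of
// that predicate (hypothetical name):
#if 0
static bool __kmp_sketch_ranges_overlap(const char *beg, const char *end,
                                        const char *other_beg,
                                        const char *other_end) {
  // Same condition as the if() in __kmp_check_stack_overlap().
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif
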
/* ------------------------------------------------------------------------ */

void __kmp_infinite_loop(void) {
  static int done = FALSE;

  while (!done) {
    KMP_YIELD(TRUE);
  }
}

#define MAX_MESSAGE 512

void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows* OS by default abort() causes pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.

     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if ENABLE_LIBOMPTARGET
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
#endif

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

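// [Editor's note] Illustrative sketch, not part of the upstream file: the
// SINGLE construct above elects exactly one thread per construct instance by
// having every thread attempt a single atomic compare-and-exchange on the
// team's construct counter; only the winner sees the exchange succeed. A
// minimal standalone analogue using plain C++ atomics (hypothetical names):
#if 0
#include <atomic>
static bool sketch_enter_single(std::atomic<int> &team_construct,
                                int my_old_count) {
  int expected = my_old_count;
  // Succeeds for exactly one thread per construct; the rest observe a newer
  // counter value and skip the single block.
  return team_construct.compare_exchange_strong(expected, my_old_count + 1,
                                                std::memory_order_acquire);
}
#endif
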
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG

  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  }
  return new_nthreads;
}

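// [Editor's note] Illustrative sketch, not part of the upstream file: each of
// the limit checks in __kmp_reserve_threads() follows the same shape --
// compute how many additional threads the limit still allows, clamp the
// request to that, and never fall below one thread (a serialized region is
// always possible). A condensed standalone version of that clamp, with
// hypothetical names:
#if 0
static int sketch_clamp_to_limit(int requested, int already_active, int limit) {
  int allowed = limit - already_active;
  if (allowed < 1)
    allowed = 1; // always leave room for a 1-thread (serialized) region
  return requested < allowed ? requested : allowed;
}
#endif
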
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid,
                                    int fork_teams_workers) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    // Do not partition the places list for teams construct workers who
    // haven't actually been forked to do real work yet. This partitioning
    // will take place in the parallel region nested within the teams construct.
    if (!fork_teams_workers) {
      __kmp_partition_places(team);
    }
#endif

    if (team->t.t_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
    }
  }

  // Take care of primary thread's task state
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (use_hot_team) {
      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
      KA_TRACE(
          20,
          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
           "%p, new task_team %p / team %p\n",
           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
           team));

      // Store primary thread's current task state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);

      // Restore primary thread's task state to hot team's state
      // by using thread 1's task state
      if (team->t.t_nproc > 1) {
        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
                         team->t.t_threads[1]->th.th_task_state == 1);
        KMP_CHECK_UPDATE(master_th->th.th_task_state,
                         team->t.t_threads[1]->th.th_task_state);
      } else {
        master_th->th.th_task_state = 0;
      }
    } else {
      // Store primary thread's current task_state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);
      // Are not using hot team, so set task state to 0.
      master_th->th.th_task_state = 0;
    }
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team.
    // the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

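// [Editor's note] Illustrative sketch, not part of the upstream file: the
// KMP_CHECK_UPDATE pattern used above writes a shared team field only when the
// new value actually differs, so an unchanged cache line is never forced into
// modified state and the other team members keep reading their clean copy. A
// generic standalone equivalent of that idiom (hypothetical helper, not the
// runtime's macro):
#if 0
template <typename T> static inline void sketch_check_update(T &dst, T src) {
  if (dst != src) // skip the store when nothing changed
    dst = src;
}
#endif
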
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    if (this_thr->th.th_team->t.t_nested_nth)
      serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
    else
      serial_team->t.t_nested_nth = &__kmp_nested_nth;
    // Save previous team's task state on serial team structure
    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (this_thr->th.th_team->t.t_nested_nth)
      nested_nth = this_thr->th.th_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;
    this_thr->th.th_task_team = NULL;
    this_thr->th.th_task_state = 0;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level

    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (serial_team->t.t_nested_nth)
      nested_nth = serial_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    /* allocate/push task team stack */
    __kmp_push_task_team_node(this_thr, serial_team);

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // Don't use lw_taskteam after linking. Content was swapped.
1426
1427
/* OMPT implicit task begin */
1428
if (ompt_enabled.ompt_callback_implicit_task) {
1429
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431
OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433
OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434
__kmp_tid_from_gtid(global_tid);
1435
}
1436
1437
/* OMPT state */
1438
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440
OMPT_GET_FRAME_ADDRESS(0);
1441
}
1442
#endif
1443
}
1444
1445
// Test if this fork is for a team closely nested in a teams construct
1446
static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447
microtask_t microtask, int level,
1448
int teams_level, kmp_va_list ap) {
1449
return (master_th->th.th_teams_microtask && ap &&
1450
microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451
}
1452
1453
// Test if this fork is for the teams construct, i.e. to form the outer league
1454
// of teams
1455
static inline bool __kmp_is_entering_teams(int active_level, int level,
1456
int teams_level, kmp_va_list ap) {
1457
return ((ap == NULL && active_level == 0) ||
1458
(ap && teams_level > 0 && teams_level == level));
1459
}
1460
1461
// AC: This is start of parallel that is nested inside teams construct.
1462
// The team is actual (hot), all workers are ready at the fork barrier.
1463
// No lock needed to initialize the team a bit, then free workers.
1464
static inline int
1465
__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466
kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467
enum fork_context_e call_context, microtask_t microtask,
1468
launch_t invoker, int master_set_numthreads, int level,
1469
#if OMPT_SUPPORT
1470
ompt_data_t ompt_parallel_data, void *return_address,
1471
#endif
1472
kmp_va_list ap) {
1473
void **argv;
1474
int i;
1475
1476
parent_team->t.t_ident = loc;
1477
__kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478
parent_team->t.t_argc = argc;
1479
argv = (void **)parent_team->t.t_argv;
1480
for (i = argc - 1; i >= 0; --i) {
1481
*argv++ = va_arg(kmp_va_deref(ap), void *);
1482
}
1483
// Increment our nested depth levels, but not increase the serialization
1484
if (parent_team == master_th->th.th_serial_team) {
1485
// AC: we are in serialized parallel
1486
__kmpc_serialized_parallel(loc, gtid);
1487
KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488
1489
if (call_context == fork_context_gnu) {
1490
// AC: need to decrement t_serialized for enquiry functions to work
1491
// correctly, will restore at join time
1492
parent_team->t.t_serialized--;
1493
return TRUE;
1494
}
1495
1496
#if OMPD_SUPPORT
1497
parent_team->t.t_pkfn = microtask;
1498
#endif
1499
1500
#if OMPT_SUPPORT
1501
void *dummy;
1502
void **exit_frame_p;
1503
ompt_data_t *implicit_task_data;
1504
ompt_lw_taskteam_t lw_taskteam;
1505
1506
if (ompt_enabled.enabled) {
1507
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508
&ompt_parallel_data, return_address);
1509
exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510
1511
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512
// Don't use lw_taskteam after linking. Content was swapped.
1513
1514
/* OMPT implicit task begin */
1515
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516
if (ompt_enabled.ompt_callback_implicit_task) {
1517
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520
1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521
}
1522
1523
/* OMPT state */
1524
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525
} else {
1526
exit_frame_p = &dummy;
1527
}
1528
#endif
1529
1530
// AC: need to decrement t_serialized for enquiry functions to work
1531
// correctly, will restore at join time
1532
parent_team->t.t_serialized--;
1533
1534
{
1535
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538
#if OMPT_SUPPORT
1539
,
1540
exit_frame_p
1541
#endif
1542
);
1543
}
1544
1545
#if OMPT_SUPPORT
1546
if (ompt_enabled.enabled) {
1547
*exit_frame_p = NULL;
1548
OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549
if (ompt_enabled.ompt_callback_implicit_task) {
1550
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551
ompt_scope_end, NULL, implicit_task_data, 1,
1552
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553
}
1554
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555
__ompt_lw_taskteam_unlink(master_th);
1556
if (ompt_enabled.ompt_callback_parallel_end) {
1557
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558
&ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559
OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560
}
1561
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562
}
1563
#endif
1564
return TRUE;
1565
}
1566
1567
parent_team->t.t_pkfn = microtask;
1568
parent_team->t.t_invoke = invoker;
1569
KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570
parent_team->t.t_active_level++;
1571
parent_team->t.t_level++;
1572
parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573
1574
// If the threads allocated to the team are less than the thread limit, update
1575
// the thread limit here. th_teams_size.nth is specific to this team nested
1576
// in a teams construct, the team is fully created, and we're about to do
1577
// the actual fork. Best to do this here so that the subsequent uses below
1578
// and in the join have the correct value.
1579
master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580
1581
#if OMPT_SUPPORT
1582
if (ompt_enabled.enabled) {
1583
ompt_lw_taskteam_t lw_taskteam;
1584
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585
return_address);
1586
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587
}
1588
#endif
1589
1590
/* Change number of threads in the team if requested */
1591
if (master_set_numthreads) { // The parallel has num_threads clause
1592
if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593
// AC: can only reduce the number of threads dynamically, can't increase
1594
kmp_info_t **other_threads = parent_team->t.t_threads;
1595
// NOTE: if using distributed barrier, we need to run this code block
1596
// even when the team size appears not to have changed from the max.
1597
int old_proc = master_th->th.th_teams_size.nth;
1598
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599
__kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600
__kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601
}
1602
parent_team->t.t_nproc = master_set_numthreads;
1603
for (i = 0; i < master_set_numthreads; ++i) {
1604
other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605
}
1606
}
1607
// Keep extra threads hot in the team for possible subsequent parallel regions
1608
master_th->th.th_set_nproc = 0;
1609
}
1610
1611
#if USE_DEBUGGER
1612
if (__kmp_debugging) { // Let debugger override number of threads.
1613
int nth = __kmp_omp_num_threads(loc);
1614
if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615
master_set_numthreads = nth;
1616
}
1617
}
1618
#endif
1619
1620
// Figure out the proc_bind policy for the nested parallel within teams
1621
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622
// proc_bind_default means don't update
1623
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625
proc_bind = proc_bind_false;
1626
} else {
1627
// No proc_bind clause specified; use current proc-bind-var
1628
if (proc_bind == proc_bind_default) {
1629
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630
}
1631
/* else: The proc_bind policy was specified explicitly on the parallel clause.
1632
This overrides proc-bind-var for this parallel region, but does not
1633
change proc-bind-var. */
1634
// Figure the value of proc-bind-var for the child threads.
1635
if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636
(__kmp_nested_proc_bind.bind_types[level + 1] !=
1637
master_th->th.th_current_task->td_icvs.proc_bind)) {
1638
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639
}
1640
}
1641
KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642
// Need to change the bind-var ICV to correct value for each implicit task
1643
if (proc_bind_icv != proc_bind_default &&
1644
master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645
kmp_info_t **other_threads = parent_team->t.t_threads;
1646
for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647
other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648
}
1649
}
1650
// Reset for next parallel region
1651
master_th->th.th_set_proc_bind = proc_bind_default;
1652
1653
#if USE_ITT_BUILD && USE_ITT_NOTIFY
1654
if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655
KMP_ITT_DEBUG) &&
1656
__kmp_forkjoin_frames_mode == 3 &&
1657
parent_team->t.t_active_level == 1 // only report frames at level 1
1658
&& master_th->th.th_teams_size.nteams == 1) {
1659
kmp_uint64 tmp_time = __itt_get_timestamp();
1660
master_th->th.th_frame_time = tmp_time;
1661
parent_team->t.t_region_time = tmp_time;
1662
}
1663
if (__itt_stack_caller_create_ptr) {
1664
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665
// create new stack stitching id before entering fork barrier
1666
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667
}
1668
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669
#if KMP_AFFINITY_SUPPORTED
1670
__kmp_partition_places(parent_team);
1671
#endif
1672
1673
KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674
"master_th=%p, gtid=%d\n",
1675
root, parent_team, master_th, gtid));
1676
__kmp_internal_fork(loc, gtid, parent_team);
1677
KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678
"master_th=%p, gtid=%d\n",
1679
root, parent_team, master_th, gtid));
1680
1681
if (call_context == fork_context_gnu)
1682
return TRUE;
1683
1684
/* Invoke microtask for PRIMARY thread */
1685
KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686
parent_team->t.t_id, parent_team->t.t_pkfn));
1687
1688
if (!parent_team->t.t_invoke(gtid)) {
1689
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690
}
1691
KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692
parent_team->t.t_id, parent_team->t.t_pkfn));
1693
KMP_MB(); /* Flush all pending memory write invalidates. */
1694
1695
KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696
1697
return TRUE;
1698
}
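/* Illustrative sketch, not part of the runtime (work() is a placeholder): the
   path above serves a parallel region closely nested inside a teams construct,
   as in

     #pragma omp teams num_teams(2) thread_limit(4)
     #pragma omp parallel num_threads(4)
     work();

   The initial thread of each team reaches __kmp_fork_call with
   th_teams_microtask set, and the fork is carried out on the existing parent
   team by __kmp_fork_in_teams rather than by allocating a new team. */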
1699
1700
// Create a serialized parallel region
1701
static inline int
1702
__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703
kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704
kmp_info_t *master_th, kmp_team_t *parent_team,
1705
#if OMPT_SUPPORT
1706
ompt_data_t *ompt_parallel_data, void **return_address,
1707
ompt_data_t **parent_task_data,
1708
#endif
1709
kmp_va_list ap) {
1710
kmp_team_t *team;
1711
int i;
1712
void **argv;
1713
1714
/* josh todo: hypothetical question: what do we do for OS X*? */
1715
#if KMP_OS_LINUX && \
1716
(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717
SimpleVLA<void *> args(argc);
1718
#else
1719
void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721
KMP_ARCH_AARCH64) */
1722
1723
KA_TRACE(
1724
20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725
1726
__kmpc_serialized_parallel(loc, gtid);
1727
1728
#if OMPD_SUPPORT
1729
master_th->th.th_serial_team->t.t_pkfn = microtask;
1730
#endif
1731
1732
if (call_context == fork_context_intel) {
1733
/* TODO this sucks, use the compiler itself to pass args! :) */
1734
master_th->th.th_serial_team->t.t_ident = loc;
1735
if (!ap) {
1736
// revert change made in __kmpc_serialized_parallel()
1737
master_th->th.th_serial_team->t.t_level--;
1738
// Get args from parent team for teams construct
1739
1740
#if OMPT_SUPPORT
1741
void *dummy;
1742
void **exit_frame_p;
1743
ompt_task_info_t *task_info;
1744
ompt_lw_taskteam_t lw_taskteam;
1745
1746
if (ompt_enabled.enabled) {
1747
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748
ompt_parallel_data, *return_address);
1749
1750
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751
// Don't use lw_taskteam after linking. Content was swapped.
1752
task_info = OMPT_CUR_TASK_INFO(master_th);
1753
exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754
if (ompt_enabled.ompt_callback_implicit_task) {
1755
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758
&(task_info->task_data), 1,
1759
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760
}
1761
1762
/* OMPT state */
1763
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764
} else {
1765
exit_frame_p = &dummy;
1766
}
1767
#endif
1768
1769
{
1770
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773
#if OMPT_SUPPORT
1774
,
1775
exit_frame_p
1776
#endif
1777
);
1778
}
1779
1780
#if OMPT_SUPPORT
1781
if (ompt_enabled.enabled) {
1782
*exit_frame_p = NULL;
1783
if (ompt_enabled.ompt_callback_implicit_task) {
1784
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785
ompt_scope_end, NULL, &(task_info->task_data), 1,
1786
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787
}
1788
*ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789
__ompt_lw_taskteam_unlink(master_th);
1790
if (ompt_enabled.ompt_callback_parallel_end) {
1791
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792
ompt_parallel_data, *parent_task_data,
1793
OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794
}
1795
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796
}
1797
#endif
1798
} else if (microtask == (microtask_t)__kmp_teams_master) {
1799
KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800
team = master_th->th.th_team;
1801
// team->t.t_pkfn = microtask;
1802
team->t.t_invoke = invoker;
1803
__kmp_alloc_argv_entries(argc, team, TRUE);
1804
team->t.t_argc = argc;
1805
argv = (void **)team->t.t_argv;
1806
for (i = argc - 1; i >= 0; --i)
1807
*argv++ = va_arg(kmp_va_deref(ap), void *);
1808
// AC: revert change made in __kmpc_serialized_parallel()
1809
// because initial code in teams should have level=0
1810
team->t.t_level--;
1811
// AC: call special invoker for outer "parallel" of teams construct
1812
invoker(gtid);
1813
#if OMPT_SUPPORT
1814
if (ompt_enabled.enabled) {
1815
ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816
if (ompt_enabled.ompt_callback_implicit_task) {
1817
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818
ompt_scope_end, NULL, &(task_info->task_data), 0,
1819
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820
}
1821
if (ompt_enabled.ompt_callback_parallel_end) {
1822
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823
ompt_parallel_data, *parent_task_data,
1824
OMPT_INVOKER(call_context) | ompt_parallel_league,
1825
*return_address);
1826
}
1827
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828
}
1829
#endif
1830
} else {
1831
argv = args;
1832
for (i = argc - 1; i >= 0; --i)
1833
*argv++ = va_arg(kmp_va_deref(ap), void *);
1834
KMP_MB();
1835
1836
#if OMPT_SUPPORT
1837
void *dummy;
1838
void **exit_frame_p;
1839
ompt_task_info_t *task_info;
1840
ompt_lw_taskteam_t lw_taskteam;
1841
ompt_data_t *implicit_task_data;
1842
1843
if (ompt_enabled.enabled) {
1844
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845
ompt_parallel_data, *return_address);
1846
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847
// Don't use lw_taskteam after linking. Content was swapped.
1848
task_info = OMPT_CUR_TASK_INFO(master_th);
1849
exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850
1851
/* OMPT implicit task begin */
1852
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853
if (ompt_enabled.ompt_callback_implicit_task) {
1854
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856
implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857
ompt_task_implicit);
1858
OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859
}
1860
1861
/* OMPT state */
1862
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863
} else {
1864
exit_frame_p = &dummy;
1865
}
1866
#endif
1867
1868
{
1869
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871
__kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872
#if OMPT_SUPPORT
1873
,
1874
exit_frame_p
1875
#endif
1876
);
1877
}
1878
1879
#if OMPT_SUPPORT
1880
if (ompt_enabled.enabled) {
1881
*exit_frame_p = NULL;
1882
if (ompt_enabled.ompt_callback_implicit_task) {
1883
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884
ompt_scope_end, NULL, &(task_info->task_data), 1,
1885
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886
}
1887
1888
*ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889
__ompt_lw_taskteam_unlink(master_th);
1890
if (ompt_enabled.ompt_callback_parallel_end) {
1891
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892
ompt_parallel_data, *parent_task_data,
1893
OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894
}
1895
master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896
}
1897
#endif
1898
}
1899
} else if (call_context == fork_context_gnu) {
1900
#if OMPT_SUPPORT
1901
if (ompt_enabled.enabled) {
1902
ompt_lw_taskteam_t lwt;
1903
__ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904
*return_address);
1905
1906
lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1907
__ompt_lw_taskteam_link(&lwt, master_th, 1);
1908
}
1909
// Don't use lw_taskteam after linking. Content was swapped.
1910
#endif
1911
1912
// we were called from GNU native code
1913
KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914
return FALSE;
1915
} else {
1916
KMP_ASSERT2(call_context < fork_context_last,
1917
"__kmp_serial_fork_call: unknown fork_context parameter");
1918
}
1919
1920
KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921
KMP_MB();
1922
return FALSE;
1923
}
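/* Illustrative sketch, not part of the runtime (work() is a placeholder): this
   serialized path is taken whenever __kmp_fork_call ends up with
   nthreads == 1, for example

     omp_set_num_threads(1);
     #pragma omp parallel
     work();

   The microtask then runs directly on the encountering thread via
   __kmp_invoke_microtask, and FALSE ("serialized") is reported back to the
   caller. */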
1924
1925
/* most of the work for a fork */
1926
/* return true if we really went parallel, false if serialized */
1927
int __kmp_fork_call(ident_t *loc, int gtid,
1928
enum fork_context_e call_context, // Intel, GNU, ...
1929
kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930
kmp_va_list ap) {
1931
void **argv;
1932
int i;
1933
int master_tid;
1934
int master_this_cons;
1935
kmp_team_t *team;
1936
kmp_team_t *parent_team;
1937
kmp_info_t *master_th;
1938
kmp_root_t *root;
1939
int nthreads;
1940
int master_active;
1941
int master_set_numthreads;
1942
int task_thread_limit = 0;
1943
int level;
1944
int active_level;
1945
int teams_level;
1946
#if KMP_NESTED_HOT_TEAMS
1947
kmp_hot_team_ptr_t **p_hot_teams;
1948
#endif
1949
{ // KMP_TIME_BLOCK
1950
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952
1953
KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954
if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955
/* Some systems prefer the stack for the root thread(s) to start with */
1956
/* some gap from the parent stack to prevent false sharing. */
1957
void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958
/* These 2 lines below are so this does not get optimized out */
1959
if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960
__kmp_stkpadding += (short)((kmp_int64)dummy);
1961
}
1962
1963
/* initialize if needed */
1964
KMP_DEBUG_ASSERT(
1965
__kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966
if (!TCR_4(__kmp_init_parallel))
1967
__kmp_parallel_initialize();
1968
__kmp_resume_if_soft_paused();
1969
1970
/* setup current data */
1971
// AC: potentially unsafe, not in sync with library shutdown,
1972
// __kmp_threads can be freed
1973
master_th = __kmp_threads[gtid];
1974
1975
parent_team = master_th->th.th_team;
1976
master_tid = master_th->th.th_info.ds.ds_tid;
1977
master_this_cons = master_th->th.th_local.this_construct;
1978
root = master_th->th.th_root;
1979
master_active = root->r.r_active;
1980
master_set_numthreads = master_th->th.th_set_nproc;
1981
task_thread_limit =
1982
master_th->th.th_current_task->td_icvs.task_thread_limit;
1983
1984
#if OMPT_SUPPORT
1985
ompt_data_t ompt_parallel_data = ompt_data_none;
1986
ompt_data_t *parent_task_data;
1987
ompt_frame_t *ompt_frame;
1988
void *return_address = NULL;
1989
1990
if (ompt_enabled.enabled) {
1991
__ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992
NULL, NULL);
1993
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994
}
1995
#endif
1996
1997
// Assign affinity to root thread if it hasn't happened yet
1998
__kmp_assign_root_init_mask();
1999
2000
// Nested level will be an index in the nested nthreads array
2001
level = parent_team->t.t_level;
2002
// used to launch non-serial teams even if nested is not allowed
2003
active_level = parent_team->t.t_active_level;
2004
// needed to check nesting inside the teams
2005
teams_level = master_th->th.th_teams_level;
2006
#if KMP_NESTED_HOT_TEAMS
2007
p_hot_teams = &master_th->th.th_hot_teams;
2008
if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009
*p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012
// either the actual hot team, or unused (when active_level > 0)
2013
(*p_hot_teams)[0].hot_team_nth = 1;
2014
}
2015
#endif
2016
2017
#if OMPT_SUPPORT
2018
if (ompt_enabled.enabled) {
2019
if (ompt_enabled.ompt_callback_parallel_begin) {
2020
int team_size = master_set_numthreads
2021
? master_set_numthreads
2022
: get__nproc_2(parent_team, master_tid);
2023
int flags = OMPT_INVOKER(call_context) |
2024
((microtask == (microtask_t)__kmp_teams_master)
2025
? ompt_parallel_league
2026
: ompt_parallel_team);
2027
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028
parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029
return_address);
2030
}
2031
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032
}
2033
#endif
2034
2035
master_th->th.th_ident = loc;
2036
2037
// Parallel closely nested in teams construct:
2038
if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039
return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040
call_context, microtask, invoker,
2041
master_set_numthreads, level,
2042
#if OMPT_SUPPORT
2043
ompt_parallel_data, return_address,
2044
#endif
2045
ap);
2046
} // End parallel closely nested in teams construct
2047
2048
// Need this to happen before we determine the number of threads, not while
2049
// we are allocating the team
2050
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051
2052
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053
2054
// Determine the number of threads
2055
int enter_teams =
2056
__kmp_is_entering_teams(active_level, level, teams_level, ap);
2057
if ((!enter_teams &&
2058
(parent_team->t.t_active_level >=
2059
master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060
(__kmp_library == library_serial)) {
2061
KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062
nthreads = 1;
2063
} else {
2064
nthreads = master_set_numthreads
2065
? master_set_numthreads
2066
// TODO: get nproc directly from current task
2067
: get__nproc_2(parent_team, master_tid);
2068
// Use the thread_limit set for the current target task if it exists, else go
2069
// with the deduced nthreads
2070
nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071
? task_thread_limit
2072
: nthreads;
2073
// Check if we need to take the forkjoin lock (no need for serialized
2074
// parallel out of teams construct).
2075
if (nthreads > 1) {
2076
/* determine how many new threads we can use */
2077
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078
/* AC: If we execute teams from parallel region (on host), then teams
2079
should be created but each can only have 1 thread if nesting is
2080
disabled. If teams are called from a serial region, then teams and their
2081
threads should be created regardless of the nesting setting. */
2082
nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083
nthreads, enter_teams);
2084
if (nthreads == 1) {
2085
// Free lock for single thread execution here; for multi-thread
2086
// execution it will be freed later after team of threads created
2087
// and initialized
2088
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089
}
2090
}
2091
}
2092
KMP_DEBUG_ASSERT(nthreads > 0);
2093
2094
// If we temporarily changed the set number of threads then restore it now
2095
master_th->th.th_set_nproc = 0;
2096
2097
if (nthreads == 1) {
2098
return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099
invoker, master_th, parent_team,
2100
#if OMPT_SUPPORT
2101
&ompt_parallel_data, &return_address,
2102
&parent_task_data,
2103
#endif
2104
ap);
2105
} // if (nthreads == 1)
2106
2107
// GEH: only modify the executing flag in the case when not serialized
2108
// serialized case is handled in kmpc_serialized_parallel
2109
KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110
"curtask=%p, curtask_max_aclevel=%d\n",
2111
parent_team->t.t_active_level, master_th,
2112
master_th->th.th_current_task,
2113
master_th->th.th_current_task->td_icvs.max_active_levels));
2114
// TODO: GEH - cannot do this assertion because root thread not set up as
2115
// executing
2116
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117
master_th->th.th_current_task->td_flags.executing = 0;
2118
2119
if (!master_th->th.th_teams_microtask || level > teams_level) {
2120
/* Increment our nested depth level */
2121
KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122
}
2123
2124
// See if we need to make a copy of the ICVs.
2125
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126
kmp_nested_nthreads_t *nested_nth = NULL;
2127
if (!master_th->th.th_set_nested_nth &&
2128
(level + 1 < parent_team->t.t_nested_nth->used) &&
2129
(parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130
nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131
} else if (master_th->th.th_set_nested_nth) {
2132
nested_nth = __kmp_override_nested_nth(master_th, level);
2133
if ((level + 1 < nested_nth->used) &&
2134
(nested_nth->nth[level + 1] != nthreads_icv))
2135
nthreads_icv = nested_nth->nth[level + 1];
2136
else
2137
nthreads_icv = 0; // don't update
2138
} else {
2139
nthreads_icv = 0; // don't update
2140
}
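/* Illustrative sketch, not part of the runtime (work() is a placeholder): with
   a nested number-of-threads list such as OMP_NUM_THREADS=4,2, the outer
   region runs with 4 threads and nth[level + 1] == 2 overrides nthreads_icv
   for the inner region:

     #pragma omp parallel          // 4 threads
     #pragma omp parallel          // 2 threads each, if nesting is allowed
     work();

   A value of 0 in nthreads_icv means "leave the inherited nproc ICV
   unchanged". */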
2141
2142
// Figure out the proc_bind_policy for the new team.
2143
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144
// proc_bind_default means don't update
2145
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147
proc_bind = proc_bind_false;
2148
} else {
2149
// No proc_bind clause specified; use current proc-bind-var for this
2150
// parallel region
2151
if (proc_bind == proc_bind_default) {
2152
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153
}
2154
// Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155
if (master_th->th.th_teams_microtask &&
2156
microtask == (microtask_t)__kmp_teams_master) {
2157
proc_bind = __kmp_teams_proc_bind;
2158
}
2159
/* else: The proc_bind policy was specified explicitly on the parallel clause.
2160
This overrides proc-bind-var for this parallel region, but does not
2161
change proc-bind-var. */
2162
// Figure the value of proc-bind-var for the child threads.
2163
if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164
(__kmp_nested_proc_bind.bind_types[level + 1] !=
2165
master_th->th.th_current_task->td_icvs.proc_bind)) {
2166
// Do not modify the proc bind icv for the two teams construct forks
2167
// They just let the proc bind icv pass through
2168
if (!master_th->th.th_teams_microtask ||
2169
!(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171
}
2172
}
2173
2174
// Reset for next parallel region
2175
master_th->th.th_set_proc_bind = proc_bind_default;
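/* Illustrative sketch, not part of the runtime: with OMP_PROC_BIND=spread,close
   the outer region is bound "spread" and bind_types[level + 1] ==
   proc_bind_close becomes the proc-bind-var of its implicit tasks, while an
   explicit clause such as

     #pragma omp parallel proc_bind(close)

   overrides the binding for that region only; as the comment above notes,
   proc-bind-var itself is not changed by the clause. */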
2176
2177
if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178
kmp_internal_control_t new_icvs;
2179
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180
new_icvs.next = NULL;
2181
if (nthreads_icv > 0) {
2182
new_icvs.nproc = nthreads_icv;
2183
}
2184
if (proc_bind_icv != proc_bind_default) {
2185
new_icvs.proc_bind = proc_bind_icv;
2186
}
2187
2188
/* allocate a new parallel team */
2189
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190
team = __kmp_allocate_team(root, nthreads, nthreads,
2191
#if OMPT_SUPPORT
2192
ompt_parallel_data,
2193
#endif
2194
proc_bind, &new_icvs,
2195
argc USE_NESTED_HOT_ARG(master_th));
2196
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198
} else {
2199
/* allocate a new parallel team */
2200
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201
team = __kmp_allocate_team(root, nthreads, nthreads,
2202
#if OMPT_SUPPORT
2203
ompt_parallel_data,
2204
#endif
2205
proc_bind,
2206
&master_th->th.th_current_task->td_icvs,
2207
argc USE_NESTED_HOT_ARG(master_th));
2208
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210
&master_th->th.th_current_task->td_icvs);
2211
}
2212
KF_TRACE(
2213
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214
2215
/* setup the new team */
2216
KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217
KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218
KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219
KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220
KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221
#if OMPT_SUPPORT
2222
KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223
return_address);
2224
#endif
2225
KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226
// TODO: parent_team->t.t_level == INT_MAX ???
2227
if (!master_th->th.th_teams_microtask || level > teams_level) {
2228
int new_level = parent_team->t.t_level + 1;
2229
KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230
new_level = parent_team->t.t_active_level + 1;
2231
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232
} else {
2233
// AC: Do not increase parallel level at start of the teams construct
2234
int new_level = parent_team->t.t_level;
2235
KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236
new_level = parent_team->t.t_active_level;
2237
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238
}
2239
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240
// set primary thread's schedule as new run-time schedule
2241
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242
2243
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245
2246
// Check if hot team has potentially outdated list, and if so, free it
2247
if (team->t.t_nested_nth &&
2248
team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250
KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251
team->t.t_nested_nth = NULL;
2252
}
2253
team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254
if (master_th->th.th_set_nested_nth) {
2255
if (!nested_nth)
2256
nested_nth = __kmp_override_nested_nth(master_th, level);
2257
team->t.t_nested_nth = nested_nth;
2258
KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259
master_th->th.th_set_nested_nth = NULL;
2260
master_th->th.th_set_nested_nth_sz = 0;
2261
master_th->th.th_nt_strict = false;
2262
}
2263
2264
// Update the floating point rounding in the team if required.
2265
propagateFPControl(team);
2266
#if OMPD_SUPPORT
2267
if (ompd_state & OMPD_ENABLE_BP)
2268
ompd_bp_parallel_begin();
2269
#endif
2270
2271
KA_TRACE(
2272
20,
2273
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275
team->t.t_nproc));
2276
KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277
(team->t.t_master_tid == 0 &&
2278
(team->t.t_parent == root->r.r_root_team ||
2279
team->t.t_parent->t.t_serialized)));
2280
KMP_MB();
2281
2282
/* now, setup the arguments */
2283
argv = (void **)team->t.t_argv;
2284
if (ap) {
2285
for (i = argc - 1; i >= 0; --i) {
2286
void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287
KMP_CHECK_UPDATE(*argv, new_argv);
2288
argv++;
2289
}
2290
} else {
2291
for (i = 0; i < argc; ++i) {
2292
// Get args from parent team for teams construct
2293
KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294
}
2295
}
2296
2297
/* now actually fork the threads */
2298
KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300
root->r.r_active = TRUE;
2301
2302
__kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303
__kmp_setup_icv_copy(team, nthreads,
2304
&master_th->th.th_current_task->td_icvs, loc);
2305
2306
#if OMPT_SUPPORT
2307
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308
#endif
2309
2310
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311
2312
#if USE_ITT_BUILD
2313
if (team->t.t_active_level == 1 // only report frames at level 1
2314
&& !master_th->th.th_teams_microtask) { // not in teams construct
2315
#if USE_ITT_NOTIFY
2316
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317
(__kmp_forkjoin_frames_mode == 3 ||
2318
__kmp_forkjoin_frames_mode == 1)) {
2319
kmp_uint64 tmp_time = 0;
2320
if (__itt_get_timestamp_ptr)
2321
tmp_time = __itt_get_timestamp();
2322
// Internal fork - report frame begin
2323
master_th->th.th_frame_time = tmp_time;
2324
if (__kmp_forkjoin_frames_mode == 3)
2325
team->t.t_region_time = tmp_time;
2326
} else
2327
// only one notification scheme (either "submit" or "forking/joined", not both)
2328
#endif /* USE_ITT_NOTIFY */
2329
if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330
__kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331
// Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333
}
2334
}
2335
#endif /* USE_ITT_BUILD */
2336
2337
/* now go on and do the work */
2338
KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339
KMP_MB();
2340
KF_TRACE(10,
2341
("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342
root, team, master_th, gtid));
2343
2344
#if USE_ITT_BUILD
2345
if (__itt_stack_caller_create_ptr) {
2346
// create new stack stitching id before entering fork barrier
2347
if (!enter_teams) {
2348
KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349
team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350
} else if (parent_team->t.t_serialized) {
2351
// keep stack stitching id in the serialized parent_team;
2352
// current team will be used for parallel inside the teams;
2353
// if parent_team is active, then it already keeps stack stitching id
2354
// for the league of teams
2355
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357
}
2358
}
2359
#endif /* USE_ITT_BUILD */
2360
2361
// AC: skip __kmp_internal_fork at teams construct, let only primary
2362
// threads execute
2363
if (ap) {
2364
__kmp_internal_fork(loc, gtid, team);
2365
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366
"master_th=%p, gtid=%d\n",
2367
root, team, master_th, gtid));
2368
}
2369
2370
if (call_context == fork_context_gnu) {
2371
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372
return TRUE;
2373
}
2374
2375
/* Invoke microtask for PRIMARY thread */
2376
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377
team->t.t_id, team->t.t_pkfn));
2378
} // END of timer KMP_fork_call block
2379
2380
#if KMP_STATS_ENABLED
2381
// If beginning a teams construct, then change thread state
2382
stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383
if (!ap) {
2384
KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385
}
2386
#endif
2387
2388
if (!team->t.t_invoke(gtid)) {
2389
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390
}
2391
2392
#if KMP_STATS_ENABLED
2393
// If this was the beginning of a teams construct, then reset the thread state
2394
if (!ap) {
2395
KMP_SET_THREAD_STATE(previous_state);
2396
}
2397
#endif
2398
2399
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400
team->t.t_id, team->t.t_pkfn));
2401
KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2403
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404
#if OMPT_SUPPORT
2405
if (ompt_enabled.enabled) {
2406
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407
}
2408
#endif
2409
2410
return TRUE;
2411
}
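/* Illustrative sketch, not part of the runtime (work() is a placeholder): a
   plain region such as

     #pragma omp parallel
     work();

   is lowered by the compiler into a call to __kmpc_fork_call(), which invokes
   __kmp_fork_call above with call_context == fork_context_intel and, once the
   primary thread has finished the microtask, drives the matching
   __kmp_join_call below. */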
2412
2413
#if OMPT_SUPPORT
2414
static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415
kmp_team_t *team) {
2416
// restore state outside the region
2417
thread->th.ompt_thread_info.state =
2418
((team->t.t_serialized) ? ompt_state_work_serial
2419
: ompt_state_work_parallel);
2420
}
2421
2422
static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423
kmp_team_t *team, ompt_data_t *parallel_data,
2424
int flags, void *codeptr) {
2425
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426
if (ompt_enabled.ompt_callback_parallel_end) {
2427
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428
parallel_data, &(task_info->task_data), flags, codeptr);
2429
}
2430
2431
task_info->frame.enter_frame = ompt_data_none;
2432
__kmp_join_restore_state(thread, team);
2433
}
2434
#endif
2435
2436
void __kmp_join_call(ident_t *loc, int gtid
2437
#if OMPT_SUPPORT
2438
,
2439
enum fork_context_e fork_context
2440
#endif
2441
,
2442
int exit_teams) {
2443
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444
kmp_team_t *team;
2445
kmp_team_t *parent_team;
2446
kmp_info_t *master_th;
2447
kmp_root_t *root;
2448
int master_active;
2449
2450
KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451
2452
/* setup current data */
2453
master_th = __kmp_threads[gtid];
2454
root = master_th->th.th_root;
2455
team = master_th->th.th_team;
2456
parent_team = team->t.t_parent;
2457
2458
master_th->th.th_ident = loc;
2459
2460
#if OMPT_SUPPORT
2461
void *team_microtask = (void *)team->t.t_pkfn;
2462
// For GOMP interface with serialized parallel, need the
2463
// __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464
// and end-parallel events.
2465
if (ompt_enabled.enabled &&
2466
!(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467
master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468
}
2469
#endif
2470
2471
#if KMP_DEBUG
2472
if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473
KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474
"th_task_team = %p\n",
2475
__kmp_gtid_from_thread(master_th), team,
2476
team->t.t_task_team[master_th->th.th_task_state],
2477
master_th->th.th_task_team));
2478
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479
}
2480
#endif
2481
2482
if (team->t.t_serialized) {
2483
if (master_th->th.th_teams_microtask) {
2484
// We are in teams construct
2485
int level = team->t.t_level;
2486
int tlevel = master_th->th.th_teams_level;
2487
if (level == tlevel) {
2488
// AC: we haven't incremented it earlier at start of teams construct,
2489
// so do it here - at the end of teams construct
2490
team->t.t_level++;
2491
} else if (level == tlevel + 1) {
2492
// AC: we are exiting parallel inside teams, need to increment
2493
// serialization in order to restore it in the next call to
2494
// __kmpc_end_serialized_parallel
2495
team->t.t_serialized++;
2496
}
2497
}
2498
__kmpc_end_serialized_parallel(loc, gtid);
2499
2500
#if OMPT_SUPPORT
2501
if (ompt_enabled.enabled) {
2502
if (fork_context == fork_context_gnu) {
2503
__ompt_lw_taskteam_unlink(master_th);
2504
}
2505
__kmp_join_restore_state(master_th, parent_team);
2506
}
2507
#endif
2508
2509
return;
2510
}
2511
2512
master_active = team->t.t_master_active;
2513
2514
if (!exit_teams) {
2515
// AC: No barrier for internal teams at exit from teams construct.
2516
// But there is a barrier for the external team (league).
2517
__kmp_internal_join(loc, gtid, team);
2518
#if USE_ITT_BUILD
2519
if (__itt_stack_caller_create_ptr) {
2520
KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521
// destroy the stack stitching id after join barrier
2522
__kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523
team->t.t_stack_id = NULL;
2524
}
2525
#endif
2526
} else {
2527
master_th->th.th_task_state =
2528
0; // AC: no tasking in teams (out of any parallel)
2529
#if USE_ITT_BUILD
2530
if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532
// destroy the stack stitching id on exit from the teams construct
2533
// if parent_team is active, then the id will be destroyed later on
2534
// by master of the league of teams
2535
__kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536
parent_team->t.t_stack_id = NULL;
2537
}
2538
#endif
2539
}
2540
2541
KMP_MB();
2542
2543
#if OMPT_SUPPORT
2544
ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545
void *codeptr = team->t.ompt_team_info.master_return_address;
2546
#endif
2547
2548
#if USE_ITT_BUILD
2549
// Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550
if (team->t.t_active_level == 1 &&
2551
(!master_th->th.th_teams_microtask || /* not in teams construct */
2552
master_th->th.th_teams_size.nteams == 1)) {
2553
master_th->th.th_ident = loc;
2554
// only one notification scheme (either "submit" or "forking/joined", not
2555
// both)
2556
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557
__kmp_forkjoin_frames_mode == 3)
2558
__kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559
master_th->th.th_frame_time, 0, loc,
2560
master_th->th.th_team_nproc, 1);
2561
else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562
!__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563
__kmp_itt_region_joined(gtid);
2564
} // active_level == 1
2565
#endif /* USE_ITT_BUILD */
2566
2567
#if KMP_AFFINITY_SUPPORTED
2568
if (!exit_teams) {
2569
// Restore master thread's partition.
2570
master_th->th.th_first_place = team->t.t_first_place;
2571
master_th->th.th_last_place = team->t.t_last_place;
2572
}
2573
#endif // KMP_AFFINITY_SUPPORTED
2574
2575
if (master_th->th.th_teams_microtask && !exit_teams &&
2576
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577
team->t.t_level == master_th->th.th_teams_level + 1) {
2578
// AC: We need to leave the team structure intact at the end of a parallel
2579
// inside the teams construct, so that the same (hot) team works at the next
2580
// parallel region; only adjust the nesting levels
2581
#if OMPT_SUPPORT
2582
ompt_data_t ompt_parallel_data = ompt_data_none;
2583
if (ompt_enabled.enabled) {
2584
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585
if (ompt_enabled.ompt_callback_implicit_task) {
2586
int ompt_team_size = team->t.t_nproc;
2587
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590
}
2591
task_info->frame.exit_frame = ompt_data_none;
2592
task_info->task_data = ompt_data_none;
2593
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594
__ompt_lw_taskteam_unlink(master_th);
2595
}
2596
#endif
2597
/* Decrement our nested depth level */
2598
team->t.t_level--;
2599
team->t.t_active_level--;
2600
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601
2602
// Restore number of threads in the team if needed. This code relies on
2603
// the proper adjustment of th_teams_size.nth after the fork in
2604
// __kmp_teams_master on each teams primary thread in the case that
2605
// __kmp_reserve_threads reduced it.
2606
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607
int old_num = master_th->th.th_team_nproc;
2608
int new_num = master_th->th.th_teams_size.nth;
2609
kmp_info_t **other_threads = team->t.t_threads;
2610
team->t.t_nproc = new_num;
2611
for (int i = 0; i < old_num; ++i) {
2612
other_threads[i]->th.th_team_nproc = new_num;
2613
}
2614
// Adjust the state of the unused threads of the team
2615
for (int i = old_num; i < new_num; ++i) {
2616
// Re-initialize thread's barrier data.
2617
KMP_DEBUG_ASSERT(other_threads[i]);
2618
kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619
for (int b = 0; b < bs_last_barrier; ++b) {
2620
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622
#if USE_DEBUGGER
2623
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624
#endif
2625
}
2626
if (__kmp_tasking_mode != tskm_immediate_exec) {
2627
// Synchronize thread's task state
2628
other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629
}
2630
}
2631
}
2632
2633
#if OMPT_SUPPORT
2634
if (ompt_enabled.enabled) {
2635
__kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636
OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637
}
2638
#endif
2639
2640
return;
2641
}
2642
2643
/* do cleanup and restore the parent team */
2644
master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645
master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646
2647
master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648
2649
/* jc: The following lock has instructions with REL and ACQ semantics,
2650
separating the parallel user code called in this parallel region
2651
from the serial user code called after this function returns. */
2652
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653
2654
if (!master_th->th.th_teams_microtask ||
2655
team->t.t_level > master_th->th.th_teams_level) {
2656
/* Decrement our nested depth level */
2657
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658
}
2659
KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660
2661
#if OMPT_SUPPORT
2662
if (ompt_enabled.enabled) {
2663
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664
if (ompt_enabled.ompt_callback_implicit_task) {
2665
int flags = (team_microtask == (void *)__kmp_teams_master)
2666
? ompt_task_initial
2667
: ompt_task_implicit;
2668
int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671
OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672
}
2673
task_info->frame.exit_frame = ompt_data_none;
2674
task_info->task_data = ompt_data_none;
2675
}
2676
#endif
2677
2678
KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679
master_th, team));
2680
__kmp_pop_current_task_from_thread(master_th);
2681
2682
master_th->th.th_def_allocator = team->t.t_def_allocator;
2683
2684
#if OMPD_SUPPORT
2685
if (ompd_state & OMPD_ENABLE_BP)
2686
ompd_bp_parallel_end();
2687
#endif
2688
updateHWFPControl(team);
2689
2690
if (root->r.r_active != master_active)
2691
root->r.r_active = master_active;
2692
2693
__kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694
master_th)); // this will free worker threads
2695
2696
/* This race was fun to find. Keep the following inside the critical
2697
region; otherwise assertions may occasionally fail because the old team may be
2698
reallocated and the hierarchy would appear inconsistent. It is actually safe to
2699
run and won't cause any bugs, but it will trigger those assertion failures. It's
2700
only one deref & assign, so it might as well stay in the critical region. */
2701
master_th->th.th_team = parent_team;
2702
master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703
master_th->th.th_team_master = parent_team->t.t_threads[0];
2704
master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705
2706
/* restore serialized team, if need be */
2707
if (parent_team->t.t_serialized &&
2708
parent_team != master_th->th.th_serial_team &&
2709
parent_team != root->r.r_root_team) {
2710
__kmp_free_team(root,
2711
master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712
master_th->th.th_serial_team = parent_team;
2713
}
2714
2715
if (__kmp_tasking_mode != tskm_immediate_exec) {
2716
// Restore primary thread's task state from team structure
2717
KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718
team->t.t_primary_task_state == 1);
2719
master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720
2721
// Copy the task team from the parent team to the primary thread
2722
master_th->th.th_task_team =
2723
parent_team->t.t_task_team[master_th->th.th_task_state];
2724
KA_TRACE(20,
2725
("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727
parent_team));
2728
}
2729
2730
// TODO: GEH - cannot do this assertion because root thread not set up as
2731
// executing
2732
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733
master_th->th.th_current_task->td_flags.executing = 1;
2734
2735
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736
2737
#if KMP_AFFINITY_SUPPORTED
2738
if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739
__kmp_reset_root_init_mask(gtid);
2740
}
2741
#endif
2742
#if OMPT_SUPPORT
2743
int flags =
2744
OMPT_INVOKER(fork_context) |
2745
((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746
: ompt_parallel_team);
2747
if (ompt_enabled.enabled) {
2748
__kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749
codeptr);
2750
}
2751
#endif
2752
2753
KMP_MB();
2754
KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755
}
2756
2757
/* Check whether we should push an internal control record onto the
2758
serial team stack. If so, do it. */
2759
void __kmp_save_internal_controls(kmp_info_t *thread) {
2760
2761
if (thread->th.th_team != thread->th.th_serial_team) {
2762
return;
2763
}
2764
if (thread->th.th_team->t.t_serialized > 1) {
2765
int push = 0;
2766
2767
if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768
push = 1;
2769
} else {
2770
if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771
thread->th.th_team->t.t_serialized) {
2772
push = 1;
2773
}
2774
}
2775
if (push) { /* push a record on the serial team's stack */
2776
kmp_internal_control_t *control =
2777
(kmp_internal_control_t *)__kmp_allocate(
2778
sizeof(kmp_internal_control_t));
2779
2780
copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781
2782
control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783
2784
control->next = thread->th.th_team->t.t_control_stack_top;
2785
thread->th.th_team->t.t_control_stack_top = control;
2786
}
2787
}
2788
}
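/* Illustrative sketch, not part of the runtime: the push above matters when an
   ICV is modified inside a nested, serialized region, e.g.

     #pragma omp parallel num_threads(1)
     {
       #pragma omp parallel num_threads(1)   // t_serialized > 1 here
       omp_set_num_threads(8);               // pushes a control record
     }

   so the previous ICV values can be restored when the serialized region ends
   in __kmpc_end_serialized_parallel. */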
2789
2790
/* Changes set_nproc */
2791
void __kmp_set_num_threads(int new_nth, int gtid) {
2792
kmp_info_t *thread;
2793
kmp_root_t *root;
2794
2795
KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796
KMP_DEBUG_ASSERT(__kmp_init_serial);
2797
2798
if (new_nth < 1)
2799
new_nth = 1;
2800
else if (new_nth > __kmp_max_nth)
2801
new_nth = __kmp_max_nth;
2802
2803
KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804
thread = __kmp_threads[gtid];
2805
if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806
return; // nothing to do
2807
2808
__kmp_save_internal_controls(thread);
2809
2810
set__nproc(thread, new_nth);
2811
2812
// If this omp_set_num_threads() call will cause the hot team size to be
2813
// reduced (in the absence of a num_threads clause), then reduce it now,
2814
// rather than waiting for the next parallel region.
2815
root = thread->th.th_root;
2816
if (__kmp_init_parallel && (!root->r.r_active) &&
2817
(root->r.r_hot_team->t.t_nproc > new_nth)
2818
#if KMP_NESTED_HOT_TEAMS
2819
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820
#endif
2821
) {
2822
kmp_team_t *hot_team = root->r.r_hot_team;
2823
int f;
2824
2825
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826
2827
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828
__kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829
}
2830
// Release the extra threads we don't need any more.
2831
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833
if (__kmp_tasking_mode != tskm_immediate_exec) {
2834
// When decreasing team size, threads no longer in the team should unref
2835
// task team.
2836
hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837
}
2838
__kmp_free_thread(hot_team->t.t_threads[f]);
2839
hot_team->t.t_threads[f] = NULL;
2840
}
2841
hot_team->t.t_nproc = new_nth;
2842
#if KMP_NESTED_HOT_TEAMS
2843
if (thread->th.th_hot_teams) {
2844
KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845
thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846
}
2847
#endif
2848
2849
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850
hot_team->t.b->update_num_threads(new_nth);
2851
__kmp_add_threads_to_team(hot_team, new_nth);
2852
}
2853
2854
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855
2856
// Update the t_nproc field in the threads that are still active.
2857
for (f = 0; f < new_nth; f++) {
2858
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859
hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860
}
2861
// Special flag to mark an omp_set_num_threads() call
2862
hot_team->t.t_size_changed = -1;
2863
}
2864
}
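/* Illustrative sketch, not part of the runtime (work() is a placeholder): this
   is the path behind omp_set_num_threads(). For example

     omp_set_num_threads(2);      // nproc ICV becomes 2; extra hot-team
     #pragma omp parallel         // threads are released above right away
     work();

   so the next parallel region starts out with the reduced team size instead of
   trimming the hot team lazily at fork time. */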
2865
2866
/* Changes max_active_levels */
2867
void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868
kmp_info_t *thread;
2869
2870
KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871
"%d = (%d)\n",
2872
gtid, max_active_levels));
2873
KMP_DEBUG_ASSERT(__kmp_init_serial);
2874
2875
// validate max_active_levels
2876
if (max_active_levels < 0) {
2877
KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878
// We ignore this call if the user has specified a negative value.
2879
// The current setting won't be changed. The last valid setting will be
2880
// used. A warning will be issued (if warnings are allowed as controlled by
2881
// the KMP_WARNINGS env var).
2882
KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883
"max_active_levels for thread %d = (%d)\n",
2884
gtid, max_active_levels));
2885
return;
2886
}
2887
if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888
// it's OK, the max_active_levels is within the valid range: [ 0;
2889
// KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890
// We allow a zero value. (implementation defined behavior)
2891
} else {
2892
KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893
KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895
// Current upper limit is MAX_INT. (implementation defined behavior)
2896
// If the input exceeds the upper limit, we correct the input to be the
2897
// upper limit. (implementation defined behavior)
2898
// In practice, control should never reach here while the limit is MAX_INT.
2899
}
2900
KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901
"max_active_levels for thread %d = (%d)\n",
2902
gtid, max_active_levels));
2903
2904
thread = __kmp_threads[gtid];
2905
2906
__kmp_save_internal_controls(thread);
2907
2908
set__max_active_levels(thread, max_active_levels);
2909
}
2910
2911
/* Gets max_active_levels */
2912
int __kmp_get_max_active_levels(int gtid) {
2913
kmp_info_t *thread;
2914
2915
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916
KMP_DEBUG_ASSERT(__kmp_init_serial);
2917
2918
thread = __kmp_threads[gtid];
2919
KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920
KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921
"curtask_maxaclevel=%d\n",
2922
gtid, thread->th.th_current_task,
2923
thread->th.th_current_task->td_icvs.max_active_levels));
2924
return thread->th.th_current_task->td_icvs.max_active_levels;
2925
}
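/* Illustrative sketch, not part of the runtime (work() is a placeholder):
   these two routines back omp_set_max_active_levels() and
   omp_get_max_active_levels():

     omp_set_max_active_levels(2);
     #pragma omp parallel          // active level 1: forks normally
     #pragma omp parallel          // active level 2: forks normally
     #pragma omp parallel          // would be level 3: serialized
     work();

   This is the same max_active_levels ICV consulted by __kmp_fork_call when it
   decides whether to serialize a nested region. */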
2926
2927
// nteams-var per-device ICV
2928
void __kmp_set_num_teams(int num_teams) {
2929
if (num_teams > 0)
2930
__kmp_nteams = num_teams;
2931
}
2932
int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933
// teams-thread-limit-var per-device ICV
2934
void __kmp_set_teams_thread_limit(int limit) {
2935
if (limit > 0)
2936
__kmp_teams_thread_limit = limit;
2937
}
2938
int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
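/* Illustrative sketch, not part of the runtime (work() is a placeholder):
   these per-device ICVs back omp_set_num_teams() and
   omp_set_teams_thread_limit():

     omp_set_num_teams(4);
     omp_set_teams_thread_limit(8);
     #pragma omp teams             // up to 4 teams, at most 8 threads each
     work();

   Non-positive arguments are ignored above, so the previous values remain in
   effect. */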
2939
2940
KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941
KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942
2943
/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2944
void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945
kmp_info_t *thread;
2946
kmp_sched_t orig_kind;
2947
// kmp_team_t *team;
2948
2949
KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950
gtid, (int)kind, chunk));
2951
KMP_DEBUG_ASSERT(__kmp_init_serial);
2952
2953
// Check if the kind parameter is valid, correct if needed.
2954
// Valid parameters should fit in one of two intervals - standard or extended:
2955
// <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956
// 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2957
orig_kind = kind;
2958
kind = __kmp_sched_without_mods(kind);
2959
2960
if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961
(kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962
// TODO: Hint needs attention in case we change the default schedule.
2963
__kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964
KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965
__kmp_msg_null);
2966
kind = kmp_sched_default;
2967
chunk = 0; // ignore chunk value in case of bad kind
2968
}
2969
2970
thread = __kmp_threads[gtid];
2971
2972
__kmp_save_internal_controls(thread);
2973
2974
if (kind < kmp_sched_upper_std) {
2975
if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976
// differentiate static chunked vs. unchunked: chunk should be invalid to
2977
// indicate unchunked schedule (which is the default)
2978
thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979
} else {
2980
thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981
__kmp_sch_map[kind - kmp_sched_lower - 1];
2982
}
2983
} else {
2984
// __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985
// kmp_sched_lower - 2 ];
2986
thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987
__kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988
kmp_sched_lower - 2];
2989
}
2990
__kmp_sched_apply_mods_intkind(
2991
orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992
if (kind == kmp_sched_auto || chunk < 1) {
2993
// ignore parameter chunk for schedule auto
2994
thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995
} else {
2996
thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997
}
2998
}
2999
3000
/* Gets def_sched_var ICV values */
3001
void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002
kmp_info_t *thread;
3003
enum sched_type th_type;
3004
3005
KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006
KMP_DEBUG_ASSERT(__kmp_init_serial);
3007
3008
thread = __kmp_threads[gtid];
3009
3010
th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011
switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012
case kmp_sch_static:
3013
case kmp_sch_static_greedy:
3014
case kmp_sch_static_balanced:
3015
*kind = kmp_sched_static;
3016
__kmp_sched_apply_mods_stdkind(kind, th_type);
3017
*chunk = 0; // chunk was not set; indicate this with a zero value
3018
return;
3019
case kmp_sch_static_chunked:
3020
*kind = kmp_sched_static;
3021
break;
3022
case kmp_sch_dynamic_chunked:
3023
*kind = kmp_sched_dynamic;
3024
break;
3025
case kmp_sch_guided_chunked:
3026
case kmp_sch_guided_iterative_chunked:
3027
case kmp_sch_guided_analytical_chunked:
3028
*kind = kmp_sched_guided;
3029
break;
3030
case kmp_sch_auto:
3031
*kind = kmp_sched_auto;
3032
break;
3033
case kmp_sch_trapezoidal:
3034
*kind = kmp_sched_trapezoidal;
3035
break;
3036
#if KMP_STATIC_STEAL_ENABLED
3037
case kmp_sch_static_steal:
3038
*kind = kmp_sched_static_steal;
3039
break;
3040
#endif
3041
default:
3042
KMP_FATAL(UnknownSchedulingType, th_type);
3043
}
3044
3045
__kmp_sched_apply_mods_stdkind(kind, th_type);
3046
*chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047
}
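/* Illustrative sketch, not part of the runtime (n and work() are
   placeholders): the pair above implements omp_set_schedule() and
   omp_get_schedule(), e.g.

     omp_set_schedule(omp_sched_dynamic, 4);     // run-time schedule ICV
     #pragma omp parallel for schedule(runtime)
     for (int i = 0; i < n; ++i)
       work(i);

   A chunk below 1 (or kind auto) falls back to KMP_DEFAULT_CHUNK on the set
   side, and the get side reports chunk 0 for an unchunked static schedule. */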
3048
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 0;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level;
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  if (ii == level)
    return __kmp_tid_from_gtid(gtid);

  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  return (dd > 1) ? (0) : (team->t.t_master_tid);
}

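// Companion to __kmp_get_ancestor_thread_num(): returns the number of threads
// in this thread's ancestor team at nesting depth `level` (1 for level 0, -1
// for invalid levels), using the same parent-team walk.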
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level
  if (level == 0)
    return 1;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level;
  if (level > ii)
    return -1;

  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }

  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}

kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently. So one can get the updated schedule here.

  kmp_r_sched_t r_sched;

  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep original value, so that user can set
  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
  // different roots (even in OMP 2.5)
  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
  if (s == kmp_sch_static) {
    // replace STATIC with more detailed schedule (balanced or greedy)
    r_sched.r_sched_type = __kmp_static;
  } else if (s == kmp_sch_guided_chunked) {
    // replace GUIDED with more detailed schedule (iterative or analytical)
    r_sched.r_sched_type = __kmp_guided;
  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
    r_sched.r_sched_type = __kmp_sched;
  }
  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);

  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was not ever set)
    r_sched.chunk = KMP_DEFAULT_CHUNK;
  } else {
    r_sched.chunk = __kmp_chunk;
  }

  return r_sched;
}

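// Growth policy for the argv storage below: argc values up to
// KMP_INLINE_ARGV_ENTRIES reuse the space embedded in the team structure;
// larger requests are heap-allocated with room for at least
// KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2 * argc once argc exceeds half of
// that minimum, so repeated small growths do not force a reallocation each
// time.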
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {

  KMP_DEBUG_ASSERT(team);
  if (!realloc || argc > team->t.t_max_argc) {

    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
                   "current entries=%d\n",
                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
    /* if previously allocated heap space for args, free them */
    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
      __kmp_free((void *)team->t.t_argv);

    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
      /* use unused space in the cache line for arguments */
      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv = &team->t.t_inline_argv[0];
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
      }
    } else {
      /* allocate space for arguments in the heap */
      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
                               : 2 * argc;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv =
          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
      }
    }
  }
}

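// The next three helpers manage the per-team arrays (t_threads, t_disp_buffer,
// t_dispatch, t_implicit_task_taskdata), all sized by the team's max_nth;
// single-threaded teams only get two dispatch buffers instead of
// __kmp_dispatch_num_buffers.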
static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
  int i;
  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
  team->t.t_threads =
      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
      sizeof(dispatch_shared_info_t) * num_disp_buff);
  team->t.t_dispatch =
      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
  team->t.t_implicit_task_taskdata =
      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
  team->t.t_max_nproc = max_nth;

  /* setup dispatch buffers */
  for (i = 0; i < num_disp_buff; ++i) {
    team->t.t_disp_buffer[i].buffer_index = i;
    team->t.t_disp_buffer[i].doacross_buf_idx = i;
  }
}

static void __kmp_free_team_arrays(kmp_team_t *team) {
  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
  int i;
  for (i = 0; i < team->t.t_max_nproc; ++i) {
    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
      team->t.t_dispatch[i].th_disp_buffer = NULL;
    }
  }
#if KMP_USE_HIER_SCHED
  __kmp_dispatch_free_hierarchies(team);
#endif
  __kmp_free(team->t.t_threads);
  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  team->t.t_threads = NULL;
  team->t.t_disp_buffer = NULL;
  team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = 0;
}

static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
  kmp_info_t **oldThreads = team->t.t_threads;

  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  __kmp_allocate_team_arrays(team, max_nth);

  KMP_MEMCPY(team->t.t_threads, oldThreads,
             team->t.t_nproc * sizeof(kmp_info_t *));

  __kmp_free(oldThreads);
}

static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
      // intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
      // on task. This is used in the case of target thread_limit
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}

static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}

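// __kmp_initialize_root() builds the two teams every root owns: a serialized
// root team of size 1 for code outside parallel regions, and a hot team
// (initially size 1, capacity __kmp_dflt_team_nth_ub * 2) that is kept around
// and reused by non-nested parallel regions instead of being re-allocated for
// each region.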
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin);

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;
#if KMP_AFFINITY_SUPPORTED
  root->r.r_affinity_assigned = FALSE;
#endif

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  root_team->t.t_nested_nth = &__kmp_nested_nth;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
  hot_team->t.t_nested_nth = &__kmp_nested_nth;
}

#ifdef KMP_DEBUG

typedef struct kmp_team_list_item {
  kmp_team_p const *entry;
  struct kmp_team_list_item *next;
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;

static void __kmp_print_structure_team_accum( // Add team to list of teams.
    kmp_team_list_t list, // List of teams.
    kmp_team_p const *team // Team to add.
) {

  // List must terminate with item where both entry and next are NULL.
  // Team is added to the list only once.
  // List is sorted in ascending order by team id.
  // Team id is *not* a key.

  kmp_team_list_t l;

  KMP_DEBUG_ASSERT(list != NULL);
  if (team == NULL) {
    return;
  }

  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);

  // Search list for the team.
  l = list;
  while (l->next != NULL && l->entry != team) {
    l = l->next;
  }
  if (l->next != NULL) {
    return; // Team has been added before, exit.
  }

  // Team is not found. Search list again for insertion point.
  l = list;
  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
    l = l->next;
  }

  // Insert team.
  {
    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
        sizeof(kmp_team_list_item_t));
    *item = *l;
    l->entry = team;
    l->next = item;
  }
}

static void __kmp_print_structure_team(char const *title, kmp_team_p const *team

) {
  __kmp_printf("%s", title);
  if (team != NULL) {
    __kmp_printf("%2x %p\n", team->t.t_id, team);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

static void __kmp_print_structure_thread(char const *title,
                                         kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread != NULL) {
    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams.
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ",
                                   thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Primary: ",
                                     thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ",
                                     root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}

#endif

//---------------------------------------------------------------------------
// Stuff for per-thread fast random number generator
// Table of primes
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};

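// Each thread owns a 32-bit multiplicative linear congruential generator:
// the state advances as th_x = th_x * th_a + 1 (mod 2^32) and only the high
// 16 bits are handed out, which avoids the weak low-order bits of an LCG.
// th_a is picked per thread from __kmp_primes above, so threads with
// different tids generally use different multipliers. A rough standalone
// sketch of the same recurrence:
//   unsigned x = (seed + 1) * a + 1;                       // __kmp_init_random
//   unsigned short next() {                                // __kmp_get_random
//     unsigned short r = (unsigned short)(x >> 16);
//     x = x * a + 1;
//     return r;
//   }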
//---------------------------------------------------------------------------
// __kmp_get_random: Get a random number using a linear congruential method.
unsigned short __kmp_get_random(kmp_info_t *thread) {
  unsigned x = thread->th.th_x;
  unsigned short r = (unsigned short)(x >> 16);

  thread->th.th_x = x * thread->th.th_a + 1;

  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
                thread->th.th_info.ds.ds_tid, r));

  return r;
}
//--------------------------------------------------------
// __kmp_init_random: Initialize a random number generator
void __kmp_init_random(kmp_info_t *thread) {
  unsigned seed = thread->th.th_info.ds.ds_tid;

  thread->th.th_a =
      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
  KA_TRACE(30,
           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
}

#if KMP_OS_WINDOWS
/* reclaim array entries for root threads that are already dead, returns number
 * reclaimed */
static int __kmp_reclaim_dead_roots(void) {
  int i, r = 0;

  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i) &&
        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
        !__kmp_root[i]
             ->r.r_active) { // AC: reclaim only roots died in non-active state
      r += __kmp_unregister_root_other_thread(i);
    }
  }
  return r;
}
#endif

/* This function attempts to create free entries in __kmp_threads and
   __kmp_root, and returns the number of free entries generated.

   For Windows* OS static library, the first mechanism used is to reclaim array
   entries for root threads that are already dead.

   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
   the threadprivate cache array has been created. Synchronization with
   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.

   After any dead root reclamation, if the clipping value allows array expansion
   to result in the generation of a total of nNeed free slots, the function does
   that expansion. If not, nothing is done beyond the possible initial root
   thread reclamation.

   If any argument is negative, the behavior is undefined. */
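// Capacity growth sketch (per the loop below): starting from the current
// __kmp_threads_capacity, the capacity doubles until it covers
// capacity + nNeed, clipping at __kmp_sys_max_nth; e.g. capacity 64 with
// nNeed 100 grows 64 -> 128 -> 256. Both __kmp_threads and __kmp_root live in
// a single allocation, and the old __kmp_threads array is kept on
// __kmp_old_threads_list so in-flight readers of the old pointer stay valid
// until library shutdown.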
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
  //    __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  //    roots. This may cause a smaller # of threads to be allocated at
  //    subsequent parallel regions, but the worker threads hang around (and
  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Put old __kmp_threads array on a list. Any ongoing references to the old
  // list will be valid. This list is cleaned up at library shutdown.
  kmp_old_threads_list_t *node =
      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
  node->threads = __kmp_threads;
  node->next = __kmp_old_threads_list;
  __kmp_old_threads_list = node;

  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}

/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. The following code works around this bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may be not a real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }

  /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  // When hidden helper task is enabled, __kmp_threads is organized as follows:
  // 0: initial thread, also a regular OpenMP thread.
  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
  // regular OpenMP threads.
  if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for hidden helper thread. Slots for hidden
    // helper threads start from 1 to __kmp_hidden_helper_threads_num.
    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
                   gtid <= __kmp_hidden_helper_threads_num;
         gtid++)
      ;
    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
                 "hidden helper thread: T#%d\n",
                 gtid));
  } else {
    /* find an available thread slot */
    // Don't reassign the zero slot since we need that to only be used by
    // initial thread. Slots for hidden helper threads should also be skipped.
    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
      gtid = 0;
    } else {
      for (gtid = __kmp_hidden_helper_threads_num + 1;
           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
        ;
    }
    KA_TRACE(
        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
    KMP_ASSERT(gtid < __kmp_threads_capacity);
  }

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  __kmp_stats_thread_ptr->startLife();
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the primary thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {

    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_begin();
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

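// __kmp_free_hot_teams() recursively releases the nested hot teams hanging off
// a thread, returning the number of worker slots freed (the primary thread of
// each team is not counted); it is only compiled in when nested hot teams are
// enabled.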
#if KMP_NESTED_HOT_TEAMS
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clears its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_thread_end();
#endif

#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}

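// Tears down the calling root thread at normal close: any outstanding proxy or
// hidden-helper tasks are drained first, then the root's teams are freed and
// the uber thread itself is reaped via __kmp_reset_root().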
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
                            task_team->tt.tt_hidden_helper_task_encountered)) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}

#if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must be already held
   Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result. */
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];
  int r;

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  r = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return r;
}
#endif

#if KMP_DEBUG
void __kmp_task_info() {

  kmp_int32 gtid = __kmp_entry_gtid();
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *steam = this_thr->th.th_serial_team;
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
      team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG

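// __kmp_initialize_info() binds a kmp_info_t to slot `tid` of `team`: it
// caches team-wide values (nproc, primary thread, serialization level),
// creates the implicit task, wires the thread into the primary thread's
// contention group, and sets up the per-thread dispatch buffers used by
// dynamically scheduled worksharing loops.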
/* TODO optimize with one big memclr, take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible */
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as primary thread's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}

/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool unless allocating thread is
   * the main hidden helper thread. The hidden helper team should always
   * allocate new OS threads. */
  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;

    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      // Make sure pool thread has transitioned to waiting on own thread struct
      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
      // Thread activated in __kmp_allocate_team when increasing team size
    }

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, we'll fork a new one */
  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread. We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the primary thread is
      // in DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and primary thread has no means to inform the monitor that
      // the library has gone, because all the memory which the monitor can
      // access is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  new_thr->th.th_nt_strict = false;
  new_thr->th.th_nt_loc = NULL;
  new_thr->th.th_nt_sev = severity_fatal;
  new_thr->th.th_nt_msg = NULL;

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's primary thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
  new_thr->th.th_sleep_loc_type = flag_unset;

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  new_thr->th.th_set_nested_nth = NULL;
  new_thr->th.th_set_nested_nth_sz = 0;

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679
if (__kmp_adjust_gtid_mode) {
4680
if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681
if (TCR_4(__kmp_gtid_mode) != 2) {
4682
TCW_4(__kmp_gtid_mode, 2);
4683
}
4684
} else {
4685
if (TCR_4(__kmp_gtid_mode) != 1) {
4686
TCW_4(__kmp_gtid_mode, 1);
4687
}
4688
}
4689
}
4690
4691
#ifdef KMP_ADJUST_BLOCKTIME
4692
/* Adjust blocktime back to zero if necessary */
4693
/* Middle initialization might not have occurred yet */
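/* When the process is oversubscribed (__kmp_nth > __kmp_avail_proc),
spin-waiting only wastes cycles, so force the effective blocktime to zero
and let idle threads go to sleep right away. */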
4694
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695
if (__kmp_nth > __kmp_avail_proc) {
4696
__kmp_zero_bt = TRUE;
4697
}
4698
}
4699
#endif /* KMP_ADJUST_BLOCKTIME */
4700
4701
#if KMP_AFFINITY_SUPPORTED
4702
// Set the affinity and topology information for new thread
4703
__kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704
#endif
4705
4706
/* actually fork it and create the new worker thread */
4707
KF_TRACE(
4708
10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709
__kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710
KF_TRACE(10,
4711
("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712
4713
KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714
new_gtid));
4715
KMP_MB();
4716
return new_thr;
4717
}
4718
4719
/* Reinitialize team for reuse.
The hot team code calls this at every fork barrier, so the EPCC barrier
tests are extremely sensitive to changes in it, especially writes to the
team struct, which cause a cache invalidation in all threads.
IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724
static void __kmp_reinitialize_team(kmp_team_t *team,
4725
kmp_internal_control_t *new_icvs,
4726
ident_t *loc) {
4727
KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728
team->t.t_threads[0], team));
4729
KMP_DEBUG_ASSERT(team && new_icvs);
4730
KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731
KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732
4733
KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734
// Copy ICVs to the primary thread's implicit taskdata
4735
__kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736
copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737
4738
KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739
team->t.t_threads[0], team));
4740
}
4741
4742
/* Initialize the team data structure.
4743
This assumes the t_threads and t_max_nproc are already set.
4744
Also, we don't touch the arguments */
4745
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746
kmp_internal_control_t *new_icvs,
4747
ident_t *loc) {
4748
KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749
4750
/* verify */
4751
KMP_DEBUG_ASSERT(team);
4752
KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753
KMP_DEBUG_ASSERT(team->t.t_threads);
4754
KMP_MB();
4755
4756
team->t.t_master_tid = 0; /* not needed */
4757
/* team->t.t_master_bar; not needed */
4758
team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759
team->t.t_nproc = new_nproc;
4760
4761
/* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4762
team->t.t_next_pool = NULL;
4763
/* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764
* up hot team */
4765
4766
TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767
team->t.t_invoke = NULL; /* not needed */
4768
4769
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
4770
team->t.t_sched.sched = new_icvs->sched.sched;
4771
4772
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773
team->t.t_fp_control_saved = FALSE; /* not needed */
4774
team->t.t_x87_fpu_control_word = 0; /* not needed */
4775
team->t.t_mxcsr = 0; /* not needed */
4776
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777
4778
team->t.t_construct = 0;
4779
4780
team->t.t_ordered.dt.t_value = 0;
4781
team->t.t_master_active = FALSE;
4782
4783
#ifdef KMP_DEBUG
4784
team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785
#endif
4786
#if KMP_OS_WINDOWS
4787
team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788
#endif
4789
4790
team->t.t_control_stack_top = NULL;
4791
4792
__kmp_reinitialize_team(team, new_icvs, loc);
4793
4794
KMP_MB();
4795
KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796
}
4797
4798
#if KMP_AFFINITY_SUPPORTED
4799
static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800
int first, int last, int newp) {
4801
th->th.th_first_place = first;
4802
th->th.th_last_place = last;
4803
th->th.th_new_place = newp;
4804
if (newp != th->th.th_current_place) {
4805
if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806
team->t.t_display_affinity = 1;
4807
// Copy topology information associated with the new place
4808
th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809
th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810
}
4811
}
4812
4813
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The primary thread's partition should already include its current binding.
4817
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818
// Do not partition places for the hidden helper team
4819
if (KMP_HIDDEN_HELPER_TEAM(team))
4820
return;
4821
// Copy the primary thread's place partition to the team struct
4822
kmp_info_t *master_th = team->t.t_threads[0];
4823
KMP_DEBUG_ASSERT(master_th != NULL);
4824
kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825
int first_place = master_th->th.th_first_place;
4826
int last_place = master_th->th.th_last_place;
4827
int masters_place = master_th->th.th_current_place;
4828
int num_masks = __kmp_affinity.num_masks;
4829
team->t.t_first_place = first_place;
4830
team->t.t_last_place = last_place;
4831
4832
KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833
"bound to place %d partition = [%d,%d]\n",
4834
proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835
team->t.t_id, masters_place, first_place, last_place));
4836
4837
switch (proc_bind) {
4838
4839
case proc_bind_default:
4840
// Serial teams might have the proc_bind policy set to proc_bind_default.
4841
// Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842
KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843
break;
4844
4845
case proc_bind_primary: {
4846
int f;
4847
int n_th = team->t.t_nproc;
4848
for (f = 1; f < n_th; f++) {
4849
kmp_info_t *th = team->t.t_threads[f];
4850
KMP_DEBUG_ASSERT(th != NULL);
4851
__kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852
4853
KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854
"partition = [%d,%d]\n",
4855
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856
f, masters_place, first_place, last_place));
4857
}
4858
} break;
4859
4860
case proc_bind_close: {
4861
int f;
4862
int n_th = team->t.t_nproc;
4863
int n_places;
4864
if (first_place <= last_place) {
4865
n_places = last_place - first_place + 1;
4866
} else {
4867
n_places = num_masks - first_place + last_place + 1;
4868
}
4869
if (n_th <= n_places) {
4870
int place = masters_place;
4871
for (f = 1; f < n_th; f++) {
4872
kmp_info_t *th = team->t.t_threads[f];
4873
KMP_DEBUG_ASSERT(th != NULL);
4874
4875
if (place == last_place) {
4876
place = first_place;
4877
} else if (place == (num_masks - 1)) {
4878
place = 0;
4879
} else {
4880
place++;
4881
}
4882
__kmp_set_thread_place(team, th, first_place, last_place, place);
4883
4884
KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885
"partition = [%d,%d]\n",
4886
__kmp_gtid_from_thread(team->t.t_threads[f]),
4887
team->t.t_id, f, place, first_place, last_place));
4888
}
4889
} else {
4890
int S, rem, gap, s_count;
4891
S = n_th / n_places;
4892
s_count = 0;
4893
rem = n_th - (S * n_places);
4894
gap = rem > 0 ? n_places / rem : n_places;
4895
int place = masters_place;
4896
int gap_ct = gap;
4897
for (f = 0; f < n_th; f++) {
4898
kmp_info_t *th = team->t.t_threads[f];
4899
KMP_DEBUG_ASSERT(th != NULL);
4900
4901
__kmp_set_thread_place(team, th, first_place, last_place, place);
4902
s_count++;
4903
4904
if ((s_count == S) && rem && (gap_ct == gap)) {
4905
// do nothing, add an extra thread to place on next iteration
4906
} else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907
// we added an extra thread to this place; move to next place
4908
if (place == last_place) {
4909
place = first_place;
4910
} else if (place == (num_masks - 1)) {
4911
place = 0;
4912
} else {
4913
place++;
4914
}
4915
s_count = 0;
4916
gap_ct = 1;
4917
rem--;
4918
} else if (s_count == S) { // place full; don't add extra
4919
if (place == last_place) {
4920
place = first_place;
4921
} else if (place == (num_masks - 1)) {
4922
place = 0;
4923
} else {
4924
place++;
4925
}
4926
gap_ct++;
4927
s_count = 0;
4928
}
4929
4930
KA_TRACE(100,
4931
("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932
"partition = [%d,%d]\n",
4933
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934
th->th.th_new_place, first_place, last_place));
4935
}
4936
KMP_DEBUG_ASSERT(place == masters_place);
4937
}
4938
} break;
4939
4940
case proc_bind_spread: {
4941
int f;
4942
int n_th = team->t.t_nproc;
4943
int n_places;
4944
int thidx;
4945
if (first_place <= last_place) {
4946
n_places = last_place - first_place + 1;
4947
} else {
4948
n_places = num_masks - first_place + last_place + 1;
4949
}
4950
if (n_th <= n_places) {
4951
int place = -1;
4952
4953
if (n_places != num_masks) {
4954
int S = n_places / n_th;
4955
int s_count, rem, gap, gap_ct;
4956
4957
place = masters_place;
4958
rem = n_places - n_th * S;
4959
gap = rem ? n_th / rem : 1;
4960
gap_ct = gap;
4961
thidx = n_th;
4962
if (update_master_only == 1)
4963
thidx = 1;
4964
for (f = 0; f < thidx; f++) {
4965
kmp_info_t *th = team->t.t_threads[f];
4966
KMP_DEBUG_ASSERT(th != NULL);
4967
4968
int fplace = place, nplace = place;
4969
s_count = 1;
4970
while (s_count < S) {
4971
if (place == last_place) {
4972
place = first_place;
4973
} else if (place == (num_masks - 1)) {
4974
place = 0;
4975
} else {
4976
place++;
4977
}
4978
s_count++;
4979
}
4980
if (rem && (gap_ct == gap)) {
4981
if (place == last_place) {
4982
place = first_place;
4983
} else if (place == (num_masks - 1)) {
4984
place = 0;
4985
} else {
4986
place++;
4987
}
4988
rem--;
4989
gap_ct = 0;
4990
}
4991
__kmp_set_thread_place(team, th, fplace, place, nplace);
4992
gap_ct++;
4993
4994
if (place == last_place) {
4995
place = first_place;
4996
} else if (place == (num_masks - 1)) {
4997
place = 0;
4998
} else {
4999
place++;
5000
}
5001
5002
KA_TRACE(100,
5003
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004
"partition = [%d,%d], num_masks: %u\n",
5005
__kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006
f, th->th.th_new_place, th->th.th_first_place,
5007
th->th.th_last_place, num_masks));
5008
}
5009
} else {
5010
/* Having a uniform space of available computation places, we can create
n_th partitions of roughly n_places / n_th places each and put each
thread into the first place of its partition. */
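// For example, with n_places == 8 and n_th == 3 and the primary thread in
// place 0, spacing == (8 + 1) / 3 == 3.0, so the partitions computed below
// are [0,2], [3,5] and [6,7] (the last one is clamped to n_places - 1).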
5013
double current = static_cast<double>(masters_place);
5014
double spacing =
5015
(static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5016
int first, last;
5017
kmp_info_t *th;
5018
5019
thidx = n_th + 1;
5020
if (update_master_only == 1)
5021
thidx = 1;
5022
for (f = 0; f < thidx; f++) {
5023
first = static_cast<int>(current);
5024
last = static_cast<int>(current + spacing) - 1;
5025
KMP_DEBUG_ASSERT(last >= first);
5026
if (first >= n_places) {
5027
if (masters_place) {
5028
first -= n_places;
5029
last -= n_places;
5030
if (first == (masters_place + 1)) {
5031
KMP_DEBUG_ASSERT(f == n_th);
5032
first--;
5033
}
5034
if (last == masters_place) {
5035
KMP_DEBUG_ASSERT(f == (n_th - 1));
5036
last--;
5037
}
5038
} else {
5039
KMP_DEBUG_ASSERT(f == n_th);
5040
first = 0;
5041
last = 0;
5042
}
5043
}
5044
if (last >= n_places) {
5045
last = (n_places - 1);
5046
}
5047
place = first;
5048
current += spacing;
5049
if (f < n_th) {
5050
KMP_DEBUG_ASSERT(0 <= first);
5051
KMP_DEBUG_ASSERT(n_places > first);
5052
KMP_DEBUG_ASSERT(0 <= last);
5053
KMP_DEBUG_ASSERT(n_places > last);
5054
KMP_DEBUG_ASSERT(last_place >= first_place);
5055
th = team->t.t_threads[f];
5056
KMP_DEBUG_ASSERT(th);
5057
__kmp_set_thread_place(team, th, first, last, place);
5058
KA_TRACE(100,
5059
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060
"partition = [%d,%d], spacing = %.4f\n",
5061
__kmp_gtid_from_thread(team->t.t_threads[f]),
5062
team->t.t_id, f, th->th.th_new_place,
5063
th->th.th_first_place, th->th.th_last_place, spacing));
5064
}
5065
}
5066
}
5067
KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068
} else {
5069
int S, rem, gap, s_count;
5070
S = n_th / n_places;
5071
s_count = 0;
5072
rem = n_th - (S * n_places);
5073
gap = rem > 0 ? n_places / rem : n_places;
5074
int place = masters_place;
5075
int gap_ct = gap;
5076
thidx = n_th;
5077
if (update_master_only == 1)
5078
thidx = 1;
5079
for (f = 0; f < thidx; f++) {
5080
kmp_info_t *th = team->t.t_threads[f];
5081
KMP_DEBUG_ASSERT(th != NULL);
5082
5083
__kmp_set_thread_place(team, th, place, place, place);
5084
s_count++;
5085
5086
if ((s_count == S) && rem && (gap_ct == gap)) {
5087
// do nothing, add an extra thread to place on next iteration
5088
} else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089
// we added an extra thread to this place; move on to next place
5090
if (place == last_place) {
5091
place = first_place;
5092
} else if (place == (num_masks - 1)) {
5093
place = 0;
5094
} else {
5095
place++;
5096
}
5097
s_count = 0;
5098
gap_ct = 1;
5099
rem--;
5100
} else if (s_count == S) { // place is full; don't add extra thread
5101
if (place == last_place) {
5102
place = first_place;
5103
} else if (place == (num_masks - 1)) {
5104
place = 0;
5105
} else {
5106
place++;
5107
}
5108
gap_ct++;
5109
s_count = 0;
5110
}
5111
5112
KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113
"partition = [%d,%d]\n",
5114
__kmp_gtid_from_thread(team->t.t_threads[f]),
5115
team->t.t_id, f, th->th.th_new_place,
5116
th->th.th_first_place, th->th.th_last_place));
5117
}
5118
KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119
}
5120
} break;
5121
5122
default:
5123
break;
5124
}
5125
5126
KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127
}
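// A minimal illustrative sketch, not part of the runtime and kept under
// "#if 0" so it does not affect the build: the wrap-around "advance one
// place" step that the proc_bind_close and proc_bind_spread cases above
// repeat inline. The helper name is hypothetical.
#if 0
static int __kmp_next_place_sketch(int place, int first_place, int last_place,
                                   int num_masks) {
  // Advance to the next place, wrapping at the end of the partition and at
  // the end of the global place list, exactly as the inline logic above.
  if (place == last_place)
    return first_place;
  if (place == num_masks - 1)
    return 0;
  return place + 1;
}
#endif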
5128
5129
#endif // KMP_AFFINITY_SUPPORTED
5130
5131
/* allocate a new team data structure to use. take one off of the free pool if
5132
available */
5133
kmp_team_t *
5134
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135
#if OMPT_SUPPORT
5136
ompt_data_t ompt_parallel_data,
5137
#endif
5138
kmp_proc_bind_t new_proc_bind,
5139
kmp_internal_control_t *new_icvs,
5140
int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142
int f;
5143
kmp_team_t *team;
5144
int use_hot_team = !root->r.r_active;
5145
int level = 0;
5146
int do_place_partition = 1;
5147
5148
KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149
KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150
KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151
KMP_MB();
5152
5153
#if KMP_NESTED_HOT_TEAMS
5154
kmp_hot_team_ptr_t *hot_teams;
5155
if (master) {
5156
team = master->th.th_team;
5157
level = team->t.t_active_level;
5158
if (master->th.th_teams_microtask) { // in teams construct?
5159
if (master->th.th_teams_size.nteams > 1 &&
5160
( // #teams > 1
5161
team->t.t_pkfn ==
5162
(microtask_t)__kmp_teams_master || // inner fork of the teams
5163
master->th.th_teams_level <
5164
team->t.t_level)) { // or nested parallel inside the teams
5165
++level; // don't increment if #teams==1 or for the outer fork of the
// teams; increment otherwise
5167
}
5168
// Do not perform the place partition if inner fork of the teams
5169
// Wait until nested parallel region encountered inside teams construct
5170
if ((master->th.th_teams_size.nteams == 1 &&
5171
master->th.th_teams_level >= team->t.t_level) ||
5172
(team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173
do_place_partition = 0;
5174
}
5175
hot_teams = master->th.th_hot_teams;
5176
if (level < __kmp_hot_teams_max_level && hot_teams &&
5177
hot_teams[level].hot_team) {
5178
// hot team has already been allocated for given level
5179
use_hot_team = 1;
5180
} else {
5181
use_hot_team = 0;
5182
}
5183
} else {
5184
// check we won't access uninitialized hot_teams, just in case
5185
KMP_DEBUG_ASSERT(new_nproc == 1);
5186
}
5187
#endif
5188
// Optimization to use a "hot" team
5189
if (use_hot_team && new_nproc > 1) {
5190
KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191
#if KMP_NESTED_HOT_TEAMS
5192
team = hot_teams[level].hot_team;
5193
#else
5194
team = root->r.r_hot_team;
5195
#endif
5196
#if KMP_DEBUG
5197
if (__kmp_tasking_mode != tskm_immediate_exec) {
5198
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199
"task_team[1] = %p before reinit\n",
5200
team->t.t_task_team[0], team->t.t_task_team[1]));
5201
}
5202
#endif
5203
5204
if (team->t.t_nproc != new_nproc &&
5205
__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206
// Distributed barrier may need a resize
5207
int old_nthr = team->t.t_nproc;
5208
__kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209
}
5210
5211
// If not doing the place partition, then reset the team's proc bind
5212
// to indicate that partitioning of all threads still needs to take place
5213
if (do_place_partition == 0)
5214
team->t.t_proc_bind = proc_bind_default;
5215
// Has the number of threads changed?
5216
/* Let's assume the most common case is that the number of threads is
5217
unchanged, and put that case first. */
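// Three cases follow: (1) t_nproc == new_nproc: reuse the hot team as-is;
// (2) t_nproc > new_nproc: release the surplus threads (or park them, in
// nested hot-team mode 1); (3) t_nproc < new_nproc: wake any reserved
// threads and allocate additional workers.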
5218
if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219
KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220
// This case can mean that omp_set_num_threads() was called and the hot
5221
// team size was already reduced, so we check the special flag
5222
if (team->t.t_size_changed == -1) {
5223
team->t.t_size_changed = 1;
5224
} else {
5225
KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226
}
5227
5228
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229
kmp_r_sched_t new_sched = new_icvs->sched;
5230
// set primary thread's schedule as new run-time schedule
5231
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232
5233
__kmp_reinitialize_team(team, new_icvs,
5234
root->r.r_uber_thread->th.th_ident);
5235
5236
KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237
team->t.t_threads[0], team));
5238
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239
5240
#if KMP_AFFINITY_SUPPORTED
5241
if ((team->t.t_size_changed == 0) &&
5242
(team->t.t_proc_bind == new_proc_bind)) {
5243
if (new_proc_bind == proc_bind_spread) {
5244
if (do_place_partition) {
5245
// add flag to update only master for spread
5246
__kmp_partition_places(team, 1);
5247
}
5248
}
5249
KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250
"proc_bind = %d, partition = [%d,%d]\n",
5251
team->t.t_id, new_proc_bind, team->t.t_first_place,
5252
team->t.t_last_place));
5253
} else {
5254
if (do_place_partition) {
5255
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256
__kmp_partition_places(team);
5257
}
5258
}
5259
#else
5260
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261
#endif /* KMP_AFFINITY_SUPPORTED */
5262
} else if (team->t.t_nproc > new_nproc) {
5263
KA_TRACE(20,
5264
("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265
new_nproc));
5266
5267
team->t.t_size_changed = 1;
5268
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269
// Barrier size already reduced earlier in this function
5270
// Activate team threads via th_used_in_team
5271
__kmp_add_threads_to_team(team, new_nproc);
5272
}
5273
// When decreasing team size, threads no longer in the team should
5274
// unref task team.
5275
if (__kmp_tasking_mode != tskm_immediate_exec) {
5276
for (f = new_nproc; f < team->t.t_nproc; f++) {
5277
kmp_info_t *th = team->t.t_threads[f];
5278
KMP_DEBUG_ASSERT(th);
5279
th->th.th_task_team = NULL;
5280
}
5281
}
5282
#if KMP_NESTED_HOT_TEAMS
5283
if (__kmp_hot_teams_mode == 0) {
5284
// AC: saved number of threads should correspond to team's value in this
5285
// mode, can be bigger in mode 1, when hot team has threads in reserve
5286
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287
hot_teams[level].hot_team_nth = new_nproc;
5288
#endif // KMP_NESTED_HOT_TEAMS
5289
/* release the extra threads we don't need any more */
5290
for (f = new_nproc; f < team->t.t_nproc; f++) {
5291
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292
__kmp_free_thread(team->t.t_threads[f]);
5293
team->t.t_threads[f] = NULL;
5294
}
5295
#if KMP_NESTED_HOT_TEAMS
5296
} // (__kmp_hot_teams_mode == 0)
5297
else {
5298
// When keeping extra threads in team, switch threads to wait on own
5299
// b_go flag
5300
for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302
kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303
for (int b = 0; b < bs_last_barrier; ++b) {
5304
if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305
balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306
}
5307
KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308
}
5309
}
5310
}
5311
#endif // KMP_NESTED_HOT_TEAMS
5312
team->t.t_nproc = new_nproc;
5313
// TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315
__kmp_reinitialize_team(team, new_icvs,
5316
root->r.r_uber_thread->th.th_ident);
5317
5318
// Update remaining threads
5319
for (f = 0; f < new_nproc; ++f) {
5320
team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321
}
5322
5323
// restore the current task state of the primary thread: should be the
5324
// implicit task
5325
KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326
team->t.t_threads[0], team));
5327
5328
__kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329
5330
#ifdef KMP_DEBUG
5331
for (f = 0; f < team->t.t_nproc; f++) {
5332
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333
team->t.t_threads[f]->th.th_team_nproc ==
5334
team->t.t_nproc);
5335
}
5336
#endif
5337
5338
if (do_place_partition) {
5339
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340
#if KMP_AFFINITY_SUPPORTED
5341
__kmp_partition_places(team);
5342
#endif
5343
}
5344
} else { // team->t.t_nproc < new_nproc
5345
5346
KA_TRACE(20,
5347
("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348
new_nproc));
5349
int old_nproc = team->t.t_nproc; // save old value; used below to update
// only the newly added threads
5350
team->t.t_size_changed = 1;
5351
5352
#if KMP_NESTED_HOT_TEAMS
5353
int avail_threads = hot_teams[level].hot_team_nth;
5354
if (new_nproc < avail_threads)
5355
avail_threads = new_nproc;
5356
kmp_info_t **other_threads = team->t.t_threads;
5357
for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358
// Adjust barrier data of reserved threads (if any) of the team
5359
// Other data will be set in __kmp_initialize_info() below.
5360
int b;
5361
kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362
for (b = 0; b < bs_last_barrier; ++b) {
5363
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365
#if USE_DEBUGGER
5366
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367
#endif
5368
}
5369
}
5370
if (hot_teams[level].hot_team_nth >= new_nproc) {
5371
// we have all needed threads in reserve, no need to allocate any
5372
// this is only possible in mode 1; we cannot have reserved threads in mode 0
5373
KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374
team->t.t_nproc = new_nproc; // just get reserved threads involved
5375
} else {
5376
// We may have some threads in reserve, but not enough;
5377
// get reserved threads involved if any.
5378
team->t.t_nproc = hot_teams[level].hot_team_nth;
5379
hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380
#endif // KMP_NESTED_HOT_TEAMS
5381
if (team->t.t_max_nproc < new_nproc) {
5382
/* reallocate larger arrays */
5383
__kmp_reallocate_team_arrays(team, new_nproc);
5384
__kmp_reinitialize_team(team, new_icvs, NULL);
5385
}
5386
5387
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5388
KMP_AFFINITY_SUPPORTED
5389
/* Temporarily set full mask for primary thread before creation of
5390
workers. The reason is that workers inherit the affinity from the
5391
primary thread, so if a lot of workers are created on a single
core quickly, they don't get a chance to set their own affinity for
5393
a long time. */
5394
kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395
#endif
5396
5397
/* allocate new threads for the hot team */
5398
for (f = team->t.t_nproc; f < new_nproc; f++) {
5399
kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400
KMP_DEBUG_ASSERT(new_worker);
5401
team->t.t_threads[f] = new_worker;
5402
5403
KA_TRACE(20,
5404
("__kmp_allocate_team: team %d init T#%d arrived: "
5405
"join=%llu, plain=%llu\n",
5406
team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408
team->t.t_bar[bs_plain_barrier].b_arrived));
5409
5410
{ // Initialize barrier data for new threads.
5411
int b;
5412
kmp_balign_t *balign = new_worker->th.th_bar;
5413
for (b = 0; b < bs_last_barrier; ++b) {
5414
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416
KMP_BARRIER_PARENT_FLAG);
5417
#if USE_DEBUGGER
5418
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419
#endif
5420
}
5421
}
5422
}
5423
5424
#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5425
KMP_AFFINITY_SUPPORTED
5426
/* Restore initial primary thread's affinity mask */
5427
new_temp_affinity.restore();
5428
#endif
5429
#if KMP_NESTED_HOT_TEAMS
5430
} // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431
#endif // KMP_NESTED_HOT_TEAMS
5432
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433
// Barrier size already increased earlier in this function
5434
// Activate team threads via th_used_in_team
5435
__kmp_add_threads_to_team(team, new_nproc);
5436
}
5437
/* make sure everyone is synchronized */
// (the newly added threads are initialized below)
5439
__kmp_initialize_team(team, new_nproc, new_icvs,
5440
root->r.r_uber_thread->th.th_ident);
5441
5442
/* reinitialize the threads */
5443
KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444
for (f = 0; f < team->t.t_nproc; ++f)
5445
__kmp_initialize_info(team->t.t_threads[f], team, f,
5446
__kmp_gtid_from_tid(f, team));
5447
5448
// set th_task_state for new threads in hot team with older thread's state
5449
kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450
for (f = old_nproc; f < team->t.t_nproc; ++f)
5451
team->t.t_threads[f]->th.th_task_state = old_state;
5452
5453
#ifdef KMP_DEBUG
5454
for (f = 0; f < team->t.t_nproc; ++f) {
5455
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456
team->t.t_threads[f]->th.th_team_nproc ==
5457
team->t.t_nproc);
5458
}
5459
#endif
5460
5461
if (do_place_partition) {
5462
KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463
#if KMP_AFFINITY_SUPPORTED
5464
__kmp_partition_places(team);
5465
#endif
5466
}
5467
} // Check changes in number of threads
5468
5469
if (master->th.th_teams_microtask) {
5470
for (f = 1; f < new_nproc; ++f) {
5471
// propagate teams construct specific info to workers
5472
kmp_info_t *thr = team->t.t_threads[f];
5473
thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474
thr->th.th_teams_level = master->th.th_teams_level;
5475
thr->th.th_teams_size = master->th.th_teams_size;
5476
}
5477
}
5478
#if KMP_NESTED_HOT_TEAMS
5479
if (level) {
5480
// Sync barrier state for nested hot teams, not needed for outermost hot
5481
// team.
5482
for (f = 1; f < new_nproc; ++f) {
5483
kmp_info_t *thr = team->t.t_threads[f];
5484
int b;
5485
kmp_balign_t *balign = thr->th.th_bar;
5486
for (b = 0; b < bs_last_barrier; ++b) {
5487
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489
#if USE_DEBUGGER
5490
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491
#endif
5492
}
5493
}
5494
}
5495
#endif // KMP_NESTED_HOT_TEAMS
5496
5497
/* reallocate space for arguments if necessary */
5498
__kmp_alloc_argv_entries(argc, team, TRUE);
5499
KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500
// The hot team re-uses the previous task team,
5501
// if untouched during the previous release->gather phase.
5502
5503
KF_TRACE(10, (" hot_team = %p\n", team));
5504
5505
#if KMP_DEBUG
5506
if (__kmp_tasking_mode != tskm_immediate_exec) {
5507
KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508
"task_team[1] = %p after reinit\n",
5509
team->t.t_task_team[0], team->t.t_task_team[1]));
5510
}
5511
#endif
5512
5513
#if OMPT_SUPPORT
5514
__ompt_team_assign_id(team, ompt_parallel_data);
5515
#endif
5516
5517
KMP_MB();
5518
5519
return team;
5520
}
5521
5522
/* next, let's try to take one from the team pool */
5523
KMP_MB();
5524
for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525
/* TODO: consider resizing undersized teams instead of reaping them, now
5526
that we have a resizing mechanism */
5527
if (team->t.t_max_nproc >= max_nproc) {
5528
/* take this team from the team pool */
5529
__kmp_team_pool = team->t.t_next_pool;
5530
5531
if (max_nproc > 1 &&
5532
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533
if (!team->t.b) { // Allocate barrier structure
5534
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535
}
5536
}
5537
5538
/* setup the team for fresh use */
5539
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540
5541
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542
"task_team[1] %p to NULL\n",
5543
&team->t.t_task_team[0], &team->t.t_task_team[1]));
5544
team->t.t_task_team[0] = NULL;
5545
team->t.t_task_team[1] = NULL;
5546
5547
/* reallocate space for arguments if necessary */
5548
__kmp_alloc_argv_entries(argc, team, TRUE);
5549
KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550
5551
KA_TRACE(
5552
20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554
{ // Initialize barrier data.
5555
int b;
5556
for (b = 0; b < bs_last_barrier; ++b) {
5557
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558
#if USE_DEBUGGER
5559
team->t.t_bar[b].b_master_arrived = 0;
5560
team->t.t_bar[b].b_team_arrived = 0;
5561
#endif
5562
}
5563
}
5564
5565
team->t.t_proc_bind = new_proc_bind;
5566
5567
KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568
team->t.t_id));
5569
5570
#if OMPT_SUPPORT
5571
__ompt_team_assign_id(team, ompt_parallel_data);
5572
#endif
5573
5574
team->t.t_nested_nth = NULL;
5575
5576
KMP_MB();
5577
5578
return team;
5579
}
5580
5581
/* reap team if it is too small, then loop back and check the next one */
5582
// Not sure if this is wise, but it will be redone during the hot-teams
// rewrite.
5584
/* TODO: Use technique to find the right size hot-team, don't reap them */
5585
team = __kmp_reap_team(team);
5586
__kmp_team_pool = team;
5587
}
5588
5589
/* nothing available in the pool, no matter, make a new team! */
5590
KMP_MB();
5591
team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592
5593
/* and set it up */
5594
team->t.t_max_nproc = max_nproc;
5595
if (max_nproc > 1 &&
5596
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597
// Allocate barrier structure
5598
team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599
}
5600
5601
/* NOTE well: for some reason, allocating one big buffer and dividing it up
seems to really hurt performance on the P4, so let's not use this. */
5603
__kmp_allocate_team_arrays(team, max_nproc);
5604
5605
KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606
__kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607
5608
KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609
"%p to NULL\n",
5610
&team->t.t_task_team[0], &team->t.t_task_team[1]));
5611
team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612
// memory, no need to duplicate
5613
team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614
// memory, no need to duplicate
5615
5616
if (__kmp_storage_map) {
5617
__kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618
}
5619
5620
/* allocate space for arguments */
5621
__kmp_alloc_argv_entries(argc, team, FALSE);
5622
team->t.t_argc = argc;
5623
5624
KA_TRACE(20,
5625
("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626
team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627
{ // Initialize barrier data.
5628
int b;
5629
for (b = 0; b < bs_last_barrier; ++b) {
5630
team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631
#if USE_DEBUGGER
5632
team->t.t_bar[b].b_master_arrived = 0;
5633
team->t.t_bar[b].b_team_arrived = 0;
5634
#endif
5635
}
5636
}
5637
5638
team->t.t_proc_bind = new_proc_bind;
5639
5640
#if OMPT_SUPPORT
5641
__ompt_team_assign_id(team, ompt_parallel_data);
5642
team->t.ompt_serialized_team_info = NULL;
5643
#endif
5644
5645
KMP_MB();
5646
5647
team->t.t_nested_nth = NULL;
5648
5649
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650
team->t.t_id));
5651
5652
return team;
5653
}
5654
5655
/* TODO implement hot-teams at all levels */
5656
/* TODO implement lazy thread release on demand (disband request) */
5657
5658
/* free the team. return it to the team pool. release all the threads
5659
* associated with it */
5660
void __kmp_free_team(kmp_root_t *root,
5661
kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662
int f;
5663
KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664
team->t.t_id));
5665
5666
/* verify state */
5667
KMP_DEBUG_ASSERT(root);
5668
KMP_DEBUG_ASSERT(team);
5669
KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670
KMP_DEBUG_ASSERT(team->t.t_threads);
5671
5672
int use_hot_team = team == root->r.r_hot_team;
5673
#if KMP_NESTED_HOT_TEAMS
5674
int level;
5675
if (master) {
5676
level = team->t.t_active_level - 1;
5677
if (master->th.th_teams_microtask) { // in teams construct?
5678
if (master->th.th_teams_size.nteams > 1) {
5679
++level; // level was not increased in teams construct for
5680
// team_of_masters
5681
}
5682
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683
master->th.th_teams_level == team->t.t_level) {
5684
++level; // level was not increased in teams construct for
5685
// team_of_workers before the parallel
5686
} // team->t.t_level will be increased inside parallel
5687
}
5688
#if KMP_DEBUG
5689
kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690
#endif
5691
if (level < __kmp_hot_teams_max_level) {
5692
KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693
use_hot_team = 1;
5694
}
5695
}
5696
#endif // KMP_NESTED_HOT_TEAMS
5697
5698
/* team is done working */
5699
TCW_SYNC_PTR(team->t.t_pkfn,
5700
NULL); // Important for Debugging Support Library.
5701
#if KMP_OS_WINDOWS
5702
team->t.t_copyin_counter = 0; // init counter for possible reuse
5703
#endif
5704
// Do not reset pointer to parent team to NULL for hot teams.
5705
5706
/* if we are non-hot team, release our threads */
5707
if (!use_hot_team) {
5708
if (__kmp_tasking_mode != tskm_immediate_exec) {
5709
// Wait for threads to reach reapable state
5710
for (f = 1; f < team->t.t_nproc; ++f) {
5711
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712
kmp_info_t *th = team->t.t_threads[f];
5713
volatile kmp_uint32 *state = &th->th.th_reap_state;
5714
while (*state != KMP_SAFE_TO_REAP) {
5715
#if KMP_OS_WINDOWS
5716
// On Windows a thread can be killed at any time, check this
5717
DWORD ecode;
5718
if (!__kmp_is_thread_alive(th, &ecode)) {
5719
*state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720
break;
5721
}
5722
#endif
5723
// first check if thread is sleeping
5724
if (th->th.th_sleep_loc)
5725
__kmp_null_resume_wrapper(th);
5726
KMP_CPU_PAUSE();
5727
}
5728
}
5729
5730
// Delete task teams
5731
int tt_idx;
5732
for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733
kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734
if (task_team != NULL) {
5735
for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737
team->t.t_threads[f]->th.th_task_team = NULL;
5738
}
5739
KA_TRACE(
5740
20,
5741
("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742
__kmp_get_gtid(), task_team, team->t.t_id));
5743
#if KMP_NESTED_HOT_TEAMS
5744
__kmp_free_task_team(master, task_team);
5745
#endif
5746
team->t.t_task_team[tt_idx] = NULL;
5747
}
5748
}
5749
}
5750
5751
// Before clearing parent pointer, check if nested_nth list should be freed
5752
if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753
team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755
KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756
}
5757
team->t.t_nested_nth = NULL;
5758
5759
// Reset pointer to parent team only for non-hot teams.
5760
team->t.t_parent = NULL;
5761
team->t.t_level = 0;
5762
team->t.t_active_level = 0;
5763
5764
/* free the worker threads */
5765
for (f = 1; f < team->t.t_nproc; ++f) {
5766
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769
1, 2);
5770
}
5771
__kmp_free_thread(team->t.t_threads[f]);
5772
}
5773
5774
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775
if (team->t.b) {
5776
// wake up thread at old location
5777
team->t.b->go_release();
5778
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779
for (f = 1; f < team->t.t_nproc; ++f) {
5780
if (team->t.b->sleep[f].sleep) {
5781
__kmp_atomic_resume_64(
5782
team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783
(kmp_atomic_flag_64<> *)NULL);
5784
}
5785
}
5786
}
5787
// Wait for threads to be removed from team
5788
for (int f = 1; f < team->t.t_nproc; ++f) {
5789
while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790
KMP_CPU_PAUSE();
5791
}
5792
}
5793
}
5794
5795
for (f = 1; f < team->t.t_nproc; ++f) {
5796
team->t.t_threads[f] = NULL;
5797
}
5798
5799
if (team->t.t_max_nproc > 1 &&
5800
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801
distributedBarrier::deallocate(team->t.b);
5802
team->t.b = NULL;
5803
}
5804
/* put the team back in the team pool */
5805
/* TODO limit size of team pool, call reap_team if pool too large */
5806
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807
__kmp_team_pool = (volatile kmp_team_t *)team;
5808
} else { // Check if team was created for primary threads in teams construct
5809
// See if first worker is a CG root
5810
KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811
team->t.t_threads[1]->th.th_cg_roots);
5812
if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813
// Clean up the CG root nodes on workers so that this team can be re-used
5814
for (f = 1; f < team->t.t_nproc; ++f) {
5815
kmp_info_t *thr = team->t.t_threads[f];
5816
KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817
thr->th.th_cg_roots->cg_root == thr);
5818
// Pop current CG root off list
5819
kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820
thr->th.th_cg_roots = tmp->up;
5821
KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822
" up to node %p. cg_nthreads was %d\n",
5823
thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824
int i = tmp->cg_nthreads--;
5825
if (i == 1) {
5826
__kmp_free(tmp); // free CG if we are the last thread in it
5827
}
5828
// Restore current task's thread_limit from CG root
5829
if (thr->th.th_cg_roots)
5830
thr->th.th_current_task->td_icvs.thread_limit =
5831
thr->th.th_cg_roots->cg_thread_limit;
5832
}
5833
}
5834
}
5835
5836
KMP_MB();
5837
}
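// Minimal sketch, not part of the runtime and kept under "#if 0", of the
// contention-group cleanup pattern used above and in __kmp_free_thread:
// pop the thread's CG root node, free it when the last member leaves, and
// restore the inherited thread_limit. The helper name is hypothetical.
#if 0
static void __kmp_pop_cg_root_sketch(kmp_info_t *thr) {
  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
  thr->th.th_cg_roots = tmp->up; // move up one contention-group level
  if (--tmp->cg_nthreads == 0) // last thread in this group frees the node
    __kmp_free(tmp);
  if (thr->th.th_cg_roots) // restore thread_limit from the enclosing CG
    thr->th.th_current_task->td_icvs.thread_limit =
        thr->th.th_cg_roots->cg_thread_limit;
}
#endif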
5838
5839
/* reap the team. destroy it, reclaim all its resources and free its memory */
5840
kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841
kmp_team_t *next_pool = team->t.t_next_pool;
5842
5843
KMP_DEBUG_ASSERT(team);
5844
KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845
KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846
KMP_DEBUG_ASSERT(team->t.t_threads);
5847
KMP_DEBUG_ASSERT(team->t.t_argv);
5848
5849
/* TODO clean the threads that are a part of this? */
5850
5851
/* free stuff */
5852
__kmp_free_team_arrays(team);
5853
if (team->t.t_argv != &team->t.t_inline_argv[0])
5854
__kmp_free((void *)team->t.t_argv);
5855
__kmp_free(team);
5856
5857
KMP_MB();
5858
return next_pool;
5859
}
5860
5861
// Free the thread. Don't reap it, just place it on the pool of available
5862
// threads.
5863
//
5864
// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865
// binding for the affinity mechanism to be useful.
5866
//
5867
// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868
// However, we want to avoid a potential performance problem by always
5869
// scanning through the list to find the correct point at which to insert
5870
// the thread (potential N**2 behavior). To do this we keep track of the
5871
// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872
// With single-level parallelism, threads will always be added to the tail
5873
// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874
// parallelism, all bets are off and we may need to scan through the entire
5875
// free list.
5876
//
5877
// This change also has a potentially large performance benefit, for some
5878
// applications. Previously, as threads were freed from the hot team, they
5879
// would be placed back on the free list in inverse order. If the hot team
5880
// grew back to its original size, then the freed thread would be placed
// back on the hot team in reverse order. This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
5884
//
5885
// Now, for single-level parallelism, the OMP tid is always == gtid.
5886
void __kmp_free_thread(kmp_info_t *this_th) {
5887
int gtid;
5888
kmp_info_t **scan;
5889
5890
KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891
__kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892
5893
KMP_DEBUG_ASSERT(this_th);
5894
5895
// When moving thread to pool, switch thread to wait on own b_go flag, and
5896
// uninitialized (NULL team).
5897
int b;
5898
kmp_balign_t *balign = this_th->th.th_bar;
5899
for (b = 0; b < bs_last_barrier; ++b) {
5900
if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901
balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902
balign[b].bb.team = NULL;
5903
balign[b].bb.leaf_kids = 0;
5904
}
5905
this_th->th.th_task_state = 0;
5906
this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907
5908
/* put thread back on the free pool */
5909
TCW_PTR(this_th->th.th_team, NULL);
5910
TCW_PTR(this_th->th.th_root, NULL);
5911
TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912
5913
while (this_th->th.th_cg_roots) {
5914
this_th->th.th_cg_roots->cg_nthreads--;
5915
KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916
" %p of thread %p to %d\n",
5917
this_th, this_th->th.th_cg_roots,
5918
this_th->th.th_cg_roots->cg_root,
5919
this_th->th.th_cg_roots->cg_nthreads));
5920
kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921
if (tmp->cg_root == this_th) { // Thread is a cg_root
5922
KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923
KA_TRACE(
5924
5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925
this_th->th.th_cg_roots = tmp->up;
5926
__kmp_free(tmp);
5927
} else { // Worker thread
5928
if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929
__kmp_free(tmp);
5930
}
5931
this_th->th.th_cg_roots = NULL;
5932
break;
5933
}
5934
}
5935
5936
/* If the implicit task assigned to this thread can be used by other threads
* -> multiple threads can share the data and try to free the task at
* __kmp_reap_thread at exit. This duplicate use of the task data can happen
* with higher probability when the hot team is disabled, but can occur even
* when the hot team is enabled. */
5941
__kmp_free_implicit_task(this_th);
5942
this_th->th.th_current_task = NULL;
5943
5944
// If the __kmp_thread_pool_insert_pt is already past the new insert
5945
// point, then we need to re-scan the entire list.
5946
gtid = this_th->th.th_info.ds.ds_gtid;
5947
if (__kmp_thread_pool_insert_pt != NULL) {
5948
KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949
if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950
__kmp_thread_pool_insert_pt = NULL;
5951
}
5952
}
5953
5954
// Scan down the list to find the place to insert the thread.
5955
// scan is the address of a link in the list, possibly the address of
5956
// __kmp_thread_pool itself.
5957
//
5958
// In the absence of nested parallelism, the for loop will have 0 iterations.
5959
if (__kmp_thread_pool_insert_pt != NULL) {
5960
scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961
} else {
5962
scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963
}
5964
for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965
scan = &((*scan)->th.th_next_pool))
5966
;
5967
5968
// Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969
// to its address.
5970
TCW_PTR(this_th->th.th_next_pool, *scan);
5971
__kmp_thread_pool_insert_pt = *scan = this_th;
5972
KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973
(this_th->th.th_info.ds.ds_gtid <
5974
this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975
TCW_4(this_th->th.th_in_pool, TRUE);
5976
__kmp_suspend_initialize_thread(this_th);
5977
__kmp_lock_suspend_mx(this_th);
5978
if (this_th->th.th_active == TRUE) {
5979
KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980
this_th->th.th_active_in_pool = TRUE;
5981
}
5982
#if KMP_DEBUG
5983
else {
5984
KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985
}
5986
#endif
5987
__kmp_unlock_suspend_mx(this_th);
5988
5989
TCW_4(__kmp_nth, __kmp_nth - 1);
5990
5991
#ifdef KMP_ADJUST_BLOCKTIME
5992
/* Adjust blocktime back to user setting or default if necessary */
5993
/* Middle initialization might never have occurred */
5994
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996
if (__kmp_nth <= __kmp_avail_proc) {
5997
__kmp_zero_bt = FALSE;
5998
}
5999
}
6000
#endif /* KMP_ADJUST_BLOCKTIME */
6001
6002
KMP_MB();
6003
}
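// Minimal sketch, not part of the runtime and kept under "#if 0", of the
// sorted-insert discipline __kmp_free_thread uses above: the pool is a
// singly linked list ordered by gtid, and a cached insertion point makes
// the common "append at the tail" case cheap. All names here are
// hypothetical stand-ins for __kmp_thread_pool and
// __kmp_thread_pool_insert_pt.
#if 0
struct pool_node_sketch {
  int gtid;
  struct pool_node_sketch *next;
};
static struct pool_node_sketch *pool_sketch = NULL;
static struct pool_node_sketch *insert_pt_sketch = NULL;

static void pool_insert_sketch(struct pool_node_sketch *n) {
  struct pool_node_sketch **scan = &pool_sketch;
  // Resume from the cached point only if it precedes the new gtid;
  // otherwise rescan from the head, as __kmp_free_thread does.
  if (insert_pt_sketch != NULL && insert_pt_sketch->gtid < n->gtid)
    scan = &insert_pt_sketch->next;
  while (*scan != NULL && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  *scan = n; // list stays sorted by gtid
  insert_pt_sketch = n; // remember the last insertion point
}
#endif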
6004
6005
/* ------------------------------------------------------------------------ */
6006
6007
void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008
#if OMP_PROFILING_SUPPORT
6009
ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010
// TODO: add a configuration option for time granularity
6011
if (ProfileTraceFile)
6012
llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013
#endif
6014
6015
int gtid = this_thr->th.th_info.ds.ds_gtid;
6016
/* void *stack_data;*/
6017
kmp_team_t **volatile pteam;
6018
6019
KMP_MB();
6020
KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021
6022
if (__kmp_env_consistency_check) {
6023
this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024
}
6025
6026
#if OMPD_SUPPORT
6027
if (ompd_state & OMPD_ENABLE_BP)
6028
ompd_bp_thread_begin();
6029
#endif
6030
6031
#if OMPT_SUPPORT
6032
ompt_data_t *thread_data = nullptr;
6033
if (ompt_enabled.enabled) {
6034
thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035
*thread_data = ompt_data_none;
6036
6037
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038
this_thr->th.ompt_thread_info.wait_id = 0;
6039
this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040
this_thr->th.ompt_thread_info.parallel_flags = 0;
6041
if (ompt_enabled.ompt_callback_thread_begin) {
6042
ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043
ompt_thread_worker, thread_data);
6044
}
6045
this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046
}
6047
#endif
6048
6049
/* This is the place where threads wait for work */
6050
while (!TCR_4(__kmp_global.g.g_done)) {
6051
KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052
KMP_MB();
6053
6054
/* wait for work to do */
6055
KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056
6057
/* No tid yet since not part of a team */
6058
__kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059
6060
#if OMPT_SUPPORT
6061
if (ompt_enabled.enabled) {
6062
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063
}
6064
#endif
6065
6066
pteam = &this_thr->th.th_team;
6067
6068
/* have we been allocated? */
6069
if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070
/* we were just woken up, so run our new task */
6071
if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072
int rc;
6073
KA_TRACE(20,
6074
("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075
gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076
(*pteam)->t.t_pkfn));
6077
6078
updateHWFPControl(*pteam);
6079
6080
#if OMPT_SUPPORT
6081
if (ompt_enabled.enabled) {
6082
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083
}
6084
#endif
6085
6086
rc = (*pteam)->t.t_invoke(gtid);
6087
KMP_ASSERT(rc);
6088
6089
KMP_MB();
6090
KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091
gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092
(*pteam)->t.t_pkfn));
6093
}
6094
#if OMPT_SUPPORT
6095
if (ompt_enabled.enabled) {
6096
/* no frame set while outside task */
6097
__ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098
6099
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100
}
6101
#endif
6102
/* join barrier after parallel region */
6103
__kmp_join_barrier(gtid);
6104
}
6105
}
6106
6107
#if OMPD_SUPPORT
6108
if (ompd_state & OMPD_ENABLE_BP)
6109
ompd_bp_thread_end();
6110
#endif
6111
6112
#if OMPT_SUPPORT
6113
if (ompt_enabled.ompt_callback_thread_end) {
6114
ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115
}
6116
#endif
6117
6118
this_thr->th.th_task_team = NULL;
6119
/* run the destructors for the threadprivate data for this thread */
6120
__kmp_common_destroy_gtid(gtid);
6121
6122
KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123
KMP_MB();
6124
6125
#if OMP_PROFILING_SUPPORT
6126
llvm::timeTraceProfilerFinishThread();
6127
#endif
6128
return this_thr;
6129
}
6130
6131
/* ------------------------------------------------------------------------ */
6132
6133
void __kmp_internal_end_dest(void *specific_gtid) {
6134
// Make sure no significant bits are lost
6135
int gtid;
6136
__kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137
6138
KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139
/* NOTE: the gtid is stored as gtid+1 in the thread-local storage
* because 0 is reserved for the nothing-stored case */
6141
6142
__kmp_internal_end_thread(gtid);
6143
}
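// Illustrative sketch, kept under "#if 0" and using hypothetical helper
// names, of the gtid+1 bias noted above: the stored value is offset by one
// so that 0 keeps its usual "nothing stored" meaning.
#if 0
static void *__kmp_encode_gtid_sketch(int gtid) {
  return (void *)(kmp_intptr_t)(gtid + 1);
}
static int __kmp_decode_gtid_sketch(void *stored) {
  int gtid;
  __kmp_type_convert((kmp_intptr_t)stored - 1, &gtid);
  return gtid;
}
#endif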
6144
6145
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146
6147
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148
__kmp_internal_end_atexit();
6149
}
6150
6151
#endif
6152
6153
/* [Windows] josh: when the atexit handler is called, there may still be more
6154
than one thread alive */
6155
void __kmp_internal_end_atexit(void) {
6156
KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157
/* [Windows]
6158
josh: ideally, we want to completely shutdown the library in this atexit
6159
handler, but stat code that depends on thread specific data for gtid fails
6160
because that data becomes unavailable at some point during the shutdown, so
6161
we call __kmp_internal_end_thread instead. We should eventually remove the
6162
dependency on __kmp_get_specific_gtid in the stat code and use
6163
__kmp_internal_end_library to cleanly shutdown the library.
6164
6165
// TODO: Can some of this comment about GVS be removed?
6166
I suspect that the offending stat code is executed when the calling thread
6167
tries to clean up a dead root thread's data structures, resulting in GVS
6168
code trying to close the GVS structures for that thread, but since the stat
6169
code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170
the calling thread is cleaning up itself instead of another thread, it gets
confused. This happens because allowing a thread to unregister and clean up
another thread is a recent modification for addressing an issue.
6173
Based on the current design (20050722), a thread may end up
6174
trying to unregister another thread only if thread death does not trigger
6175
the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6176
thread specific data destructor function to detect thread death. For
6177
Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178
is nothing. Thus, the workaround is applicable only for Windows static
6179
stat library. */
6180
__kmp_internal_end_library(-1);
6181
#if KMP_OS_WINDOWS
6182
__kmp_close_console();
6183
#endif
6184
}
6185
6186
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187
// It is assumed __kmp_forkjoin_lock is acquired.
6188
6189
int gtid;
6190
6191
KMP_DEBUG_ASSERT(thread != NULL);
6192
6193
gtid = thread->th.th_info.ds.ds_gtid;
6194
6195
if (!is_root) {
6196
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197
/* Assume the threads are at the fork barrier here */
6198
KA_TRACE(
6199
20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200
gtid));
6201
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202
while (
6203
!KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204
KMP_CPU_PAUSE();
6205
__kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206
} else {
6207
/* Need release fence here to prevent seg faults for tree forkjoin
6208
barrier (GEH) */
6209
kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210
thread);
6211
__kmp_release_64(&flag);
6212
}
6213
}
6214
6215
// Terminate OS thread.
6216
__kmp_reap_worker(thread);
6217
6218
// The thread was killed asynchronously. If it was actively
6219
// spinning in the thread pool, decrement the global count.
6220
//
6221
// There is a small timing hole here - if the worker thread was just waking
// up after sleeping in the pool, had reset its th_active_in_pool flag but
// had not yet decremented the global counter __kmp_thread_pool_active_nth,
// then the global counter might not get updated.
6225
//
6226
// Currently, this can only happen as the library is unloaded,
6227
// so there are no harmful side effects.
6228
if (thread->th.th_active_in_pool) {
6229
thread->th.th_active_in_pool = FALSE;
6230
KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231
KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232
}
6233
}
6234
6235
__kmp_free_implicit_task(thread);
6236
6237
// Free the fast memory for tasking
6238
#if USE_FAST_MEMORY
6239
__kmp_free_fast_memory(thread);
6240
#endif /* USE_FAST_MEMORY */
6241
6242
__kmp_suspend_uninitialize_thread(thread);
6243
6244
KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245
TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246
6247
--__kmp_all_nth;
6248
// __kmp_nth was decremented when thread is added to the pool.
6249
6250
#ifdef KMP_ADJUST_BLOCKTIME
6251
/* Adjust blocktime back to user setting or default if necessary */
6252
/* Middle initialization might never have occurred */
6253
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255
if (__kmp_nth <= __kmp_avail_proc) {
6256
__kmp_zero_bt = FALSE;
6257
}
6258
}
6259
#endif /* KMP_ADJUST_BLOCKTIME */
6260
6261
/* free the memory being used */
6262
if (__kmp_env_consistency_check) {
6263
if (thread->th.th_cons) {
6264
__kmp_free_cons_stack(thread->th.th_cons);
6265
thread->th.th_cons = NULL;
6266
}
6267
}
6268
6269
if (thread->th.th_pri_common != NULL) {
6270
__kmp_free(thread->th.th_pri_common);
6271
thread->th.th_pri_common = NULL;
6272
}
6273
6274
#if KMP_USE_BGET
6275
if (thread->th.th_local.bget_data != NULL) {
6276
__kmp_finalize_bget(thread);
6277
}
6278
#endif
6279
6280
#if KMP_AFFINITY_SUPPORTED
6281
if (thread->th.th_affin_mask != NULL) {
6282
KMP_CPU_FREE(thread->th.th_affin_mask);
6283
thread->th.th_affin_mask = NULL;
6284
}
6285
#endif /* KMP_AFFINITY_SUPPORTED */
6286
6287
#if KMP_USE_HIER_SCHED
6288
if (thread->th.th_hier_bar_data != NULL) {
6289
__kmp_free(thread->th.th_hier_bar_data);
6290
thread->th.th_hier_bar_data = NULL;
6291
}
6292
#endif
6293
6294
__kmp_reap_team(thread->th.th_serial_team);
6295
thread->th.th_serial_team = NULL;
6296
__kmp_free(thread);
6297
6298
KMP_MB();
6299
6300
} // __kmp_reap_thread
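// A hedged illustrative sketch (not the runtime's own primitives) of the
// wake-then-join shape used by __kmp_reap_thread above: flip a flag the
// worker spins on (cf. the 0 -> 3 compare-and-swap on th_used_in_team), join
// the OS thread, then keep the pool-activity counter consistent. Written with
// C++11 atomics and std::thread; all names are hypothetical.
#if 0
#include <atomic>
#include <cstdio>
#include <thread>

struct ExampleWorker {            // hypothetical stand-in for kmp_info_t
  std::atomic<int> used_in_team{0};
  std::atomic<bool> active_in_pool{true};
  std::thread os_thread;
};

static std::atomic<int> example_pool_active_nth{1};

static void example_worker_loop(ExampleWorker *w) {
  // Park until the reaper flips used_in_team from 0 to 3.
  while (w->used_in_team.load(std::memory_order_acquire) != 3)
    std::this_thread::yield();
}

static void example_reap_worker(ExampleWorker *w) {
  int expected = 0;
  while (!w->used_in_team.compare_exchange_weak(expected, 3,
                                                std::memory_order_acq_rel)) {
    expected = 0;                 // retry, like the CAS loop above
    std::this_thread::yield();
  }
  w->os_thread.join();            // terminate/join the OS thread
  if (w->active_in_pool.exchange(false))
    --example_pool_active_nth;    // keep the global pool count consistent
}

int main() {
  ExampleWorker w;
  w.os_thread = std::thread(example_worker_loop, &w);
  example_reap_worker(&w);
  std::printf("active in pool: %d\n", example_pool_active_nth.load());
}
#endif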
6301
6302
static void __kmp_itthash_clean(kmp_info_t *th) {
6303
#if USE_ITT_NOTIFY
6304
if (__kmp_itt_region_domains.count > 0) {
6305
for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306
kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307
while (bucket) {
6308
kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309
__kmp_thread_free(th, bucket);
6310
bucket = next;
6311
}
6312
}
6313
}
6314
if (__kmp_itt_barrier_domains.count > 0) {
6315
for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316
kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317
while (bucket) {
6318
kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319
__kmp_thread_free(th, bucket);
6320
bucket = next;
6321
}
6322
}
6323
}
6324
#endif
6325
}
6326
6327
static void __kmp_internal_end(void) {
6328
int i;
6329
6330
/* First, unregister the library */
6331
__kmp_unregister_library();
6332
6333
#if KMP_OS_WINDOWS
6334
/* In Win static library, we can't tell when a root actually dies, so we
6335
reclaim the data structures for any root threads that have died but not
6336
unregistered themselves, in order to shut down cleanly.
6337
In Win dynamic library we also can't tell when a thread dies. */
6338
__kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339
// dead roots
6340
#endif
6341
6342
for (i = 0; i < __kmp_threads_capacity; i++)
6343
if (__kmp_root[i])
6344
if (__kmp_root[i]->r.r_active)
6345
break;
6346
KMP_MB(); /* Flush all pending memory write invalidates. */
6347
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348
6349
if (i < __kmp_threads_capacity) {
6350
#if KMP_USE_MONITOR
6351
// 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352
KMP_MB(); /* Flush all pending memory write invalidates. */
6353
6354
// Need to check that monitor was initialized before reaping it. If we are
6355
// called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356
// __kmp_monitor will appear to contain valid data, but it is only valid in
6357
// the parent process, not the child.
6358
// New behavior (201008): instead of keying off of the flag
6359
// __kmp_init_parallel, the monitor thread creation is keyed off
6360
// of the new flag __kmp_init_monitor.
6361
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362
if (TCR_4(__kmp_init_monitor)) {
6363
__kmp_reap_monitor(&__kmp_monitor);
6364
TCW_4(__kmp_init_monitor, 0);
6365
}
6366
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368
#endif // KMP_USE_MONITOR
6369
} else {
6370
/* TODO move this to cleanup code */
6371
#ifdef KMP_DEBUG
6372
/* make sure that everything has properly ended */
6373
for (i = 0; i < __kmp_threads_capacity; i++) {
6374
if (__kmp_root[i]) {
6375
// KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6376
// there can be uber threads alive here
6377
KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378
}
6379
}
6380
#endif
6381
6382
KMP_MB();
6383
6384
// Reap the worker threads.
6385
// This is valid for now, but be careful if threads are reaped sooner.
6386
while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6387
// Get the next thread from the pool.
6388
kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389
__kmp_thread_pool = thread->th.th_next_pool;
6390
// Reap it.
6391
KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392
thread->th.th_next_pool = NULL;
6393
thread->th.th_in_pool = FALSE;
6394
__kmp_reap_thread(thread, 0);
6395
}
6396
__kmp_thread_pool_insert_pt = NULL;
6397
6398
// Reap teams.
6399
while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6400
// Get the next team from the pool.
6401
kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402
__kmp_team_pool = team->t.t_next_pool;
6403
// Reap it.
6404
team->t.t_next_pool = NULL;
6405
__kmp_reap_team(team);
6406
}
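// A minimal illustrative sketch (not part of the runtime) of the pattern used
// by the two loops above: draining an intrusive singly-linked free pool that
// is threaded through a next-pool link (th_next_pool / t_next_pool). The node
// type and helper names are hypothetical.
#if 0
#include <cstdio>
#include <cstdlib>

struct ExampleNode {
  int id;
  ExampleNode *next_pool; // intrusive link, like th_next_pool / t_next_pool
};

static ExampleNode *example_pool = nullptr;

static void example_reap(ExampleNode *n) {
  std::printf("reaping node %d\n", n->id);
  std::free(n);
}

static void example_drain_pool(void) {
  while (example_pool != nullptr) {
    ExampleNode *n = example_pool; // take the head
    example_pool = n->next_pool;   // advance before freeing
    n->next_pool = nullptr;
    example_reap(n);
  }
}

int main() {
  for (int i = 0; i < 3; ++i) {
    ExampleNode *n = static_cast<ExampleNode *>(std::malloc(sizeof(ExampleNode)));
    n->id = i;
    n->next_pool = example_pool;
    example_pool = n;
  }
  example_drain_pool();
}
#endif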
6407
6408
__kmp_reap_task_teams();
6409
6410
#if KMP_OS_UNIX
6411
// Threads that are not reaped should not access any resources since they
6412
// are going to be deallocated soon, so the shutdown sequence should wait
6413
// until all threads either exit the final spin-waiting loop or begin
6414
// sleeping after the given blocktime.
6415
for (i = 0; i < __kmp_threads_capacity; i++) {
6416
kmp_info_t *thr = __kmp_threads[i];
6417
while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418
KMP_CPU_PAUSE();
6419
}
6420
#endif
6421
6422
for (i = 0; i < __kmp_threads_capacity; ++i) {
6423
// TBD: Add some checking...
6424
// Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425
}
6426
6427
/* Make sure all threadprivate destructors get run by joining with all
6428
worker threads before resetting this flag */
6429
TCW_SYNC_4(__kmp_init_common, FALSE);
6430
6431
KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432
KMP_MB();
6433
6434
#if KMP_USE_MONITOR
6435
// See note above: One of the possible fixes for CQ138434 / CQ140126
6436
//
6437
// FIXME: push both code fragments down and CSE them?
6438
// push them into __kmp_cleanup() ?
6439
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440
if (TCR_4(__kmp_init_monitor)) {
6441
__kmp_reap_monitor(&__kmp_monitor);
6442
TCW_4(__kmp_init_monitor, 0);
6443
}
6444
__kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445
KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446
#endif
6447
} /* else !__kmp_global.t_active */
6448
TCW_4(__kmp_init_gtid, FALSE);
6449
KMP_MB(); /* Flush all pending memory write invalidates. */
6450
6451
__kmp_cleanup();
6452
#if OMPT_SUPPORT
6453
ompt_fini();
6454
#endif
6455
}
6456
6457
void __kmp_internal_end_library(int gtid_req) {
6458
/* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459
/* this shouldn't be a race condition because __kmp_internal_end() is the
6460
only place to clear __kmp_serial_init */
6461
/* we'll check this later too, after we get the lock */
6462
// 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463
// redundant, because the next check will work in any case.
6464
if (__kmp_global.g.g_abort) {
6465
KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466
/* TODO abort? */
6467
return;
6468
}
6469
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470
KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471
return;
6472
}
6473
6474
// If hidden helper team has been initialized, we need to deinit it
6475
if (TCR_4(__kmp_init_hidden_helper) &&
6476
!TCR_4(__kmp_hidden_helper_team_done)) {
6477
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478
// First release the main thread to let it continue its work
6479
__kmp_hidden_helper_main_thread_release();
6480
// Wait until the hidden helper team has been destroyed
6481
__kmp_hidden_helper_threads_deinitz_wait();
6482
}
6483
6484
KMP_MB(); /* Flush all pending memory write invalidates. */
6485
/* find out who we are and what we should do */
6486
{
6487
int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488
KA_TRACE(
6489
10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6490
if (gtid == KMP_GTID_SHUTDOWN) {
6491
KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492
"already shutdown\n"));
6493
return;
6494
} else if (gtid == KMP_GTID_MONITOR) {
6495
KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496
"registered, or system shutdown\n"));
6497
return;
6498
} else if (gtid == KMP_GTID_DNE) {
6499
KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500
"shutdown\n"));
6501
/* we don't know who we are, but we may still shutdown the library */
6502
} else if (KMP_UBER_GTID(gtid)) {
6503
/* unregister ourselves as an uber thread. gtid is no longer valid */
6504
if (__kmp_root[gtid]->r.r_active) {
6505
__kmp_global.g.g_abort = -1;
6506
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507
__kmp_unregister_library();
6508
KA_TRACE(10,
6509
("__kmp_internal_end_library: root still active, abort T#%d\n",
6510
gtid));
6511
return;
6512
} else {
6513
__kmp_itthash_clean(__kmp_threads[gtid]);
6514
KA_TRACE(
6515
10,
6516
("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517
__kmp_unregister_root_current_thread(gtid);
6518
}
6519
} else {
6520
/* worker threads may call this function through the atexit handler, if they
6521
* call exit() */
6522
/* For now, skip the usual subsequent processing and just dump the debug buffer.
6523
TODO: do a thorough shutdown instead */
6524
#ifdef DUMP_DEBUG_ON_EXIT
6525
if (__kmp_debug_buf)
6526
__kmp_dump_debug_buffer();
6527
#endif
6528
// An unregister-library call was added here for the shm-based Linux path;
// without it, lots of files would be left behind in /dev/shm.
// Clean up the shared memory file before exiting.
6531
__kmp_unregister_library();
6532
return;
6533
}
6534
}
6535
/* synchronize the termination process */
6536
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537
6538
/* have we already finished */
6539
if (__kmp_global.g.g_abort) {
6540
KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541
/* TODO abort? */
6542
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543
return;
6544
}
6545
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547
return;
6548
}
6549
6550
/* We need this lock to enforce mutex between this reading of
6551
__kmp_threads_capacity and the writing by __kmp_register_root.
6552
Alternatively, we can use a counter of roots that is atomically updated by
6553
__kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554
__kmp_internal_end_*. */
6555
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556
6557
/* now we can safely conduct the actual termination */
6558
__kmp_internal_end();
6559
6560
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562
6563
KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564
6565
#ifdef DUMP_DEBUG_ON_EXIT
6566
if (__kmp_debug_buf)
6567
__kmp_dump_debug_buffer();
6568
#endif
6569
6570
#if KMP_OS_WINDOWS
6571
__kmp_close_console();
6572
#endif
6573
6574
__kmp_fini_allocator();
6575
6576
} // __kmp_internal_end_library
6577
6578
void __kmp_internal_end_thread(int gtid_req) {
6579
int i;
6580
6581
/* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582
/* this shouldn't be a race condition because __kmp_internal_end() is the
6583
* only place to clear __kmp_serial_init */
6584
/* we'll check this later too, after we get the lock */
6585
// 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586
// redundant, because the next check will work in any case.
6587
if (__kmp_global.g.g_abort) {
6588
KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589
/* TODO abort? */
6590
return;
6591
}
6592
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593
KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594
return;
6595
}
6596
6597
// If hidden helper team has been initialized, we need to deinit it
6598
if (TCR_4(__kmp_init_hidden_helper) &&
6599
!TCR_4(__kmp_hidden_helper_team_done)) {
6600
TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601
// First release the main thread to let it continue its work
6602
__kmp_hidden_helper_main_thread_release();
6603
// Wait until the hidden helper team has been destroyed
6604
__kmp_hidden_helper_threads_deinitz_wait();
6605
}
6606
6607
KMP_MB(); /* Flush all pending memory write invalidates. */
6608
6609
/* find out who we are and what we should do */
6610
{
6611
int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612
KA_TRACE(10,
6613
("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6614
if (gtid == KMP_GTID_SHUTDOWN) {
6615
KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616
"already shutdown\n"));
6617
return;
6618
} else if (gtid == KMP_GTID_MONITOR) {
6619
KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620
"registered, or system shutdown\n"));
6621
return;
6622
} else if (gtid == KMP_GTID_DNE) {
6623
KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624
"shutdown\n"));
6625
return;
6626
/* we don't know who we are */
6627
} else if (KMP_UBER_GTID(gtid)) {
6628
/* unregister ourselves as an uber thread. gtid is no longer valid */
6629
if (__kmp_root[gtid]->r.r_active) {
6630
__kmp_global.g.g_abort = -1;
6631
TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632
KA_TRACE(10,
6633
("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634
gtid));
6635
return;
6636
} else {
6637
KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638
gtid));
6639
__kmp_unregister_root_current_thread(gtid);
6640
}
6641
} else {
6642
/* just a worker thread, let's leave */
6643
KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644
6645
if (gtid >= 0) {
6646
__kmp_threads[gtid]->th.th_task_team = NULL;
6647
}
6648
6649
KA_TRACE(10,
6650
("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651
gtid));
6652
return;
6653
}
6654
}
6655
#if KMP_DYNAMIC_LIB
6656
if (__kmp_pause_status != kmp_hard_paused)
6657
// AC: let's not shut down the dynamic library at the exit of an uber thread,
// because it is better to shut down later in the library destructor.
6659
{
6660
KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661
return;
6662
}
6663
#endif
6664
/* synchronize the termination process */
6665
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666
6667
/* have we already finished */
6668
if (__kmp_global.g.g_abort) {
6669
KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670
/* TODO abort? */
6671
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672
return;
6673
}
6674
if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676
return;
6677
}
6678
6679
/* We need this lock to enforce mutex between this reading of
6680
__kmp_threads_capacity and the writing by __kmp_register_root.
6681
Alternatively, we can use a counter of roots that is atomically updated by
6682
__kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683
__kmp_internal_end_*. */
6684
6685
/* should we finish the run-time? are all siblings done? */
6686
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687
6688
for (i = 0; i < __kmp_threads_capacity; ++i) {
6689
if (KMP_UBER_GTID(i)) {
6690
KA_TRACE(
6691
10,
6692
("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695
return;
6696
}
6697
}
6698
6699
/* now we can safely conduct the actual termination */
6700
6701
__kmp_internal_end();
6702
6703
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705
6706
KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707
6708
#ifdef DUMP_DEBUG_ON_EXIT
6709
if (__kmp_debug_buf)
6710
__kmp_dump_debug_buffer();
6711
#endif
6712
} // __kmp_internal_end_thread
6713
6714
// -----------------------------------------------------------------------------
6715
// Library registration stuff.
6716
6717
static long __kmp_registration_flag = 0;
6718
// Random value used to indicate library initialization.
6719
static char *__kmp_registration_str = NULL;
6720
// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6721
6722
static inline char *__kmp_reg_status_name() {
6723
/* On RHEL 3u5, if linked statically, getpid() returns different values in
each thread. If registration and unregistration happen in different threads
(omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
cannot be found, because its name will contain a different pid. */
6727
// macOS* complains about name being too long with additional getuid()
6728
#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729
return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730
(int)getuid());
6731
#else
6732
return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733
#endif
6734
} // __kmp_reg_status_name
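// A minimal illustrative sketch (not part of the runtime): building the
// registration key name with plain snprintf. Only the Unix dynamic-library
// form (pid plus uid) from the #if above is shown; buffer size and helper
// names are arbitrary choices for the example.
#if 0
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static void example_reg_status_name(char *buf, size_t len) {
  snprintf(buf, len, "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), (int)getuid());
}

int main(void) {
  char name[64];
  example_reg_status_name(name, sizeof(name));
  printf("%s\n", name);
  return 0;
}
#endif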
6735
6736
#if defined(KMP_USE_SHM)
6737
bool __kmp_shm_available = false;
6738
bool __kmp_tmp_available = false;
6739
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6740
char *temp_reg_status_file_name = nullptr;
6741
#endif
6742
6743
void __kmp_register_library_startup(void) {
6744
6745
char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746
int done = 0;
6747
union {
6748
double dtime;
6749
long ltime;
6750
} time;
6751
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752
__kmp_initialize_system_tick();
6753
#endif
6754
__kmp_read_system_time(&time.dtime);
6755
__kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756
__kmp_registration_str =
6757
__kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758
__kmp_registration_flag, KMP_LIBRARY_FILE);
6759
6760
KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761
__kmp_registration_str));
6762
6763
while (!done) {
6764
6765
char *value = NULL; // Actual value of the environment variable.
6766
6767
#if defined(KMP_USE_SHM)
6768
char *shm_name = nullptr;
6769
char *data1 = nullptr;
6770
__kmp_shm_available = __kmp_detect_shm();
6771
if (__kmp_shm_available) {
6772
int fd1 = -1;
6773
shm_name = __kmp_str_format("/%s", name);
6774
int shm_preexist = 0;
6775
fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776
if ((fd1 == -1) && (errno == EEXIST)) {
6777
// file didn't open because it already exists.
6778
// try opening existing file
6779
fd1 = shm_open(shm_name, O_RDWR, 0600);
6780
if (fd1 == -1) { // file didn't open
6781
KMP_WARNING(FunctionError, "Can't open SHM");
6782
__kmp_shm_available = false;
6783
} else { // able to open existing file
6784
shm_preexist = 1;
6785
}
6786
}
6787
if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788
if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
6789
KMP_WARNING(FunctionError, "Can't set size of SHM");
6790
__kmp_shm_available = false;
6791
}
6792
}
6793
if (__kmp_shm_available) { // SHM exists, now map it
6794
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795
fd1, 0);
6796
if (data1 == MAP_FAILED) { // failed to map shared memory
6797
KMP_WARNING(FunctionError, "Can't map SHM");
6798
__kmp_shm_available = false;
6799
}
6800
}
6801
if (__kmp_shm_available) { // SHM mapped
6802
if (shm_preexist == 0) { // set data to SHM, set value
6803
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804
}
6805
// Read value from either what we just wrote or existing file.
6806
value = __kmp_str_format("%s", data1); // read value from SHM
6807
munmap(data1, SHM_SIZE);
6808
}
6809
if (fd1 != -1)
6810
close(fd1);
6811
}
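// A hedged, standalone sketch (kept inert in #if 0) of the /dev/shm claim
// protocol used just above: exclusive create with O_CREAT|O_EXCL, fall back
// to opening the existing object on EEXIST, size it, map it, then either
// publish our string or read the current owner's. The object name, value,
// and 1024-byte size are local choices for the example.
#if 0
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define EXAMPLE_SHM_SIZE 1024

int main(void) {
  const char *shm_name = "/example_registration"; // hypothetical name
  const char *my_value = "example-registration-string";
  int preexist = 0;

  int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
  if (fd == -1 && errno == EEXIST) { // somebody else created it first
    preexist = 1;
    fd = shm_open(shm_name, O_RDWR, 0600);
  }
  if (fd == -1) { perror("shm_open"); return 1; }

  if (!preexist && ftruncate(fd, EXAMPLE_SHM_SIZE) == -1) {
    perror("ftruncate"); close(fd); return 1;
  }

  char *data = (char *)mmap(0, EXAMPLE_SHM_SIZE, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

  if (!preexist)
    strncpy(data, my_value, EXAMPLE_SHM_SIZE - 1); // publish our value
  printf("registered value: %s\n", data);          // ours or the owner's

  munmap(data, EXAMPLE_SHM_SIZE);
  close(fd);
  if (!preexist)
    shm_unlink(shm_name); // clean up only if we created it
  return 0;
}
#endif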
6812
if (!__kmp_shm_available)
6813
__kmp_tmp_available = __kmp_detect_tmp();
6814
if (!__kmp_shm_available && __kmp_tmp_available) {
6815
// SHM failed to work due to an error other than that the file already
6816
// exists. Try to create a temp file under /tmp.
6817
// If /tmp isn't accessible, fall back to using environment variable.
6818
// TODO: /tmp might not always be the temporary directory. For now we will
6819
// not consider TMPDIR.
6820
int fd1 = -1;
6821
temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822
int tmp_preexist = 0;
6823
fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824
if ((fd1 == -1) && (errno == EEXIST)) {
6825
// file didn't open because it already exists.
6826
// try opening existing file
6827
fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828
if (fd1 == -1) { // file didn't open
6829
KMP_WARNING(FunctionError, "Can't open TEMP");
6830
__kmp_tmp_available = false;
6831
} else {
6832
tmp_preexist = 1;
6833
}
6834
}
6835
if (__kmp_tmp_available && tmp_preexist == 0) {
6836
// we created the /tmp file; now set its size
if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size;
6838
KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839
__kmp_tmp_available = false;
6840
}
6841
}
6842
if (__kmp_tmp_available) {
6843
data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844
fd1, 0);
6845
if (data1 == MAP_FAILED) { // failed to map /tmp
6846
KMP_WARNING(FunctionError, "Can't map /tmp");
6847
__kmp_tmp_available = false;
6848
}
6849
}
6850
if (__kmp_tmp_available) {
6851
if (tmp_preexist == 0) { // set data to TMP, set value
6852
KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853
}
6854
// Read value from either what we just wrote or existing file.
6855
value = __kmp_str_format("%s", data1); // read value from SHM
6856
munmap(data1, SHM_SIZE);
6857
}
6858
if (fd1 != -1)
6859
close(fd1);
6860
}
6861
if (!__kmp_shm_available && !__kmp_tmp_available) {
6862
// no /dev/shm and no /tmp -- fall back to environment variable
6863
// Set environment variable, but do not overwrite if it exists.
6864
__kmp_env_set(name, __kmp_registration_str, 0);
6865
// read value to see if it got set
6866
value = __kmp_env_get(name);
6867
}
6868
#else // Windows and unix with static library
6869
// Set environment variable, but do not overwrite if it exists.
6870
__kmp_env_set(name, __kmp_registration_str, 0);
6871
// read value to see if it got set
6872
value = __kmp_env_get(name);
6873
#endif
6874
6875
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876
done = 1; // Ok, environment variable set successfully, exit the loop.
6877
} else {
6878
// Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879
// Check whether it is alive or dead.
6880
int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881
char *tail = value;
6882
char *flag_addr_str = NULL;
6883
char *flag_val_str = NULL;
6884
char const *file_name = NULL;
6885
__kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886
__kmp_str_split(tail, '-', &flag_val_str, &tail);
6887
file_name = tail;
6888
if (tail != NULL) {
6889
unsigned long *flag_addr = 0;
6890
unsigned long flag_val = 0;
6891
KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892
KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893
if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894
// First, check whether environment-encoded address is mapped into
6895
// addr space.
6896
// If so, dereference it to see if it still has the right value.
6897
if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898
neighbor = 1;
6899
} else {
6900
// If not, then we know the other copy of the library is no longer
6901
// running.
6902
neighbor = 2;
6903
}
6904
}
6905
}
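// A minimal illustrative sketch (not part of the runtime) of the registration
// string round trip handled above: format "<flag address>-<flag value in
// hex>-<library file>", then split and sscanf it back and compare. The real
// code additionally probes whether the address is still mapped
// (__kmp_is_address_mapped), which is platform-specific and not reproduced
// here; names and values are hypothetical.
#if 0
#include <stdio.h>
#include <string.h>

static unsigned long example_flag = 0xCAFE1234UL;

int main(void) {
  char value[256];
  snprintf(value, sizeof(value), "%p-%lx-%s", (void *)&example_flag,
           example_flag, "libexample.so");

  void *flag_addr = NULL;
  unsigned long flag_val = 0;
  char file_name[128] = "";
  if (sscanf(value, "%p-%lx-%127[^\n]", &flag_addr, &flag_val, file_name) == 3 &&
      flag_addr == (void *)&example_flag && flag_val == example_flag) {
    printf("neighbor looks alive: %s\n", file_name);
  } else {
    printf("stale or unparsable registration\n");
  }
  return 0;
}
#endif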
6906
switch (neighbor) {
6907
case 0: // Cannot parse environment variable -- neighbor status unknown.
6908
// Assume it is the incompatible format of a future version of the
// library. Assume the other library is alive.
6910
// WARN( ... ); // TODO: Issue a warning.
6911
file_name = "unknown library";
6912
KMP_FALLTHROUGH();
6913
// Attention! Falling through to the next case. That's intentional.
6914
case 1: { // Neighbor is alive.
6915
// Check it is allowed.
6916
char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917
if (!__kmp_str_match_true(duplicate_ok)) {
6918
// That's not allowed. Issue fatal error.
6919
__kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920
KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921
}
6922
KMP_INTERNAL_FREE(duplicate_ok);
6923
__kmp_duplicate_library_ok = 1;
6924
done = 1; // Exit the loop.
6925
} break;
6926
case 2: { // Neighbor is dead.
6927
6928
#if defined(KMP_USE_SHM)
6929
if (__kmp_shm_available) { // close shared memory.
6930
shm_unlink(shm_name); // this removes file in /dev/shm
6931
} else if (__kmp_tmp_available) {
6932
unlink(temp_reg_status_file_name); // this removes the temp file
6933
} else {
6934
// Clear the variable and try to register library again.
6935
__kmp_env_unset(name);
6936
}
6937
#else
6938
// Clear the variable and try to register library again.
6939
__kmp_env_unset(name);
6940
#endif
6941
} break;
6942
default: {
6943
KMP_DEBUG_ASSERT(0);
6944
} break;
6945
}
6946
}
6947
KMP_INTERNAL_FREE((void *)value);
6948
#if defined(KMP_USE_SHM)
6949
if (shm_name)
6950
KMP_INTERNAL_FREE((void *)shm_name);
6951
#endif
6952
} // while
6953
KMP_INTERNAL_FREE((void *)name);
6954
6955
} // func __kmp_register_library_startup
6956
6957
void __kmp_unregister_library(void) {
6958
6959
char *name = __kmp_reg_status_name();
6960
char *value = NULL;
6961
6962
#if defined(KMP_USE_SHM)
6963
char *shm_name = nullptr;
6964
int fd1;
6965
if (__kmp_shm_available) {
6966
shm_name = __kmp_str_format("/%s", name);
6967
fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968
if (fd1 != -1) { // File opened successfully
6969
char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970
if (data1 != MAP_FAILED) {
6971
value = __kmp_str_format("%s", data1); // read value from SHM
6972
munmap(data1, SHM_SIZE);
6973
}
6974
close(fd1);
6975
}
6976
} else if (__kmp_tmp_available) { // try /tmp
6977
fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978
if (fd1 != -1) { // File opened successfully
6979
char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980
if (data1 != MAP_FAILED) {
6981
value = __kmp_str_format("%s", data1); // read value from /tmp
6982
munmap(data1, SHM_SIZE);
6983
}
6984
close(fd1);
6985
}
6986
} else { // fall back to environment variable
6987
value = __kmp_env_get(name);
6988
}
6989
#else
6990
value = __kmp_env_get(name);
6991
#endif
6992
6993
KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994
KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996
// Ok, this is our variable. Delete it.
6997
#if defined(KMP_USE_SHM)
6998
if (__kmp_shm_available) {
6999
shm_unlink(shm_name); // this removes file in /dev/shm
7000
} else if (__kmp_tmp_available) {
7001
unlink(temp_reg_status_file_name); // this removes the temp file
7002
} else {
7003
__kmp_env_unset(name);
7004
}
7005
#else
7006
__kmp_env_unset(name);
7007
#endif
7008
}
7009
7010
#if defined(KMP_USE_SHM)
7011
if (shm_name)
7012
KMP_INTERNAL_FREE(shm_name);
7013
if (temp_reg_status_file_name)
7014
KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015
#endif
7016
7017
KMP_INTERNAL_FREE(__kmp_registration_str);
7018
KMP_INTERNAL_FREE(value);
7019
KMP_INTERNAL_FREE(name);
7020
7021
__kmp_registration_flag = 0;
7022
__kmp_registration_str = NULL;
7023
7024
} // __kmp_unregister_library
7025
7026
// End of Library registration stuff.
7027
// -----------------------------------------------------------------------------
7028
7029
#if KMP_MIC_SUPPORTED
7030
7031
static void __kmp_check_mic_type() {
7032
kmp_cpuid_t cpuid_state = {0};
7033
kmp_cpuid_t *cs_p = &cpuid_state;
7034
__kmp_x86_cpuid(1, 0, cs_p);
7035
// We don't support mic1 at the moment
7036
if ((cs_p->eax & 0xff0) == 0xB10) {
7037
__kmp_mic_type = mic2;
7038
} else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039
__kmp_mic_type = mic3;
7040
} else {
7041
__kmp_mic_type = non_mic;
7042
}
7043
}
7044
7045
#endif /* KMP_MIC_SUPPORTED */
7046
7047
#if KMP_HAVE_UMWAIT
7048
static void __kmp_user_level_mwait_init() {
7049
struct kmp_cpuid buf;
7050
__kmp_x86_cpuid(7, 0, &buf);
7051
__kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052
__kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053
__kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055
__kmp_umwait_enabled));
7056
}
7057
#elif KMP_HAVE_MWAIT
7058
#ifndef AT_INTELPHIUSERMWAIT
7059
// Spurious, non-existent value that should always fail to return anything.
7060
// Will be replaced with the correct value when we know that.
7061
#define AT_INTELPHIUSERMWAIT 10000
7062
#endif
7063
// getauxval() function is available in RHEL7 and SLES12. If a system with an
7064
// earlier OS is used to build the RTL, we'll use the following internal
7065
// function when the entry is not found.
7066
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7067
unsigned long getauxval(unsigned long) { return 0; }
7068
7069
static void __kmp_user_level_mwait_init() {
7070
// When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071
// use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072
// set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073
// KMP_USER_LEVEL_MWAIT was set to TRUE.
7074
if (__kmp_mic_type == mic3) {
7075
unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076
if ((res & 0x1) || __kmp_user_level_mwait) {
7077
__kmp_mwait_enabled = TRUE;
7078
if (__kmp_user_level_mwait) {
7079
KMP_INFORM(EnvMwaitWarn);
7080
}
7081
} else {
7082
__kmp_mwait_enabled = FALSE;
7083
}
7084
}
7085
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086
"__kmp_mwait_enabled = %d\n",
7087
__kmp_mic_type, __kmp_mwait_enabled));
7088
}
7089
#endif /* KMP_HAVE_UMWAIT */
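// A minimal illustrative sketch (not part of the runtime): detecting the
// WAITPKG (umwait/tpause) feature the same way as the waitpkg variant of
// __kmp_user_level_mwait_init above -- CPUID leaf 7, sub-leaf 0, ECX bit 5 --
// using GCC/Clang's <cpuid.h> helper instead of __kmp_x86_cpuid. Assumes an
// x86 target; names are hypothetical.
#if 0
#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  int waitpkg = 0;
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    waitpkg = (ecx >> 5) & 1; // ECX bit 5 = WAITPKG
  printf("waitpkg supported: %d\n", waitpkg);
  return 0;
}
#endif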
7090
7091
static void __kmp_do_serial_initialize(void) {
7092
int i, gtid;
7093
size_t size;
7094
7095
KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7096
7097
KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7098
KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7099
KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7100
KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7101
KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7102
7103
#if OMPT_SUPPORT
7104
ompt_pre_init();
7105
#endif
7106
#if OMPD_SUPPORT
7107
__kmp_env_dump();
7108
ompd_init();
7109
#endif
7110
7111
__kmp_validate_locks();
7112
7113
#if ENABLE_LIBOMPTARGET
7114
/* Initialize functions from libomptarget */
7115
__kmp_init_omptarget();
7116
#endif
7117
7118
/* Initialize internal memory allocator */
7119
__kmp_init_allocator();
7120
7121
/* Register the library startup via an environment variable or via mapped
7122
shared memory file and check to see whether another copy of the library is
7123
already registered. Since a forked child process is often terminated, we
postpone the registration until middle initialization in the child. */
7125
if (__kmp_need_register_serial)
7126
__kmp_register_library_startup();
7127
7128
/* TODO reinitialization of library */
7129
if (TCR_4(__kmp_global.g.g_done)) {
7130
KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7131
}
7132
7133
__kmp_global.g.g_abort = 0;
7134
TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7135
7136
/* initialize the locks */
7137
#if KMP_USE_ADAPTIVE_LOCKS
7138
#if KMP_DEBUG_ADAPTIVE_LOCKS
7139
__kmp_init_speculative_stats();
7140
#endif
7141
#endif
7142
#if KMP_STATS_ENABLED
7143
__kmp_stats_init();
7144
#endif
7145
__kmp_init_lock(&__kmp_global_lock);
7146
__kmp_init_queuing_lock(&__kmp_dispatch_lock);
7147
__kmp_init_lock(&__kmp_debug_lock);
7148
__kmp_init_atomic_lock(&__kmp_atomic_lock);
7149
__kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7150
__kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7151
__kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7152
__kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7153
__kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7154
__kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7155
__kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7156
__kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7157
__kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7158
__kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7159
__kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7160
__kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7161
__kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7162
__kmp_init_bootstrap_lock(&__kmp_exit_lock);
7163
#if KMP_USE_MONITOR
7164
__kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7165
#endif
7166
__kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7167
7168
/* conduct initialization and initial setup of configuration */
7169
7170
__kmp_runtime_initialize();
7171
7172
#if KMP_MIC_SUPPORTED
7173
__kmp_check_mic_type();
7174
#endif
7175
7176
// Some global variable initialization moved here from kmp_env_initialize()
7177
#ifdef KMP_DEBUG
7178
kmp_diag = 0;
7179
#endif
7180
__kmp_abort_delay = 0;
7181
7182
// From __kmp_init_dflt_team_nth()
7183
/* assume the entire machine will be used */
7184
__kmp_dflt_team_nth_ub = __kmp_xproc;
7185
if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7186
__kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7187
}
7188
if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7189
__kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7190
}
7191
__kmp_max_nth = __kmp_sys_max_nth;
7192
__kmp_cg_max_nth = __kmp_sys_max_nth;
7193
__kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7194
if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7195
__kmp_teams_max_nth = __kmp_sys_max_nth;
7196
}
7197
7198
// Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7199
// part
7200
__kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7201
#if KMP_USE_MONITOR
7202
__kmp_monitor_wakeups =
7203
KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7204
__kmp_bt_intervals =
7205
KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7206
#endif
7207
// From "KMP_LIBRARY" part of __kmp_env_initialize()
7208
__kmp_library = library_throughput;
7209
// From KMP_SCHEDULE initialization
7210
__kmp_static = kmp_sch_static_balanced;
7211
// AC: do not use analytical here, because it is non-monotonous
7212
//__kmp_guided = kmp_sch_guided_iterative_chunked;
7213
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7214
// need to repeat assignment
7215
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7216
// bit control and barrier method control parts
7217
#if KMP_FAST_REDUCTION_BARRIER
7218
#define kmp_reduction_barrier_gather_bb ((int)1)
7219
#define kmp_reduction_barrier_release_bb ((int)1)
7220
#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7221
#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7222
#endif // KMP_FAST_REDUCTION_BARRIER
7223
for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7224
__kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7225
__kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7226
__kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7227
__kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7228
#if KMP_FAST_REDUCTION_BARRIER
7229
if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7230
// lin_64 ): hyper,1
7231
__kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7232
__kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7233
__kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7234
__kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7235
}
7236
#endif // KMP_FAST_REDUCTION_BARRIER
7237
}
7238
#if KMP_FAST_REDUCTION_BARRIER
7239
#undef kmp_reduction_barrier_release_pat
7240
#undef kmp_reduction_barrier_gather_pat
7241
#undef kmp_reduction_barrier_release_bb
7242
#undef kmp_reduction_barrier_gather_bb
7243
#endif // KMP_FAST_REDUCTION_BARRIER
7244
#if KMP_MIC_SUPPORTED
7245
if (__kmp_mic_type == mic2) { // KNC
7246
// AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7247
__kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7248
__kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7249
1; // forkjoin release
7250
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7251
__kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252
}
7253
#if KMP_FAST_REDUCTION_BARRIER
7254
if (__kmp_mic_type == mic2) { // KNC
7255
__kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7256
__kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257
}
7258
#endif // KMP_FAST_REDUCTION_BARRIER
7259
#endif // KMP_MIC_SUPPORTED
7260
7261
// From KMP_CHECKS initialization
7262
#ifdef KMP_DEBUG
7263
__kmp_env_checks = TRUE; /* development versions have the extra checks */
7264
#else
7265
__kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7266
#endif
7267
7268
// From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7269
__kmp_foreign_tp = TRUE;
7270
7271
__kmp_global.g.g_dynamic = FALSE;
7272
__kmp_global.g.g_dynamic_mode = dynamic_default;
7273
7274
__kmp_init_nesting_mode();
7275
7276
__kmp_env_initialize(NULL);
7277
7278
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7279
__kmp_user_level_mwait_init();
7280
#endif
7281
// Print all messages in message catalog for testing purposes.
7282
#ifdef KMP_DEBUG
7283
char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7284
if (__kmp_str_match_true(val)) {
7285
kmp_str_buf_t buffer;
7286
__kmp_str_buf_init(&buffer);
7287
__kmp_i18n_dump_catalog(&buffer);
7288
__kmp_printf("%s", buffer.str);
7289
__kmp_str_buf_free(&buffer);
7290
}
7291
__kmp_env_free(&val);
7292
#endif
7293
7294
__kmp_threads_capacity =
7295
__kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7296
// Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7297
__kmp_tp_capacity = __kmp_default_tp_capacity(
7298
__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7299
7300
// If the library is shut down properly, both pools must be NULL. Just in
7301
// case, set them to NULL -- some memory may leak, but subsequent code will
7302
// work even if pools are not freed.
7303
KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7304
KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7305
KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7306
__kmp_thread_pool = NULL;
7307
__kmp_thread_pool_insert_pt = NULL;
7308
__kmp_team_pool = NULL;
7309
7310
/* Allocate all of the variable sized records */
7311
/* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7312
* expandable */
7313
/* Since allocation is cache-aligned, just add extra padding at the end */
7314
size =
7315
(sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7316
CACHE_LINE;
7317
__kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7318
__kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7319
sizeof(kmp_info_t *) * __kmp_threads_capacity);
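// A minimal illustrative sketch (not part of the runtime) of the layout set
// up just above: two pointer arrays carved out of one padded allocation, with
// the root array starting right after the threads array. The padding constant
// and type names are hypothetical stand-ins for CACHE_LINE, kmp_info_t and
// kmp_root_t.
#if 0
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct ExampleThread; // stand-in for kmp_info_t
struct ExampleRoot;   // stand-in for kmp_root_t

int main() {
  const int capacity = 8;
  const size_t pad = 64; // stand-in for CACHE_LINE
  size_t size =
      (sizeof(ExampleThread *) + sizeof(ExampleRoot *)) * capacity + pad;

  ExampleThread **threads = static_cast<ExampleThread **>(std::malloc(size));
  std::memset(threads, 0, size);
  // The root-pointer array lives immediately after the thread-pointer array.
  ExampleRoot **roots = reinterpret_cast<ExampleRoot **>(
      reinterpret_cast<char *>(threads) + sizeof(ExampleThread *) * capacity);

  std::printf("threads at %p, roots at %p\n", (void *)threads, (void *)roots);
  std::free(threads);
}
#endif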
7320
7321
/* init thread counts */
7322
KMP_DEBUG_ASSERT(__kmp_all_nth ==
7323
0); // Asserts fail if the library is reinitializing and
7324
KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7325
__kmp_all_nth = 0;
7326
__kmp_nth = 0;
7327
7328
/* setup the uber master thread and hierarchy */
7329
gtid = __kmp_register_root(TRUE);
7330
KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7331
KMP_ASSERT(KMP_UBER_GTID(gtid));
7332
KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7333
7334
KMP_MB(); /* Flush all pending memory write invalidates. */
7335
7336
__kmp_common_initialize();
7337
7338
#if KMP_OS_UNIX
7339
/* invoke the child fork handler */
7340
__kmp_register_atfork();
7341
#endif
7342
7343
#if !KMP_DYNAMIC_LIB || \
7344
((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7345
{
7346
/* Invoke the exit handler when the program finishes, only for static
7347
library and macOS* dynamic. For other dynamic libraries, we already
7348
have _fini and DllMain. */
7349
int rc = atexit(__kmp_internal_end_atexit);
7350
if (rc != 0) {
7351
__kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7352
__kmp_msg_null);
7353
}
7354
}
7355
#endif
7356
7357
#if KMP_HANDLE_SIGNALS
7358
#if KMP_OS_UNIX
7359
/* NOTE: make sure that this is called before the user installs their own
7360
signal handlers so that the user handlers are called first. this way they
7361
can return false, not call our handler, avoid terminating the library, and
7362
continue execution where they left off. */
7363
__kmp_install_signals(FALSE);
7364
#endif /* KMP_OS_UNIX */
7365
#if KMP_OS_WINDOWS
7366
__kmp_install_signals(TRUE);
7367
#endif /* KMP_OS_WINDOWS */
7368
#endif
7369
7370
/* we have finished the serial initialization */
7371
__kmp_init_counter++;
7372
7373
__kmp_init_serial = TRUE;
7374
7375
if (__kmp_version) {
7376
__kmp_print_version_1();
7377
}
7378
7379
if (__kmp_settings) {
7380
__kmp_env_print();
7381
}
7382
7383
if (__kmp_display_env || __kmp_display_env_verbose) {
7384
__kmp_env_print_2();
7385
}
7386
7387
#if OMPT_SUPPORT
7388
ompt_post_init();
7389
#endif
7390
7391
KMP_MB();
7392
7393
KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7394
}
7395
7396
void __kmp_serial_initialize(void) {
7397
if (__kmp_init_serial) {
7398
return;
7399
}
7400
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401
if (__kmp_init_serial) {
7402
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403
return;
7404
}
7405
__kmp_do_serial_initialize();
7406
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407
}
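// A hedged illustrative sketch (not part of the runtime) of the
// double-checked initialization idiom used by __kmp_serial_initialize and the
// middle/parallel variants below: an unlocked fast-path check, then a
// re-check under the lock before doing the one-time work. Written with
// standard C++11 primitives instead of the runtime's bootstrap locks and
// TCR/TCW macros; all names are hypothetical.
#if 0
#include <atomic>
#include <cstdio>
#include <mutex>

static std::atomic<bool> example_init_done{false};
static std::mutex example_init_lock;

static void example_do_initialize() { std::puts("one-time initialization"); }

static void example_initialize() {
  if (example_init_done.load(std::memory_order_acquire))
    return; // fast path, no lock taken
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (example_init_done.load(std::memory_order_relaxed))
    return; // someone else won the race
  example_do_initialize();
  example_init_done.store(true, std::memory_order_release);
}

int main() {
  example_initialize();
  example_initialize(); // second call returns immediately
}
#endif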
7408
7409
static void __kmp_do_middle_initialize(void) {
7410
int i, j;
7411
int prev_dflt_team_nth;
7412
7413
if (!__kmp_init_serial) {
7414
__kmp_do_serial_initialize();
7415
}
7416
7417
KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418
7419
if (UNLIKELY(!__kmp_need_register_serial)) {
7420
// We are in a forked child process. The registration was skipped during
7421
// serial initialization in __kmp_atfork_child handler. Do it here.
7422
__kmp_register_library_startup();
7423
}
7424
7425
// Save the previous value for the __kmp_dflt_team_nth so that
7426
// we can avoid some reinitialization if it hasn't changed.
7427
prev_dflt_team_nth = __kmp_dflt_team_nth;
7428
7429
#if KMP_AFFINITY_SUPPORTED
7430
// __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431
// number of cores on the machine.
7432
__kmp_affinity_initialize(__kmp_affinity);
7433
7434
#endif /* KMP_AFFINITY_SUPPORTED */
7435
7436
KMP_ASSERT(__kmp_xproc > 0);
7437
if (__kmp_avail_proc == 0) {
7438
__kmp_avail_proc = __kmp_xproc;
7439
}
7440
7441
// If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442
// correct them now
7443
j = 0;
7444
while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445
__kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446
__kmp_avail_proc;
7447
j++;
7448
}
7449
7450
if (__kmp_dflt_team_nth == 0) {
7451
#ifdef KMP_DFLT_NTH_CORES
7452
// Default #threads = #cores
7453
__kmp_dflt_team_nth = __kmp_ncores;
7454
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455
"__kmp_ncores (%d)\n",
7456
__kmp_dflt_team_nth));
7457
#else
7458
// Default #threads = #available OS procs
7459
__kmp_dflt_team_nth = __kmp_avail_proc;
7460
KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461
"__kmp_avail_proc(%d)\n",
7462
__kmp_dflt_team_nth));
7463
#endif /* KMP_DFLT_NTH_CORES */
7464
}
7465
7466
if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467
__kmp_dflt_team_nth = KMP_MIN_NTH;
7468
}
7469
if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470
__kmp_dflt_team_nth = __kmp_sys_max_nth;
7471
}
7472
7473
if (__kmp_nesting_mode > 0)
7474
__kmp_set_nesting_mode_threads();
7475
7476
// There's no harm in continuing if the following check fails,
7477
// but it indicates an error in the previous logic.
7478
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479
7480
if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481
// Run through the __kmp_threads array and set the num threads icv for each
7482
// root thread that is currently registered with the RTL (which has not
7483
// already explicitly set its nthreads-var with a call to
7484
// omp_set_num_threads()).
7485
for (i = 0; i < __kmp_threads_capacity; i++) {
7486
kmp_info_t *thread = __kmp_threads[i];
7487
if (thread == NULL)
7488
continue;
7489
if (thread->th.th_current_task->td_icvs.nproc != 0)
7490
continue;
7491
7492
set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493
}
7494
}
7495
KA_TRACE(
7496
20,
7497
("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498
__kmp_dflt_team_nth));
7499
7500
#ifdef KMP_ADJUST_BLOCKTIME
7501
/* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7502
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503
KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504
if (__kmp_nth > __kmp_avail_proc) {
7505
__kmp_zero_bt = TRUE;
7506
}
7507
}
7508
#endif /* KMP_ADJUST_BLOCKTIME */
7509
7510
/* we have finished middle initialization */
7511
TCW_SYNC_4(__kmp_init_middle, TRUE);
7512
7513
KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514
}
7515
7516
void __kmp_middle_initialize(void) {
7517
if (__kmp_init_middle) {
7518
return;
7519
}
7520
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521
if (__kmp_init_middle) {
7522
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523
return;
7524
}
7525
__kmp_do_middle_initialize();
7526
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527
}
7528
7529
void __kmp_parallel_initialize(void) {
7530
int gtid = __kmp_entry_gtid(); // this might be a new root
7531
7532
/* synchronize parallel initialization (for sibling) */
7533
if (TCR_4(__kmp_init_parallel))
7534
return;
7535
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536
if (TCR_4(__kmp_init_parallel)) {
7537
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538
return;
7539
}
7540
7541
/* TODO reinitialization after we have already shut down */
7542
if (TCR_4(__kmp_global.g.g_done)) {
7543
KA_TRACE(
7544
10,
7545
("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546
__kmp_infinite_loop();
7547
}
7548
7549
/* jc: The lock __kmp_initz_lock is already held, so calling
7550
__kmp_serial_initialize would cause a deadlock. So we call
7551
__kmp_do_serial_initialize directly. */
7552
if (!__kmp_init_middle) {
7553
__kmp_do_middle_initialize();
7554
}
7555
__kmp_assign_root_init_mask();
7556
__kmp_resume_if_hard_paused();
7557
7558
/* begin initialization */
7559
KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560
KMP_ASSERT(KMP_UBER_GTID(gtid));
7561
7562
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563
// Save the FP control regs.
7564
// Worker threads will set theirs to these values at thread startup.
7565
__kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566
__kmp_store_mxcsr(&__kmp_init_mxcsr);
7567
__kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569
7570
#if KMP_OS_UNIX
7571
#if KMP_HANDLE_SIGNALS
7572
/* must be after __kmp_serial_initialize */
7573
__kmp_install_signals(TRUE);
7574
#endif
7575
#endif
7576
7577
__kmp_suspend_initialize();
7578
7579
#if defined(USE_LOAD_BALANCE)
7580
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581
__kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582
}
7583
#else
7584
if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585
__kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586
}
7587
#endif
7588
7589
if (__kmp_version) {
7590
__kmp_print_version_2();
7591
}
7592
7593
/* we have finished parallel initialization */
7594
TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595
7596
KMP_MB();
7597
KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598
7599
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600
}
7601
7602
void __kmp_hidden_helper_initialize() {
7603
if (TCR_4(__kmp_init_hidden_helper))
7604
return;
7605
7606
// __kmp_parallel_initialize is required before we initialize hidden helper
7607
if (!TCR_4(__kmp_init_parallel))
7608
__kmp_parallel_initialize();
7609
7610
// Double check. Note that this double check should not be placed before
7611
// __kmp_parallel_initialize as it will cause a deadlock.
7612
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613
if (TCR_4(__kmp_init_hidden_helper)) {
7614
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615
return;
7616
}
7617
7618
#if KMP_AFFINITY_SUPPORTED
7619
// Initialize hidden helper affinity settings.
7620
// The above __kmp_parallel_initialize() will initialize
7621
// regular affinity (and topology) if not already done.
7622
if (!__kmp_hh_affinity.flags.initialized)
7623
__kmp_affinity_initialize(__kmp_hh_affinity);
7624
#endif
7625
7626
// Set the count of hidden helper tasks to be executed to zero
7627
KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628
7629
// Set the global variable indicating that we're initializing hidden helper
7630
// team/threads
7631
TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632
7633
// Platform independent initialization
7634
__kmp_do_initialize_hidden_helper_threads();
7635
7636
// Wait here for the finish of initialization of hidden helper teams
7637
__kmp_hidden_helper_threads_initz_wait();
7638
7639
// We have finished hidden helper initialization
7640
TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641
7642
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643
}
7644
7645
/* ------------------------------------------------------------------------ */
7646
7647
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648
kmp_team_t *team) {
7649
kmp_disp_t *dispatch;
7650
7651
KMP_MB();
7652
7653
/* none of the threads have encountered any constructs, yet. */
7654
this_thr->th.th_local.this_construct = 0;
7655
#if KMP_CACHE_MANAGE
7656
KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657
#endif /* KMP_CACHE_MANAGE */
7658
dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659
KMP_DEBUG_ASSERT(dispatch);
7660
KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661
// KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662
// this_thr->th.th_info.ds.ds_tid ] );
7663
7664
dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665
dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666
if (__kmp_env_consistency_check)
7667
__kmp_push_parallel(gtid, team->t.t_ident);
7668
7669
KMP_MB(); /* Flush all pending memory write invalidates. */
7670
}
7671
7672
void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673
kmp_team_t *team) {
7674
if (__kmp_env_consistency_check)
7675
__kmp_pop_parallel(gtid, team->t.t_ident);
7676
7677
__kmp_finish_implicit_task(this_thr);
7678
}
7679
7680
int __kmp_invoke_task_func(int gtid) {
7681
int rc;
7682
int tid = __kmp_tid_from_gtid(gtid);
7683
kmp_info_t *this_thr = __kmp_threads[gtid];
7684
kmp_team_t *team = this_thr->th.th_team;
7685
7686
__kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687
#if USE_ITT_BUILD
7688
if (__itt_stack_caller_create_ptr) {
7689
// inform ittnotify about entering user's code
7690
if (team->t.t_stack_id != NULL) {
7691
__kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692
} else {
7693
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694
__kmp_itt_stack_callee_enter(
7695
(__itt_caller)team->t.t_parent->t.t_stack_id);
7696
}
7697
}
7698
#endif /* USE_ITT_BUILD */
7699
#if INCLUDE_SSC_MARKS
7700
SSC_MARK_INVOKING();
7701
#endif
7702
7703
#if OMPT_SUPPORT
7704
void *dummy;
7705
void **exit_frame_p;
7706
ompt_data_t *my_task_data;
7707
ompt_data_t *my_parallel_data;
7708
int ompt_team_size;
7709
7710
if (ompt_enabled.enabled) {
7711
exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712
.ompt_task_info.frame.exit_frame.ptr);
7713
} else {
7714
exit_frame_p = &dummy;
7715
}
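// A minimal illustrative sketch (not part of the runtime) of the pattern used
// for exit_frame_p just above: point at either the real tool-visible slot or
// a local dummy, so the later writes around the microtask invocation need no
// "is the tool enabled?" branch. All names are hypothetical.
#if 0
#include <cstdio>

struct ExampleFrame { void *exit_frame; };

static void example_invoke(bool tool_enabled, ExampleFrame *real_frame) {
  void *dummy;
  // Real slot when a tool is watching, otherwise a throwaway local.
  void **exit_frame_p = tool_enabled ? &real_frame->exit_frame : &dummy;

  *exit_frame_p = (void *)&dummy; // non-NULL marker: "entered user code"
  if (tool_enabled)
    std::printf("tool sees exit frame %p\n", real_frame->exit_frame);
  *exit_frame_p = nullptr;        // cleared on the way out, also unconditional
}

int main() {
  ExampleFrame f = {nullptr};
  example_invoke(true, &f);
  example_invoke(false, &f);
}
#endif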
7716
7717
my_task_data =
7718
&(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719
my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720
if (ompt_enabled.ompt_callback_implicit_task) {
7721
ompt_team_size = team->t.t_nproc;
7722
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723
ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724
__kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725
OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726
}
7727
#endif
7728
7729
#if KMP_STATS_ENABLED
7730
stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731
if (previous_state == stats_state_e::TEAMS_REGION) {
7732
KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733
} else {
7734
KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735
}
7736
KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737
#endif
7738
7739
rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740
tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741
#if OMPT_SUPPORT
7742
,
7743
exit_frame_p
7744
#endif
7745
);
7746
#if OMPT_SUPPORT
7747
*exit_frame_p = NULL;
7748
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749
#endif
7750
7751
#if KMP_STATS_ENABLED
7752
if (previous_state == stats_state_e::TEAMS_REGION) {
7753
KMP_SET_THREAD_STATE(previous_state);
7754
}
7755
KMP_POP_PARTITIONED_TIMER();
7756
#endif
7757
7758
#if USE_ITT_BUILD
7759
if (__itt_stack_caller_create_ptr) {
7760
// inform ittnotify about leaving user's code
7761
if (team->t.t_stack_id != NULL) {
7762
__kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763
} else {
7764
KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765
__kmp_itt_stack_callee_leave(
7766
(__itt_caller)team->t.t_parent->t.t_stack_id);
7767
}
7768
}
7769
#endif /* USE_ITT_BUILD */
7770
__kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771
7772
return rc;
7773
}
7774
7775
void __kmp_teams_master(int gtid) {
7776
// This routine is called by all primary threads in teams construct
7777
kmp_info_t *thr = __kmp_threads[gtid];
7778
kmp_team_t *team = thr->th.th_team;
7779
ident_t *loc = team->t.t_ident;
7780
thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781
KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782
KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783
KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784
__kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785
7786
// This thread is a new CG root. Set up the proper variables.
7787
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788
tmp->cg_root = thr; // Make thr the CG root
7789
// Init to thread limit stored when league primary threads were forked
7790
tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791
tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792
KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793
" cg_nthreads to 1\n",
7794
thr, tmp));
7795
tmp->up = thr->th.th_cg_roots;
7796
thr->th.th_cg_roots = tmp;
7797
7798
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7800
#if INCLUDE_SSC_MARKS
7801
SSC_MARK_FORKING();
7802
#endif
7803
__kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804
(microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805
VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806
#if INCLUDE_SSC_MARKS
7807
SSC_MARK_JOINING();
7808
#endif
7809
// If the team size was reduced from the limit, set it to the new size
7810
if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811
thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812
// AC: last parameter "1" eliminates join barrier which won't work because
7813
// worker threads are in a fork barrier waiting for more parallel regions
7814
__kmp_join_call(loc, gtid
7815
#if OMPT_SUPPORT
7816
,
7817
fork_context_intel
7818
#endif
7819
,
7820
1);
7821
}
7822
7823
int __kmp_invoke_teams_master(int gtid) {
7824
kmp_info_t *this_thr = __kmp_threads[gtid];
7825
kmp_team_t *team = this_thr->th.th_team;
7826
#if KMP_DEBUG
7827
if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828
KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829
(void *)__kmp_teams_master);
7830
#endif
7831
__kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832
#if OMPT_SUPPORT
7833
int tid = __kmp_tid_from_gtid(gtid);
7834
ompt_data_t *task_data =
7835
&team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836
ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837
if (ompt_enabled.ompt_callback_implicit_task) {
7838
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839
ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840
ompt_task_initial);
7841
OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842
}
7843
#endif
7844
__kmp_teams_master(gtid);
7845
#if OMPT_SUPPORT
7846
this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847
#endif
7848
__kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849
return 1;
7850
}
7851
7852
/* this sets the requested number of threads for the next parallel region
7853
encountered by this team. since this should be enclosed in the forkjoin
7854
critical section it should avoid race conditions with asymmetrical nested
7855
parallelism */
7856
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857
kmp_info_t *thr = __kmp_threads[gtid];
7858
7859
if (num_threads > 0)
7860
thr->th.th_set_nproc = num_threads;
7861
}
7862
7863
void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864
int *num_threads_list) {
7865
kmp_info_t *thr = __kmp_threads[gtid];
7866
7867
KMP_DEBUG_ASSERT(list_length > 1);
7868
7869
if (num_threads_list[0] > 0)
7870
thr->th.th_set_nproc = num_threads_list[0];
7871
thr->th.th_set_nested_nth =
7872
(int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873
for (kmp_uint32 i = 0; i < list_length; ++i)
7874
thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875
thr->th.th_set_nested_nth_sz = list_length;
7876
}
7877
7878
void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879
const char *msg) {
7880
kmp_info_t *thr = __kmp_threads[gtid];
7881
thr->th.th_nt_strict = true;
7882
thr->th.th_nt_loc = loc;
7883
// if sev is unset make fatal
7884
if (sev == severity_warning)
7885
thr->th.th_nt_sev = sev;
7886
else
7887
thr->th.th_nt_sev = severity_fatal;
7888
// if msg is unset, use an appropriate message
7889
if (msg)
7890
thr->th.th_nt_msg = msg;
7891
else
7892
thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893
"strict num_threads clause.";
7894
}
7895
7896
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897
int num_threads) {
7898
KMP_DEBUG_ASSERT(thr);
7899
// Remember the number of threads for inner parallel regions
7900
if (!TCR_4(__kmp_init_middle))
7901
__kmp_middle_initialize(); // get internal globals calculated
7902
__kmp_assign_root_init_mask();
7903
KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904
KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905
7906
if (num_threads == 0) {
7907
if (__kmp_teams_thread_limit > 0) {
7908
num_threads = __kmp_teams_thread_limit;
7909
} else {
7910
num_threads = __kmp_avail_proc / num_teams;
7911
}
7912
// adjust num_threads w/o warning as it is not a user setting
7913
// num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914
// no thread_limit clause specified - do not change thread-limit-var ICV
7915
if (num_threads > __kmp_dflt_team_nth) {
7916
num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917
}
7918
if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919
num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920
} // prevent team size from exceeding thread-limit-var
7921
if (num_teams * num_threads > __kmp_teams_max_nth) {
7922
num_threads = __kmp_teams_max_nth / num_teams;
7923
}
7924
if (num_threads == 0) {
7925
num_threads = 1;
7926
}
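// Illustrative example (hypothetical values, not from the original source):
// with __kmp_avail_proc = 64, num_teams = 4, no KMP_TEAMS_THREAD_LIMIT,
// __kmp_dflt_team_nth = 12, thread-limit-var = 8, and __kmp_teams_max_nth
// large enough, the steps above give 64 / 4 = 16 -> 12 -> 8 threads per team.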
7927
} else {
7928
if (num_threads < 0) {
7929
__kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930
__kmp_msg_null);
7931
num_threads = 1;
7932
}
7933
// This thread will be the primary thread of the league primary threads
7934
// Store new thread limit; old limit is saved in th_cg_roots list
7935
thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936
// num_threads = min(num_threads, nthreads-var)
7937
if (num_threads > __kmp_dflt_team_nth) {
7938
num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939
}
7940
if (num_teams * num_threads > __kmp_teams_max_nth) {
7941
int new_threads = __kmp_teams_max_nth / num_teams;
7942
if (new_threads == 0) {
7943
new_threads = 1;
7944
}
7945
if (new_threads != num_threads) {
7946
if (!__kmp_reserve_warn) { // user asked for too many threads
7947
__kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948
__kmp_msg(kmp_ms_warning,
7949
KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951
}
7952
}
7953
num_threads = new_threads;
7954
}
7955
}
7956
thr->th.th_teams_size.nth = num_threads;
7957
}
7958
7959
/* this sets the requested number of teams for the teams region and/or
7960
the number of threads for the next parallel region encountered */
7961
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962
int num_threads) {
7963
kmp_info_t *thr = __kmp_threads[gtid];
7964
if (num_teams < 0) {
7965
// OpenMP specification requires requested values to be positive,
7966
// but people can send us any value, so we'd better check
7967
__kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968
__kmp_msg_null);
7969
num_teams = 1;
7970
}
7971
if (num_teams == 0) {
7972
if (__kmp_nteams > 0) {
7973
num_teams = __kmp_nteams;
7974
} else {
7975
num_teams = 1; // default number of teams is 1.
7976
}
7977
}
7978
if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7979
if (!__kmp_reserve_warn) {
7980
__kmp_reserve_warn = 1;
7981
__kmp_msg(kmp_ms_warning,
7982
KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984
}
7985
num_teams = __kmp_teams_max_nth;
7986
}
7987
// Set number of teams (number of threads in the outer "parallel" of the
7988
// teams)
7989
thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990
7991
__kmp_push_thread_limit(thr, num_teams, num_threads);
7992
}
7993
7994
/* This sets the requested number of teams for the teams region and/or
7995
the number of threads for the next parallel region encountered */
7996
void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997
int num_teams_ub, int num_threads) {
7998
kmp_info_t *thr = __kmp_threads[gtid];
7999
KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000
KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001
KMP_DEBUG_ASSERT(num_threads >= 0);
8002
8003
if (num_teams_lb > num_teams_ub) {
8004
__kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005
KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006
}
8007
8008
int num_teams = 1; // default number of teams is 1.
8009
8010
if (num_teams_lb == 0 && num_teams_ub > 0)
8011
num_teams_lb = num_teams_ub;
8012
8013
if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014
num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015
if (num_teams > __kmp_teams_max_nth) {
8016
if (!__kmp_reserve_warn) {
8017
__kmp_reserve_warn = 1;
8018
__kmp_msg(kmp_ms_warning,
8019
KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021
}
8022
num_teams = __kmp_teams_max_nth;
8023
}
8024
} else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025
num_teams = num_teams_ub;
8026
} else { // num_teams_lb <= num_teams <= num_teams_ub
8027
if (num_threads <= 0) {
8028
if (num_teams_ub > __kmp_teams_max_nth) {
8029
num_teams = num_teams_lb;
8030
} else {
8031
num_teams = num_teams_ub;
8032
}
8033
} else {
8034
num_teams = (num_threads > __kmp_teams_max_nth)
8035
? num_teams
8036
: __kmp_teams_max_nth / num_threads;
8037
if (num_teams < num_teams_lb) {
8038
num_teams = num_teams_lb;
8039
} else if (num_teams > num_teams_ub) {
8040
num_teams = num_teams_ub;
8041
}
8042
}
8043
}
8044
// Set number of teams (number of threads in the outer "parallel" of the
8045
// teams)
8046
thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047
8048
__kmp_push_thread_limit(thr, num_teams, num_threads);
8049
}
8050
8051
// Set the proc_bind var to use in the following parallel region.
8052
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053
kmp_info_t *thr = __kmp_threads[gtid];
8054
thr->th.th_set_proc_bind = proc_bind;
8055
}
8056
8057
/* Launch the worker threads into the microtask. */
8058
8059
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060
kmp_info_t *this_thr = __kmp_threads[gtid];
8061
8062
#ifdef KMP_DEBUG
8063
int f;
8064
#endif /* KMP_DEBUG */
8065
8066
KMP_DEBUG_ASSERT(team);
8067
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068
KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069
KMP_MB(); /* Flush all pending memory write invalidates. */
8070
8071
team->t.t_construct = 0; /* no single directives seen yet */
8072
team->t.t_ordered.dt.t_value =
8073
0; /* thread 0 enters the ordered section first */
8074
8075
/* Reset the identifiers on the dispatch buffer */
8076
KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077
if (team->t.t_max_nproc > 1) {
8078
int i;
8079
for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080
team->t.t_disp_buffer[i].buffer_index = i;
8081
team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082
}
8083
} else {
8084
team->t.t_disp_buffer[0].buffer_index = 0;
8085
team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086
}
8087
8088
KMP_MB(); /* Flush all pending memory write invalidates. */
8089
KMP_ASSERT(this_thr->th.th_team == team);
8090
8091
#ifdef KMP_DEBUG
8092
for (f = 0; f < team->t.t_nproc; f++) {
8093
KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094
team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095
}
8096
#endif /* KMP_DEBUG */
8097
8098
/* release the worker threads so they may begin working */
8099
__kmp_fork_barrier(gtid, 0);
8100
}
8101
8102
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103
kmp_info_t *this_thr = __kmp_threads[gtid];
8104
8105
KMP_DEBUG_ASSERT(team);
8106
KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107
KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108
KMP_MB(); /* Flush all pending memory write invalidates. */
8109
8110
/* Join barrier after fork */
8111
8112
#ifdef KMP_DEBUG
8113
if (__kmp_threads[gtid] &&
8114
__kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115
__kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116
__kmp_threads[gtid]);
8117
__kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118
"team->t.t_nproc=%d\n",
8119
gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120
team->t.t_nproc);
8121
__kmp_print_structure();
8122
}
8123
KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124
__kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125
#endif /* KMP_DEBUG */
8126
8127
__kmp_join_barrier(gtid); /* wait for everyone */
8128
#if OMPT_SUPPORT
8129
ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130
if (ompt_enabled.enabled &&
8131
(ompt_state == ompt_state_wait_barrier_teams ||
8132
ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133
int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134
ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136
#if OMPT_OPTIONAL
8137
void *codeptr = NULL;
8138
if (KMP_MASTER_TID(ds_tid) &&
8139
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141
codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142
8143
ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144
if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145
sync_kind = ompt_sync_region_barrier_teams;
8146
if (ompt_enabled.ompt_callback_sync_region_wait) {
8147
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149
}
8150
if (ompt_enabled.ompt_callback_sync_region) {
8151
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152
sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153
}
8154
#endif
8155
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157
ompt_scope_end, NULL, task_data, 0, ds_tid,
8158
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159
}
8160
}
8161
#endif
8162
8163
KMP_MB(); /* Flush all pending memory write invalidates. */
8164
KMP_ASSERT(this_thr->th.th_team == team);
8165
}
8166
8167
/* ------------------------------------------------------------------------ */
8168
8169
#ifdef USE_LOAD_BALANCE
8170
8171
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
8173
static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174
int i;
8175
int retval;
8176
kmp_team_t *hot_team;
8177
8178
if (root->r.r_active) {
8179
return 0;
8180
}
8181
hot_team = root->r.r_hot_team;
8182
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183
return hot_team->t.t_nproc - 1; // Don't count primary thread
8184
}
8185
8186
// Skip the primary thread - it is accounted for elsewhere.
8187
retval = 0;
8188
for (i = 1; i < hot_team->t.t_nproc; i++) {
8189
if (hot_team->t.t_threads[i]->th.th_active) {
8190
retval++;
8191
}
8192
}
8193
return retval;
8194
}
8195
8196
// Perform an automatic adjustment to the number of
8197
// threads used by the next parallel region.
8198
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199
int retval;
8200
int pool_active;
8201
int hot_team_active;
8202
int team_curr_active;
8203
int system_active;
8204
8205
KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206
set_nproc));
8207
KMP_DEBUG_ASSERT(root);
8208
KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209
->th.th_current_task->td_icvs.dynamic == TRUE);
8210
KMP_DEBUG_ASSERT(set_nproc > 1);
8211
8212
if (set_nproc == 1) {
8213
KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214
return 1;
8215
}
8216
8217
// Threads that are active in the thread pool, active in the hot team for this
8218
// particular root (if we are at the outer par level), and the currently
8219
// executing thread (to become the primary thread) are available to add to the
8220
// new team, but are currently contributing to the system load, and must be
8221
// accounted for.
8222
pool_active = __kmp_thread_pool_active_nth;
8223
hot_team_active = __kmp_active_hot_team_nproc(root);
8224
team_curr_active = pool_active + hot_team_active + 1;
8225
8226
// Check the system load.
8227
system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228
KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229
"hot team active = %d\n",
8230
system_active, pool_active, hot_team_active));
8231
8232
if (system_active < 0) {
8233
// There was an error reading the necessary info from /proc, so use the
8234
// thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235
// = dynamic_thread_limit, we shouldn't wind up getting back here.
8236
__kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237
KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238
8239
// Make this call behave like the thread limit algorithm.
8240
retval = __kmp_avail_proc - __kmp_nth +
8241
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242
if (retval > set_nproc) {
8243
retval = set_nproc;
8244
}
8245
if (retval < KMP_MIN_NTH) {
8246
retval = KMP_MIN_NTH;
8247
}
8248
8249
KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250
retval));
8251
return retval;
8252
}
8253
8254
// There is a slight delay in the load balance algorithm in detecting new
// running procs. The real system load at this instant should be at least as
// large as the #active omp threads that are available to add to the team.
8257
if (system_active < team_curr_active) {
8258
system_active = team_curr_active;
8259
}
8260
retval = __kmp_avail_proc - system_active + team_curr_active;
8261
if (retval > set_nproc) {
8262
retval = set_nproc;
8263
}
8264
if (retval < KMP_MIN_NTH) {
8265
retval = KMP_MIN_NTH;
8266
}
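// Illustrative example (hypothetical numbers, not from the original source):
// with __kmp_avail_proc = 8, system_active = 6 and team_curr_active = 3,
// retval = 8 - 6 + 3 = 5; the clamps above then bound the result to
// [KMP_MIN_NTH, set_nproc].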
8267
8268
KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269
return retval;
8270
} // __kmp_load_balance_nproc()
8271
8272
#endif /* USE_LOAD_BALANCE */
8273
8274
/* ------------------------------------------------------------------------ */
8275
8276
/* NOTE: this is called with the __kmp_init_lock held */
8277
void __kmp_cleanup(void) {
8278
int f;
8279
8280
KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281
8282
if (TCR_4(__kmp_init_parallel)) {
8283
#if KMP_HANDLE_SIGNALS
8284
__kmp_remove_signals();
8285
#endif
8286
TCW_4(__kmp_init_parallel, FALSE);
8287
}
8288
8289
if (TCR_4(__kmp_init_middle)) {
8290
#if KMP_AFFINITY_SUPPORTED
8291
__kmp_affinity_uninitialize();
8292
#endif /* KMP_AFFINITY_SUPPORTED */
8293
__kmp_cleanup_hierarchy();
8294
TCW_4(__kmp_init_middle, FALSE);
8295
}
8296
8297
KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298
8299
if (__kmp_init_serial) {
8300
__kmp_runtime_destroy();
8301
__kmp_init_serial = FALSE;
8302
}
8303
8304
__kmp_cleanup_threadprivate_caches();
8305
8306
for (f = 0; f < __kmp_threads_capacity; f++) {
8307
if (__kmp_root[f] != NULL) {
8308
__kmp_free(__kmp_root[f]);
8309
__kmp_root[f] = NULL;
8310
}
8311
}
8312
__kmp_free(__kmp_threads);
8313
// __kmp_threads and __kmp_root were allocated at once, as a single block, so
// there is no need to free __kmp_root separately.
8315
__kmp_threads = NULL;
8316
__kmp_root = NULL;
8317
__kmp_threads_capacity = 0;
8318
8319
// Free old __kmp_threads arrays if they exist.
8320
kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321
while (ptr) {
8322
kmp_old_threads_list_t *next = ptr->next;
8323
__kmp_free(ptr->threads);
8324
__kmp_free(ptr);
8325
ptr = next;
8326
}
8327
8328
#if KMP_USE_DYNAMIC_LOCK
8329
__kmp_cleanup_indirect_user_locks();
8330
#else
8331
__kmp_cleanup_user_locks();
8332
#endif
8333
#if OMPD_SUPPORT
8334
if (ompd_state) {
8335
__kmp_free(ompd_env_block);
8336
ompd_env_block = NULL;
8337
ompd_env_block_size = 0;
8338
}
8339
#endif
8340
8341
#if KMP_AFFINITY_SUPPORTED
8342
KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343
__kmp_cpuinfo_file = NULL;
8344
#endif /* KMP_AFFINITY_SUPPORTED */
8345
8346
#if KMP_USE_ADAPTIVE_LOCKS
8347
#if KMP_DEBUG_ADAPTIVE_LOCKS
8348
__kmp_print_speculative_stats();
8349
#endif
8350
#endif
8351
KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352
__kmp_nested_nth.nth = NULL;
8353
__kmp_nested_nth.size = 0;
8354
__kmp_nested_nth.used = 0;
8355
8356
KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357
__kmp_nested_proc_bind.bind_types = NULL;
8358
__kmp_nested_proc_bind.size = 0;
8359
__kmp_nested_proc_bind.used = 0;
8360
if (__kmp_affinity_format) {
8361
KMP_INTERNAL_FREE(__kmp_affinity_format);
8362
__kmp_affinity_format = NULL;
8363
}
8364
8365
__kmp_i18n_catclose();
8366
8367
#if KMP_USE_HIER_SCHED
8368
__kmp_hier_scheds.deallocate();
8369
#endif
8370
8371
#if KMP_STATS_ENABLED
8372
__kmp_stats_fini();
8373
#endif
8374
8375
KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376
}
8377
8378
/* ------------------------------------------------------------------------ */
8379
8380
int __kmp_ignore_mppbeg(void) {
8381
char *env;
8382
8383
if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384
if (__kmp_str_match_false(env))
8385
return FALSE;
8386
}
8387
// By default __kmpc_begin() is no-op.
8388
return TRUE;
8389
}
8390
8391
int __kmp_ignore_mppend(void) {
8392
char *env;
8393
8394
if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395
if (__kmp_str_match_false(env))
8396
return FALSE;
8397
}
8398
// By default __kmpc_end() is no-op.
8399
return TRUE;
8400
}
8401
8402
void __kmp_internal_begin(void) {
8403
int gtid;
8404
kmp_root_t *root;
8405
8406
/* this is a very important step as it will register new sibling threads
8407
and assign these new uber threads a new gtid */
8408
gtid = __kmp_entry_gtid();
8409
root = __kmp_threads[gtid]->th.th_root;
8410
KMP_ASSERT(KMP_UBER_GTID(gtid));
8411
8412
if (root->r.r_begin)
8413
return;
8414
__kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415
if (root->r.r_begin) {
8416
__kmp_release_lock(&root->r.r_begin_lock, gtid);
8417
return;
8418
}
8419
8420
root->r.r_begin = TRUE;
8421
8422
__kmp_release_lock(&root->r.r_begin_lock, gtid);
8423
}
8424
8425
/* ------------------------------------------------------------------------ */
8426
8427
void __kmp_user_set_library(enum library_type arg) {
8428
int gtid;
8429
kmp_root_t *root;
8430
kmp_info_t *thread;
8431
8432
/* first, make sure we are initialized so we can get our gtid */
8433
8434
gtid = __kmp_entry_gtid();
8435
thread = __kmp_threads[gtid];
8436
8437
root = thread->th.th_root;
8438
8439
KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440
library_serial));
8441
if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442
thread */
8443
KMP_WARNING(SetLibraryIncorrectCall);
8444
return;
8445
}
8446
8447
switch (arg) {
8448
case library_serial:
8449
thread->th.th_set_nproc = 0;
8450
set__nproc(thread, 1);
8451
break;
8452
case library_turnaround:
8453
thread->th.th_set_nproc = 0;
8454
set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455
: __kmp_dflt_team_nth_ub);
8456
break;
8457
case library_throughput:
8458
thread->th.th_set_nproc = 0;
8459
set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460
: __kmp_dflt_team_nth_ub);
8461
break;
8462
default:
8463
KMP_FATAL(UnknownLibraryType, arg);
8464
}
8465
8466
__kmp_aux_set_library(arg);
8467
}
8468
8469
void __kmp_aux_set_stacksize(size_t arg) {
8470
if (!__kmp_init_serial)
8471
__kmp_serial_initialize();
8472
8473
#if KMP_OS_DARWIN
8474
if (arg & (0x1000 - 1)) {
8475
arg &= ~(0x1000 - 1);
8476
if (arg + 0x1000) /* check for overflow if we round up */
8477
arg += 0x1000;
8478
}
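// For example (illustrative, not from the original source): on Darwin a
// request of 0x1801 bytes is rounded up to the next 0x1000-byte page
// boundary, i.e. 0x2000 bytes.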
8479
#endif
8480
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481
8482
/* only change the default stacksize before the first parallel region */
8483
if (!TCR_4(__kmp_init_parallel)) {
8484
size_t value = arg; /* argument is in bytes */
8485
8486
if (value < __kmp_sys_min_stksize)
8487
value = __kmp_sys_min_stksize;
8488
else if (value > KMP_MAX_STKSIZE)
8489
value = KMP_MAX_STKSIZE;
8490
8491
__kmp_stksize = value;
8492
8493
__kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494
}
8495
8496
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497
}
8498
8499
/* set the behaviour of the runtime library */
8500
/* TODO this can cause some odd behaviour with sibling parallelism... */
8501
void __kmp_aux_set_library(enum library_type arg) {
8502
__kmp_library = arg;
8503
8504
switch (__kmp_library) {
8505
case library_serial: {
8506
KMP_INFORM(LibraryIsSerial);
8507
} break;
8508
case library_turnaround:
8509
if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510
__kmp_use_yield = 2; // only yield when oversubscribed
8511
break;
8512
case library_throughput:
8513
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514
__kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515
break;
8516
default:
8517
KMP_FATAL(UnknownLibraryType, arg);
8518
}
8519
}
8520
8521
/* Getting team information common for all team API */
8522
// Returns NULL if not in teams construct
8523
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524
kmp_info_t *thr = __kmp_entry_thread();
8525
teams_serialized = 0;
8526
if (thr->th.th_teams_microtask) {
8527
kmp_team_t *team = thr->th.th_team;
8528
int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529
int ii = team->t.t_level;
8530
teams_serialized = team->t.t_serialized;
8531
int level = tlevel + 1;
8532
KMP_DEBUG_ASSERT(ii >= tlevel);
8533
while (ii > level) {
8534
for (teams_serialized = team->t.t_serialized;
8535
(teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536
}
8537
if (team->t.t_serialized && (!teams_serialized)) {
8538
team = team->t.t_parent;
8539
continue;
8540
}
8541
if (ii > level) {
8542
team = team->t.t_parent;
8543
ii--;
8544
}
8545
}
8546
return team;
8547
}
8548
return NULL;
8549
}
8550
8551
int __kmp_aux_get_team_num() {
8552
int serialized;
8553
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554
if (team) {
8555
if (serialized > 1) {
8556
return 0; // teams region is serialized ( 1 team of 1 thread ).
8557
} else {
8558
return team->t.t_master_tid;
8559
}
8560
}
8561
return 0;
8562
}
8563
8564
int __kmp_aux_get_num_teams() {
8565
int serialized;
8566
kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567
if (team) {
8568
if (serialized > 1) {
8569
return 1;
8570
} else {
8571
return team->t.t_parent->t.t_nproc;
8572
}
8573
}
8574
return 1;
8575
}
8576
8577
/* ------------------------------------------------------------------------ */
8578
8579
/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types:
 * L {thread_level} - omp_get_level()
 * n {thread_num} - omp_get_thread_num()
 * h {host} - name of host machine
 * P {process_id} - process id (integer)
 * T {thread_identifier} - native thread identifier (integer)
 * N {num_threads} - omp_get_num_threads()
 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
 * a {thread_affinity} - comma separated list of integers or integer ranges
 * (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
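// Illustrative example (hypothetical values, not from the original source):
// with the format "host=%H pid=%P thread=%0.4n of %N", a thread might print
// something like "host=node01 pid=1234 thread=0002 of 8".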
8606
8607
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the full set of valid keyword
// field types.
8610
typedef struct kmp_affinity_format_field_t {
8611
char short_name; // from spec e.g., L -> thread level
8612
const char *long_name; // from spec thread_level -> thread level
8613
char field_format; // data type for snprintf (typically 'd' or 's'
8614
// for integer or string)
8615
} kmp_affinity_format_field_t;
8616
8617
static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8618
#if KMP_AFFINITY_SUPPORTED
8619
{'A', "thread_affinity", 's'},
8620
#endif
8621
{'t', "team_num", 'd'},
8622
{'T', "num_teams", 'd'},
8623
{'L', "nesting_level", 'd'},
8624
{'n', "thread_num", 'd'},
8625
{'N', "num_threads", 'd'},
8626
{'a', "ancestor_tnum", 'd'},
8627
{'H', "host", 's'},
8628
{'P', "process_id", 'd'},
8629
{'i', "native_thread_id", 'd'}};
8630
8631
// Return the number of characters it takes to hold the field
8632
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633
const char **ptr,
8634
kmp_str_buf_t *field_buffer) {
8635
int rc, format_index, field_value;
8636
const char *width_left, *width_right;
8637
bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638
static const int FORMAT_SIZE = 20;
8639
char format[FORMAT_SIZE] = {0};
8640
char absolute_short_name = 0;
8641
8642
KMP_DEBUG_ASSERT(gtid >= 0);
8643
KMP_DEBUG_ASSERT(th);
8644
KMP_DEBUG_ASSERT(**ptr == '%');
8645
KMP_DEBUG_ASSERT(field_buffer);
8646
8647
__kmp_str_buf_clear(field_buffer);
8648
8649
// Skip the initial %
8650
(*ptr)++;
8651
8652
// Check for %% first
8653
if (**ptr == '%') {
8654
__kmp_str_buf_cat(field_buffer, "%", 1);
8655
(*ptr)++; // skip over the second %
8656
return 1;
8657
}
8658
8659
// Parse field modifiers if they are present
8660
pad_zeros = false;
8661
if (**ptr == '0') {
8662
pad_zeros = true;
8663
(*ptr)++; // skip over 0
8664
}
8665
right_justify = false;
8666
if (**ptr == '.') {
8667
right_justify = true;
8668
(*ptr)++; // skip over .
8669
}
8670
// Parse width of field: [width_left, width_right)
8671
width_left = width_right = NULL;
8672
if (**ptr >= '0' && **ptr <= '9') {
8673
width_left = *ptr;
8674
SKIP_DIGITS(*ptr);
8675
width_right = *ptr;
8676
}
8677
8678
// Create the format for KMP_SNPRINTF based on flags parsed above
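// For example (illustrative, not from the original source), an input field of
// "%.8P" (right justified, width 8, integer type) yields the snprintf format
// "%8d", while "%0.8P" (zero padded as well) yields "%08d".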
8679
format_index = 0;
8680
format[format_index++] = '%';
8681
if (!right_justify)
8682
format[format_index++] = '-';
8683
if (pad_zeros)
8684
format[format_index++] = '0';
8685
if (width_left && width_right) {
8686
int i = 0;
8687
// Only allow 8 digit number widths.
8688
// This also prevents overflowing format variable
8689
while (i < 8 && width_left < width_right) {
8690
format[format_index++] = *width_left;
8691
width_left++;
8692
i++;
8693
}
8694
}
8695
8696
// Parse a name (long or short)
8697
// Canonicalize the name into absolute_short_name
8698
found_valid_name = false;
8699
parse_long_name = (**ptr == '{');
8700
if (parse_long_name)
8701
(*ptr)++; // skip initial left brace
8702
for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703
sizeof(__kmp_affinity_format_table[0]);
8704
++i) {
8705
char short_name = __kmp_affinity_format_table[i].short_name;
8706
const char *long_name = __kmp_affinity_format_table[i].long_name;
8707
char field_format = __kmp_affinity_format_table[i].field_format;
8708
if (parse_long_name) {
8709
size_t length = KMP_STRLEN(long_name);
8710
if (strncmp(*ptr, long_name, length) == 0) {
8711
found_valid_name = true;
8712
(*ptr) += length; // skip the long name
8713
}
8714
} else if (**ptr == short_name) {
8715
found_valid_name = true;
8716
(*ptr)++; // skip the short name
8717
}
8718
if (found_valid_name) {
8719
format[format_index++] = field_format;
8720
format[format_index++] = '\0';
8721
absolute_short_name = short_name;
8722
break;
8723
}
8724
}
8725
if (parse_long_name) {
8726
if (**ptr != '}') {
8727
absolute_short_name = 0;
8728
} else {
8729
(*ptr)++; // skip over the right brace
8730
}
8731
}
8732
8733
// Attempt to fill the buffer with the requested
8734
// value using snprintf within __kmp_str_buf_print()
8735
switch (absolute_short_name) {
8736
case 't':
8737
rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738
break;
8739
case 'T':
8740
rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741
break;
8742
case 'L':
8743
rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744
break;
8745
case 'n':
8746
rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747
break;
8748
case 'H': {
8749
static const int BUFFER_SIZE = 256;
8750
char buf[BUFFER_SIZE];
8751
__kmp_expand_host_name(buf, BUFFER_SIZE);
8752
rc = __kmp_str_buf_print(field_buffer, format, buf);
8753
} break;
8754
case 'P':
8755
rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756
break;
8757
case 'i':
8758
rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759
break;
8760
case 'N':
8761
rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762
break;
8763
case 'a':
8764
field_value =
8765
__kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766
rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767
break;
8768
#if KMP_AFFINITY_SUPPORTED
8769
case 'A': {
8770
kmp_str_buf_t buf;
8771
__kmp_str_buf_init(&buf);
8772
__kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773
rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774
__kmp_str_buf_free(&buf);
8775
} break;
8776
#endif
8777
default:
8778
// According to the spec, if an implementation does not have info for a field
// type, then "undefined" is printed.
8780
rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781
// Skip the field
8782
if (parse_long_name) {
8783
SKIP_TOKEN(*ptr);
8784
if (**ptr == '}')
8785
(*ptr)++;
8786
} else {
8787
(*ptr)++;
8788
}
8789
}
8790
8791
KMP_ASSERT(format_index <= FORMAT_SIZE);
8792
return rc;
8793
}
8794
8795
/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards.
 */
8801
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802
kmp_str_buf_t *buffer) {
8803
const char *parse_ptr;
8804
size_t retval;
8805
const kmp_info_t *th;
8806
kmp_str_buf_t field;
8807
8808
KMP_DEBUG_ASSERT(buffer);
8809
KMP_DEBUG_ASSERT(gtid >= 0);
8810
8811
__kmp_str_buf_init(&field);
8812
__kmp_str_buf_clear(buffer);
8813
8814
th = __kmp_threads[gtid];
8815
retval = 0;
8816
8817
// If format is NULL or zero-length string, then we use
8818
// affinity-format-var ICV
8819
parse_ptr = format;
8820
if (parse_ptr == NULL || *parse_ptr == '\0') {
8821
parse_ptr = __kmp_affinity_format;
8822
}
8823
KMP_DEBUG_ASSERT(parse_ptr);
8824
8825
while (*parse_ptr != '\0') {
8826
// Parse a field
8827
if (*parse_ptr == '%') {
8828
// Put field in the buffer
8829
int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830
__kmp_str_buf_catbuf(buffer, &field);
8831
retval += rc;
8832
} else {
8833
// Put literal character in buffer
8834
__kmp_str_buf_cat(buffer, parse_ptr, 1);
8835
retval++;
8836
parse_ptr++;
8837
}
8838
}
8839
__kmp_str_buf_free(&field);
8840
return retval;
8841
}
8842
8843
// Displays the affinity string to stdout
8844
void __kmp_aux_display_affinity(int gtid, const char *format) {
8845
kmp_str_buf_t buf;
8846
__kmp_str_buf_init(&buf);
8847
__kmp_aux_capture_affinity(gtid, format, &buf);
8848
__kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849
__kmp_str_buf_free(&buf);
8850
}
8851
8852
/* ------------------------------------------------------------------------ */
8853
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854
int blocktime = arg; /* argument is in microseconds */
8855
#if KMP_USE_MONITOR
8856
int bt_intervals;
8857
#endif
8858
kmp_int8 bt_set;
8859
8860
__kmp_save_internal_controls(thread);
8861
8862
/* Normalize and set blocktime for the teams */
8863
if (blocktime < KMP_MIN_BLOCKTIME)
8864
blocktime = KMP_MIN_BLOCKTIME;
8865
else if (blocktime > KMP_MAX_BLOCKTIME)
8866
blocktime = KMP_MAX_BLOCKTIME;
8867
8868
set__blocktime_team(thread->th.th_team, tid, blocktime);
8869
set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870
8871
#if KMP_USE_MONITOR
8872
/* Calculate and set blocktime intervals for the teams */
8873
bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874
8875
set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876
set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877
#endif
8878
8879
/* Set whether blocktime has been set to "TRUE" */
8880
bt_set = TRUE;
8881
8882
set__bt_set_team(thread->th.th_team, tid, bt_set);
8883
set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884
#if KMP_USE_MONITOR
8885
KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886
"bt_intervals=%d, monitor_updates=%d\n",
8887
__kmp_gtid_from_tid(tid, thread->th.th_team),
8888
thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889
__kmp_monitor_wakeups));
8890
#else
8891
KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892
__kmp_gtid_from_tid(tid, thread->th.th_team),
8893
thread->th.th_team->t.t_id, tid, blocktime));
8894
#endif
8895
}
8896
8897
void __kmp_aux_set_defaults(char const *str, size_t len) {
8898
if (!__kmp_init_serial) {
8899
__kmp_serial_initialize();
8900
}
8901
__kmp_env_initialize(str);
8902
8903
if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904
__kmp_env_print();
8905
}
8906
} // __kmp_aux_set_defaults
8907
8908
/* ------------------------------------------------------------------------ */
8909
/* internal fast reduction routines */
8910
8911
PACKED_REDUCTION_METHOD_T
8912
__kmp_determine_reduction_method(
8913
ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914
void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915
kmp_critical_name *lck) {
8916
8917
// Default reduction method: critical construct ( lck != NULL, like in current
// PAROPT )
// If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
// can be selected by RTL
// If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
// can be selected by RTL
// Finally, it's up to the OpenMP RTL to make a decision on which method to
// select among those generated by PAROPT.
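// Illustrative example (hypothetical, not from the original source): on an
// x86_64 Linux build with a team of 4 threads, tree and atomic reductions both
// available, and teamsize_cutoff == 4, the logic below selects
// atomic_reduce_block; with a team of 16 threads it would select
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER.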
8925
8926
PACKED_REDUCTION_METHOD_T retval;
8927
8928
int team_size;
8929
8930
KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931
8932
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8933
(loc && \
8934
((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936
8937
retval = critical_reduce_block;
8938
8939
// another way of getting the team size (with 1 dynamic dereference) is slower
8940
team_size = __kmp_get_team_num_threads(global_tid);
8941
if (team_size == 1) {
8942
8943
retval = empty_reduce_block;
8944
8945
} else {
8946
8947
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948
8949
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8950
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8951
KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952
8953
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8954
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8955
KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956
8957
int teamsize_cutoff = 4;
8958
8959
#if KMP_MIC_SUPPORTED
8960
if (__kmp_mic_type != non_mic) {
8961
teamsize_cutoff = 8;
8962
}
8963
#endif
8964
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965
if (tree_available) {
8966
if (team_size <= teamsize_cutoff) {
8967
if (atomic_available) {
8968
retval = atomic_reduce_block;
8969
}
8970
} else {
8971
retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972
}
8973
} else if (atomic_available) {
8974
retval = atomic_reduce_block;
8975
}
8976
#else
8977
#error "Unknown or unsupported OS"
8978
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979
// KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980
// KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981
8982
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8983
KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984
8985
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8986
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8987
KMP_OS_WASI || KMP_OS_AIX
8988
8989
// basic tuning
8990
8991
if (atomic_available) {
8992
if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993
retval = atomic_reduce_block;
8994
}
8995
} // otherwise: use critical section
8996
8997
#elif KMP_OS_DARWIN
8998
8999
int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000
if (atomic_available && (num_vars <= 3)) {
9001
retval = atomic_reduce_block;
9002
} else if (tree_available) {
9003
if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004
(reduce_size < (2000 * sizeof(kmp_real64)))) {
9005
retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006
}
9007
} // otherwise: use critical section
9008
9009
#else
9010
#error "Unknown or unsupported OS"
9011
#endif
9012
9013
#else
9014
#error "Unknown or unsupported architecture"
9015
#endif
9016
}
9017
9018
// KMP_FORCE_REDUCTION
9019
9020
// If the team is serialized (team_size == 1), ignore the forced reduction
9021
// method and stay with the unsynchronized method (empty_reduce_block)
9022
if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023
team_size != 1) {
9024
9025
PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026
9027
int atomic_available, tree_available;
9028
9029
switch ((forced_retval = __kmp_force_reduction_method)) {
9030
case critical_reduce_block:
9031
KMP_ASSERT(lck); // lck should be != 0
9032
break;
9033
9034
case atomic_reduce_block:
9035
atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036
if (!atomic_available) {
9037
KMP_WARNING(RedMethodNotSupported, "atomic");
9038
forced_retval = critical_reduce_block;
9039
}
9040
break;
9041
9042
case tree_reduce_block:
9043
tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044
if (!tree_available) {
9045
KMP_WARNING(RedMethodNotSupported, "tree");
9046
forced_retval = critical_reduce_block;
9047
} else {
9048
#if KMP_FAST_REDUCTION_BARRIER
9049
forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050
#endif
9051
}
9052
break;
9053
9054
default:
9055
KMP_ASSERT(0); // "unsupported method specified"
9056
}
9057
9058
retval = forced_retval;
9059
}
9060
9061
KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062
9063
#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065
9066
return (retval);
9067
}
9068
// this function is for testing set/get/determine reduce method
9069
kmp_int32 __kmp_get_reduce_method(void) {
9070
return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071
}
9072
9073
// Soft pause sets up threads to ignore blocktime and just go to sleep.
9074
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076
9077
// Hard pause shuts down the runtime completely. Resume happens naturally when
9078
// OpenMP is used subsequently.
9079
void __kmp_hard_pause() {
9080
__kmp_pause_status = kmp_hard_paused;
9081
__kmp_internal_end_thread(-1);
9082
}
9083
9084
// Soft resume sets __kmp_pause_status, and wakes up all threads.
9085
void __kmp_resume_if_soft_paused() {
9086
if (__kmp_pause_status == kmp_soft_paused) {
9087
__kmp_pause_status = kmp_not_paused;
9088
9089
for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090
kmp_info_t *thread = __kmp_threads[gtid];
9091
if (thread) { // Wake it if sleeping
9092
kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093
thread);
9094
if (fl.is_sleeping())
9095
fl.resume(gtid);
9096
else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097
__kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098
} else { // thread holds the lock and may sleep soon
9099
do { // until either the thread sleeps, or we can get the lock
9100
if (fl.is_sleeping()) {
9101
fl.resume(gtid);
9102
break;
9103
} else if (__kmp_try_suspend_mx(thread)) {
9104
__kmp_unlock_suspend_mx(thread);
9105
break;
9106
}
9107
} while (1);
9108
}
9109
}
9110
}
9111
}
9112
}
9113
9114
// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115
// TODO: add warning messages
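// For example (derived from the logic below): requesting kmp_soft_paused while
// the runtime is not paused calls __kmp_soft_pause() and returns 0; a second
// identical request returns 1 because the runtime is already paused.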
9116
int __kmp_pause_resource(kmp_pause_status_t level) {
9117
if (level == kmp_not_paused) { // requesting resume
9118
if (__kmp_pause_status == kmp_not_paused) {
9119
// error message about runtime not being paused, so can't resume
9120
return 1;
9121
} else {
9122
KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123
__kmp_pause_status == kmp_hard_paused);
9124
__kmp_pause_status = kmp_not_paused;
9125
return 0;
9126
}
9127
} else if (level == kmp_soft_paused) { // requesting soft pause
9128
if (__kmp_pause_status != kmp_not_paused) {
9129
// error message about already being paused
9130
return 1;
9131
} else {
9132
__kmp_soft_pause();
9133
return 0;
9134
}
9135
} else if (level == kmp_hard_paused) { // requesting hard pause
9136
if (__kmp_pause_status != kmp_not_paused) {
9137
// error message about already being paused
9138
return 1;
9139
} else {
9140
__kmp_hard_pause();
9141
return 0;
9142
}
9143
} else {
9144
// error message about invalid level
9145
return 1;
9146
}
9147
}
9148
9149
void __kmp_omp_display_env(int verbose) {
9150
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151
if (__kmp_init_serial == 0)
9152
__kmp_do_serial_initialize();
9153
__kmp_display_env_impl(!verbose, verbose);
9154
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155
}
9156
9157
// The team size is changing, so distributed barrier must be modified
9158
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9159
int new_nthreads) {
9160
KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9161
bp_dist_bar);
9162
kmp_info_t **other_threads = team->t.t_threads;
9163
9164
// We want all the workers to stop waiting on the barrier while we adjust the
9165
// size of the team.
9166
for (int f = 1; f < old_nthreads; ++f) {
9167
KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9168
// Ignore threads that are already inactive or not present in the team
9169
if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9170
// teams construct causes thread_limit to get passed in, and some of
9171
// those could be inactive; just ignore them
9172
continue;
9173
}
9174
// If thread is transitioning still to in_use state, wait for it
9175
if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9176
while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9177
KMP_CPU_PAUSE();
9178
}
9179
// The thread should be in_use now
9180
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9181
// Transition to unused state
9182
team->t.t_threads[f]->th.th_used_in_team.store(2);
9183
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9184
}
9185
// Release all the workers
9186
team->t.b->go_release();
9187
9188
KMP_MFENCE();
9189
9190
// Workers should see transition status 2 and move to 0; but may need to be
9191
// woken up first
9192
int count = old_nthreads - 1;
9193
while (count > 0) {
9194
count = old_nthreads - 1;
9195
for (int f = 1; f < old_nthreads; ++f) {
9196
if (other_threads[f]->th.th_used_in_team.load() != 0) {
9197
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9198
kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9199
void *, other_threads[f]->th.th_sleep_loc);
9200
__kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9201
}
9202
} else {
9203
KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9204
count--;
9205
}
9206
}
9207
}
9208
// Now update the barrier size
9209
team->t.b->update_num_threads(new_nthreads);
9210
team->t.b->go_reset();
9211
}
9212
9213
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9214
// Add the threads back to the team
9215
KMP_DEBUG_ASSERT(team);
9216
// Threads were paused and pointed at th_used_in_team temporarily during a
9217
// resize of the team. We're going to set th_used_in_team to 3 to indicate to
9218
// the thread that it should transition itself back into the team. Then, if
9219
// blocktime isn't infinite, the thread could be sleeping, so we send a resume
9220
// to wake it up.
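// Summary of the th_used_in_team states (as used here and in
// __kmp_resize_dist_barrier): 0 = not in the team, 1 = in the team,
// 2 = being removed (worker moves itself to 0), 3 = being added (worker moves
// itself to 1).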
9221
for (int f = 1; f < new_nthreads; ++f) {
9222
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9223
KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9224
3);
9225
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9226
__kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9227
(kmp_flag_32<false, false> *)NULL);
9228
}
9229
}
9230
// The threads should be transitioning to the team; when they are done, they
// should have set th_used_in_team to 1. This loop forces the primary thread to
// wait until all threads have moved into the team and are waiting in the
// barrier.
9233
int count = new_nthreads - 1;
9234
while (count > 0) {
9235
count = new_nthreads - 1;
9236
for (int f = 1; f < new_nthreads; ++f) {
9237
if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9238
count--;
9239
}
9240
}
9241
}
9242
}
9243
9244
// Globals and functions for hidden helper task
9245
kmp_info_t **__kmp_hidden_helper_threads;
9246
kmp_info_t *__kmp_hidden_helper_main_thread;
9247
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9248
#if KMP_OS_LINUX
9249
kmp_int32 __kmp_hidden_helper_threads_num = 8;
9250
kmp_int32 __kmp_enable_hidden_helper = TRUE;
9251
#else
9252
kmp_int32 __kmp_hidden_helper_threads_num = 0;
9253
kmp_int32 __kmp_enable_hidden_helper = FALSE;
9254
#endif
9255
9256
namespace {
9257
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9258
9259
void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9260
// This is an explicit synchronization on all hidden helper threads, in case
// a regular thread pushes a hidden helper task to a hidden helper thread that
// has not yet been woken up since being released by the main thread after
// creating the team.
9264
KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9265
while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9266
__kmp_hidden_helper_threads_num)
9267
;
9268
9269
// If main thread, then wait for signal
9270
if (__kmpc_master(nullptr, *gtid)) {
9271
// First, unset the initial state and release the initial thread
9272
TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9273
__kmp_hidden_helper_initz_release();
9274
__kmp_hidden_helper_main_thread_wait();
9275
// Now wake up all worker threads
9276
for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9277
__kmp_hidden_helper_worker_thread_signal();
9278
}
9279
}
9280
}
9281
} // namespace
9282
9283
void __kmp_hidden_helper_threads_initz_routine() {
9284
// Create a new root for hidden helper team/threads
9285
const int gtid = __kmp_register_root(TRUE);
9286
__kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9287
__kmp_hidden_helper_threads = &__kmp_threads[gtid];
9288
__kmp_hidden_helper_main_thread->th.th_set_nproc =
9289
__kmp_hidden_helper_threads_num;
9290
9291
KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9292
9293
__kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9294
9295
// Set the initialization flag to FALSE
9296
TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9297
9298
__kmp_hidden_helper_threads_deinitz_release();
9299
}
9300
9301
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/
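// Illustrative example (hypothetical machine, not from the original source):
// on a 2-socket system with 8 cores per socket and 2 HW threads per core,
// KMP_NESTING_MODE=1 might produce nesting levels with 2, 8, and 2 threads
// respectively, subject to the duplicate/single-entity level skipping
// described above.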
9316
9317
// Allocate space to store nesting levels
9318
void __kmp_init_nesting_mode() {
9319
int levels = KMP_HW_LAST;
9320
__kmp_nesting_mode_nlevels = levels;
9321
__kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9322
for (int i = 0; i < levels; ++i)
9323
__kmp_nesting_nth_level[i] = 0;
9324
if (__kmp_nested_nth.size < levels) {
9325
__kmp_nested_nth.nth =
9326
(int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9327
__kmp_nested_nth.size = levels;
9328
}
9329
}
9330
9331
// Set # threads for top levels of nesting; must be called after topology set
9332
void __kmp_set_nesting_mode_threads() {
9333
kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9334
9335
if (__kmp_nesting_mode == 1)
9336
__kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9337
else if (__kmp_nesting_mode > 1)
9338
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9339
9340
if (__kmp_topology) { // use topology info
9341
int loc, hw_level;
9342
for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9343
loc < __kmp_nesting_mode_nlevels;
9344
loc++, hw_level++) {
9345
__kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9346
if (__kmp_nesting_nth_level[loc] == 1)
9347
loc--;
9348
}
9349
// Make sure all cores are used
9350
if (__kmp_nesting_mode > 1 && loc > 1) {
9351
int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9352
int num_cores = __kmp_topology->get_count(core_level);
9353
int upper_levels = 1;
9354
for (int level = 0; level < loc - 1; ++level)
9355
upper_levels *= __kmp_nesting_nth_level[level];
9356
if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9357
__kmp_nesting_nth_level[loc - 1] =
9358
num_cores / __kmp_nesting_nth_level[loc - 2];
9359
}
9360
__kmp_nesting_mode_nlevels = loc;
9361
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9362
} else { // no topology info available; provide a reasonable guesstimation
9363
if (__kmp_avail_proc >= 4) {
9364
__kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9365
__kmp_nesting_nth_level[1] = 2;
9366
__kmp_nesting_mode_nlevels = 2;
9367
} else {
9368
__kmp_nesting_nth_level[0] = __kmp_avail_proc;
9369
__kmp_nesting_mode_nlevels = 1;
9370
}
9371
__kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9372
}
9373
for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9374
__kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9375
}
9376
set__nproc(thread, __kmp_nesting_nth_level[0]);
9377
if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9378
__kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9379
if (get__max_active_levels(thread) > 1) {
9380
// if max levels was set, set nesting mode levels to same
9381
__kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9382
}
9383
if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9384
set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9385
}
9386
9387
// Empty symbols to export (see exports_so.txt) when feature is disabled
9388
extern "C" {
9389
#if !KMP_STATS_ENABLED
9390
void __kmp_reset_stats() {}
9391
#endif
9392
#if !USE_DEBUGGER
9393
int __kmp_omp_debug_struct_info = FALSE;
9394
int __kmp_debugging = FALSE;
9395
#endif
9396
#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9397
void __kmp_itt_fini_ittlib() {}
9398
void __kmp_itt_init_ittlib() {}
9399
#endif
9400
}
9401
9402
// end of file