Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_dispatch_hier.h
/*
 * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H
#include "kmp.h"
#include "kmp_dispatch.h"

// Layer type for scheduling hierarchy
enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1,
  LAYER_L2,
  LAYER_L3,
  LAYER_NUMA,
  LAYER_LOOP,
  LAYER_LAST
};

// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD:
    return "THREAD";
  case kmp_hier_layer_e::LAYER_L1:
    return "L1";
  case kmp_hier_layer_e::LAYER_L2:
    return "L2";
  case kmp_hier_layer_e::LAYER_L3:
    return "L3";
  case kmp_hier_layer_e::LAYER_NUMA:
    return "NUMA";
  case kmp_hier_layer_e::LAYER_LOOP:
    return "WHOLE_LOOP";
  case kmp_hier_layer_e::LAYER_LAST:
    return "LAST";
  }
  KMP_ASSERT(0);
  // Appease compilers, should never get here
  return "ERROR";
}

// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
typedef struct kmp_hier_sched_env_t {
  int size;
  int capacity;
  enum sched_type *scheds;
  kmp_int32 *small_chunks;
  kmp_int64 *large_chunks;
  kmp_hier_layer_e *layers;
  // Append a level of the hierarchy
  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
    if (capacity == 0) {
      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                                  kmp_hier_layer_e::LAYER_LAST);
      capacity = kmp_hier_layer_e::LAYER_LAST;
    }
    int current_size = size;
    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
    scheds[current_size] = sched;
    layers[current_size] = layer;
    small_chunks[current_size] = chunk;
    large_chunks[current_size] = (kmp_int64)chunk;
    size++;
  }
  // Sort the hierarchy using selection sort, size will always be small
  // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
  void sort() {
    if (size <= 1)
      return;
    for (int i = 0; i < size; ++i) {
      int switch_index = i;
      for (int j = i + 1; j < size; ++j) {
        if (layers[j] < layers[switch_index])
          switch_index = j;
      }
      if (switch_index != i) {
        kmp_hier_layer_e temp1 = layers[i];
        enum sched_type temp2 = scheds[i];
        kmp_int32 temp3 = small_chunks[i];
        kmp_int64 temp4 = large_chunks[i];
        layers[i] = layers[switch_index];
        scheds[i] = scheds[switch_index];
        small_chunks[i] = small_chunks[switch_index];
        large_chunks[i] = large_chunks[switch_index];
        layers[switch_index] = temp1;
        scheds[switch_index] = temp2;
        small_chunks[switch_index] = temp3;
        large_chunks[switch_index] = temp4;
      }
    }
  }
  // Free all memory
  void deallocate() {
    if (capacity > 0) {
      __kmp_free(scheds);
      __kmp_free(layers);
      __kmp_free(small_chunks);
      __kmp_free(large_chunks);
      scheds = NULL;
      layers = NULL;
      small_chunks = NULL;
      large_chunks = NULL;
    }
    size = 0;
    capacity = 0;
  }
} kmp_hier_sched_env_t;
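
// Illustrative use only (the schedule string below is a hypothetical example,
// not a documented format): while a hierarchical schedule such as
// "L1 dynamic,4 : NUMA static" is parsed, each layer is appended and the
// result is sorted from lowest to highest layer before it is consumed:
//   __kmp_hier_scheds.append(kmp_sch_dynamic_chunked, 4,
//                            kmp_hier_layer_e::LAYER_L1);
//   __kmp_hier_scheds.append(kmp_sch_static, 0, kmp_hier_layer_e::LAYER_NUMA);
//   __kmp_hier_scheds.sort();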

extern int __kmp_dispatch_hand_threading;
extern kmp_hier_sched_env_t __kmp_hier_scheds;

// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];

extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
                                        kmp_hier_layer_e t2);
extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);

template <typename T> struct kmp_hier_shared_bdata_t {
  typedef typename traits_t<T>::signed_t ST;
  volatile kmp_uint64 val[2];
  kmp_int32 status[2];
  T lb[2];
  T ub[2];
  ST st[2];
  dispatch_shared_info_template<T> sh[2];
  void zero() {
    val[0] = val[1] = 0;
    status[0] = status[1] = 0;
    lb[0] = lb[1] = 0;
    ub[0] = ub[1] = 0;
    st[0] = st[1] = 0;
    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
  }
  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
                            kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
  }
  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
    sh[1 - index].u.s.iteration = 0;
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return status[1 - index];
  }
  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return &(sh[1 - index]);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return &(sh[index]);
  }
};
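
// Note on the size-2 arrays above: they form a double buffer. Readers use the
// slot selected by their thread-private index (get_curr_*), while the unit's
// primary thread publishes the next iteration range into the opposite slot
// (set_next*/get_next_*, i.e. slot 1 - index). Every thread flips its private
// index when it passes the unit's barrier, so one slot can be refilled while
// the other is still being consumed.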

/*
 * In the barrier implementations, num_active is the number of threads that are
 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
 * structure. tdata is the thread private data that resides on the thread
 * data structure.
 *
 * The reset_shared() method is used to initialize the barrier data on the
 * kmp_hier_top_unit_t hierarchy structure
 *
 * The reset_private() method is used to initialize the barrier data on the
 * thread's private dispatch buffer structure
 *
 * The barrier() method takes an id, which is that thread's id for the
 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait
 * inside barrier() until all fellow threads who are attached to that
 * kmp_hier_top_unit_t structure have arrived.
 */

// Core barrier implementation
// Can be used in a unit with between 2 to 8 threads
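// Each of the (at most eight) participating threads owns one byte of a shared
// 64-bit word and stores 0x01 into it on arrival; the barrier completes once
// the word equals the wait value for the active thread count (e.g., three
// threads -> 0x010101). On alternate generations the threads store 0x00 and
// wait for the word to return to zero, so no separate reset step is needed.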
template <typename T> class core_barrier_impl {
  static inline kmp_uint64 get_wait_val(int num_active) {
    kmp_uint64 wait_val = 0LL;
    switch (num_active) {
    case 2:
      wait_val = 0x0101LL;
      break;
    case 3:
      wait_val = 0x010101LL;
      break;
    case 4:
      wait_val = 0x01010101LL;
      break;
    case 5:
      wait_val = 0x0101010101LL;
      break;
    case 6:
      wait_val = 0x010101010101LL;
      break;
    case 7:
      wait_val = 0x01010101010101LL;
      break;
    case 8:
      wait_val = 0x0101010101010101LL;
      break;
    default:
      // don't use the core_barrier_impl for more than 8 threads
      KMP_ASSERT(0);
    }
    return wait_val;
  }

public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
template <typename T>
void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                        kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}

// Counter barrier implementation
// Can be used in a unit with arbitrary number of active threads
template <typename T> class counter_barrier_impl {
public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
template <typename T>
void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                           kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  volatile kmp_int64 *val;
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;

  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
  KMP_TEST_THEN_INC64(val);
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}

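// In the counter variant each arriving thread atomically increments the shared
// counter and waits until it reaches (or passes) the expected total; rather
// than being reset, the expected value simply grows by num_active for the next
// use of the same buffer slot.
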
// Data associated with topology unit within a layer
// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
template <typename T> struct kmp_hier_top_unit_t {
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_int32 active; // number of topology units that communicate with this unit
  // chunk information (lower/upper bound, stride, etc.)
  dispatch_private_info_template<T> hier_pr;
  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit

  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
  void reset_shared_barrier() {
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    hier_barrier.zero();
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
    } else {
      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
    }
  }
  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_private(active, tdata);
    } else {
      counter_barrier_impl<T>::reset_private(active, tdata);
    }
  }
  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    KMP_DEBUG_ASSERT(id >= 0 && id < active);
    if (active == 1) {
      tdata->index = 1 - tdata->index;
      return;
    }
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    } else {
      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    }
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return hier_barrier.get_next_status(index);
  }
  T get_next_lb(kmp_uint64 index) const {
    return hier_barrier.get_next_lb(index);
  }
  T get_next_ub(kmp_uint64 index) const {
    return hier_barrier.get_next_ub(index);
  }
  ST get_next_st(kmp_uint64 index) const {
    return hier_barrier.get_next_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return hier_barrier.get_next_sh(index);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const {
    return hier_barrier.get_curr_status(index);
  }
  T get_curr_lb(kmp_uint64 index) const {
    return hier_barrier.get_curr_lb(index);
  }
  T get_curr_ub(kmp_uint64 index) const {
    return hier_barrier.get_curr_ub(index);
  }
  ST get_curr_st(kmp_uint64 index) const {
    return hier_barrier.get_curr_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return hier_barrier.get_curr_sh(index);
  }

  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
                            kmp_uint64 index) {
    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
  }
  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
    hier_barrier.set_next(lb, ub, st, status, index);
  }
  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
  dispatch_private_info_template<T> *get_parent_pr() {
    return &(hier_parent->hier_pr);
  }

  kmp_int32 is_active() const { return active; }
  kmp_int32 get_num_active() const { return active; }
#ifdef KMP_DEBUG
  void print() {
    KD_TRACE(
        10,
        ("  kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
         hier_pr.u.p.tc));
  }
#endif
};

// Information regarding a single layer within the scheduling hierarchy
template <typename T> struct kmp_hier_layer_info_t {
  int num_active; // number of threads active in this level
  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
  enum sched_type sched; // static, dynamic, guided, etc.
  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
  int length; // length of the kmp_hier_top_unit_t array

#ifdef KMP_DEBUG
  // Print this layer's information
  void print() {
    const char *t = __kmp_get_hier_str(type);
    KD_TRACE(
        10,
        ("  kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
         "length:%d\n",
         num_active, t, sched, chunk, length));
  }
#endif
};

/*
 * Structure to implement entire hierarchy
 *
 * The hierarchy is kept as an array of arrays to represent the different
 * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the
 * highest layer.
 * Example:
 * [ 2 ] -> [ L3 | L3 ]
 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
 * There is also an array of layer_info_t which has information regarding
 * each layer
 */
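// How chunks flow through the hierarchy (summary of next()/next_recurse()
// below): a thread first takes chunks from its lowest-layer unit. When that
// unit runs out, the unit's primary thread recurses into the parent layer for
// a fresh range, reinitializes the unit's private buffer with it, and all
// threads attached to the unit synchronize on the unit's barrier before
// reading the published range from the shared double buffer.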
template <typename T> struct kmp_hier_t {
public:
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

private:
  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
                   kmp_int32 previous_id, int hier_level) {
    int status;
    kmp_info_t *th = __kmp_threads[gtid];
    auto parent = current->get_parent();
    bool last_layer = (hier_level == get_num_layers() - 1);
    KMP_DEBUG_ASSERT(th);
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
    KMP_DEBUG_ASSERT(current);
    KMP_DEBUG_ASSERT(hier_level >= 0);
    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent || last_layer);

    KD_TRACE(
        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));

    T hier_id = (T)current->get_hier_id();
    // Attempt to grab next iteration range for this level
    if (previous_id == 0) {
      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
                   gtid, hier_level));
      kmp_int32 contains_last;
      T my_lb, my_ub;
      ST my_st;
      T nproc;
      dispatch_shared_info_template<T> volatile *my_sh;
      dispatch_private_info_template<T> *my_pr;
      if (last_layer) {
        // last layer below the very top uses the single shared buffer
        // from the team struct.
        KD_TRACE(10,
                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
        nproc = (T)get_top_level_nproc();
      } else {
        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
        // structure
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                      gtid, hier_level));
        my_sh =
            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
        nproc = (T)parent->get_num_active();
      }
      my_pr = current->get_my_pr();
      KMP_DEBUG_ASSERT(my_sh);
      KMP_DEBUG_ASSERT(my_pr);
      enum sched_type schedule = get_sched(hier_level);
      ST chunk = (ST)get_chunk(hier_level);
      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                                &contains_last, &my_lb, &my_ub,
                                                &my_st, nproc, hier_id);
      KD_TRACE(
          10,
          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
           gtid, hier_level, status));
      // When no iterations are found (status == 0) and this is not the last
      // layer, attempt to go up the hierarchy for more iterations
      if (status == 0 && !last_layer) {
        kmp_int32 hid;
        __kmp_type_convert(hier_id, &hid);
        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                              &my_st, hid, hier_level + 1);
        KD_TRACE(
            10,
            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
             gtid, hier_level, status));
        if (status == 1) {
          kmp_hier_private_bdata_t *upper_tdata =
              &(th->th.th_hier_bar_data[hier_level + 1]);
          my_sh = parent->get_curr_sh(upper_tdata->index);
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                        gtid, hier_level));
          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                        parent->get_curr_lb(upper_tdata->index),
                                        parent->get_curr_ub(upper_tdata->index),
                                        parent->get_curr_st(upper_tdata->index),
#if USE_ITT_BUILD
                                        NULL,
#endif
                                        chunk, nproc, hier_id);
          status = __kmp_dispatch_next_algorithm<T>(
              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
              hier_id);
          if (!status) {
            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                          "setting to 2!\n",
                          gtid, hier_level));
            status = 2;
          }
        }
      }
      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
      // Propagate whether a unit holds the actual global last iteration
      // The contains_last attribute is sent downwards from the top to the
      // bottom of the hierarchy via the contains_last flag inside the
      // private dispatch buffers in the hierarchy's middle layers
      if (contains_last) {
        // If the next_algorithm() method returns 1 for p_last and it is the
        // last layer or our parent contains the last serial chunk, then the
        // chunk must contain the last serial iteration.
        if (last_layer || parent->hier_pr.flags.contains_last) {
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                        "to contain last.\n",
                        gtid, hier_level));
          current->hier_pr.flags.contains_last = contains_last;
        }
        if (!current->hier_pr.flags.contains_last)
          contains_last = FALSE;
      }
      if (p_last)
        *p_last = contains_last;
    } // if primary thread of this unit
    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
                gtid, hier_level));
      current->barrier(previous_id, tdata);
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
                gtid, hier_level, current->get_curr_status(tdata->index)));
    } else {
      KMP_DEBUG_ASSERT(previous_id == 0);
      return status;
    }
    return current->get_curr_status(tdata->index);
  }

public:
  int top_level_nproc;
  int num_layers;
  bool valid;
  int type_size;
  kmp_hier_layer_info_t<T> *info;
  kmp_hier_top_unit_t<T> **layers;
  // Deallocate all memory from this hierarchy
  void deallocate() {
    for (int i = 0; i < num_layers; ++i)
      if (layers[i] != NULL) {
        __kmp_free(layers[i]);
      }
    if (layers != NULL) {
      __kmp_free(layers);
      layers = NULL;
    }
    if (info != NULL) {
      __kmp_free(info);
      info = NULL;
    }
    num_layers = 0;
    valid = false;
  }
  // Returns true if reallocation is needed else false
  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                          const enum sched_type *new_scheds,
                          const ST *new_chunks) const {
    if (!valid || layers == NULL || info == NULL ||
        traits_t<T>::type_size != type_size || n != num_layers)
      return true;
    for (int i = 0; i < n; ++i) {
      if (info[i].type != new_layers[i])
        return true;
      if (info[i].sched != new_scheds[i])
        return true;
      if (info[i].chunk != new_chunks[i])
        return true;
    }
    return false;
  }
  // A single thread should call this function while the other threads wait.
  // It creates a new scheduling hierarchy consisting of new_layers, new_scheds
  // and new_chunks. These should come pre-sorted according to
  // kmp_hier_layer_e value. This function will try to avoid reallocation
  // if it can.
  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                     const enum sched_type *new_scheds, const ST *new_chunks) {
    top_level_nproc = 0;
    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
      KD_TRACE(
          10,
          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
      for (int i = 0; i < n; ++i) {
        info[i].num_active = 0;
        for (int j = 0; j < get_length(i); ++j)
          layers[i][j].active = 0;
      }
      return;
    }
    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
    deallocate();
    type_size = traits_t<T>::type_size;
    num_layers = n;
    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
        sizeof(kmp_hier_layer_info_t<T>) * n);
    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
        sizeof(kmp_hier_top_unit_t<T> *) * n);
    for (int i = 0; i < n; ++i) {
      int max = 0;
      kmp_hier_layer_e layer = new_layers[i];
      info[i].num_active = 0;
      info[i].type = layer;
      info[i].sched = new_scheds[i];
      info[i].chunk = new_chunks[i];
      max = __kmp_hier_max_units[layer + 1];
      if (max == 0) {
        valid = false;
        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
        deallocate();
        return;
      }
      info[i].length = max;
      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
          sizeof(kmp_hier_top_unit_t<T>) * max);
      for (int j = 0; j < max; ++j) {
        layers[i][j].active = 0;
        layers[i][j].hier_pr.flags.use_hier = TRUE;
      }
    }
    valid = true;
  }
  // loc - source file location
  // gtid - global thread identifier
  // pr - this thread's private dispatch buffer (corresponding with gtid)
  // p_last (return value) - pointer to flag indicating this set of iterations
  //                         contains last
  //                         iteration
  // p_lb (return value) - lower bound for this chunk of iterations
  // p_ub (return value) - upper bound for this chunk of iterations
  // p_st (return value) - stride for this chunk of iterations
  //
  // Returns 1 if there are more iterations to perform, 0 otherwise
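  //
  // Both branches below (hand threading and normal) follow the same shape:
  // take a chunk from the lowest-layer unit first, and only when that unit is
  // exhausted climb the hierarchy through next_recurse() to refill it.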
  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
    int status;
    kmp_int32 contains_last = 0;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
    auto parent = pr->get_parent();
    KMP_DEBUG_ASSERT(parent);
    KMP_DEBUG_ASSERT(th);
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent);
    T nproc = (T)parent->get_num_active();
    T unit_id = (T)pr->get_hier_id();
    KD_TRACE(
        10,
        ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
         gtid, nproc, unit_id));
    // Handthreading implementation
    // Each iteration is performed by all threads on last unit (typically
    // cores/tiles)
    // e.g., threads 0,1,2,3 all execute iteration 0
    //       threads 0,1,2,3 all execute iteration 1
    //       threads 4,5,6,7 all execute iteration 2
    //       threads 4,5,6,7 all execute iteration 3
    //       ... etc.
    if (__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
                gtid));
      if (unit_id == 0) {
        // For hand threading, the sh buffer on the lowest level is only ever
        // modified and read by the primary thread on that level. Because of
        // this, we can always use the first sh buffer.
        auto sh = &(parent->hier_barrier.sh[0]);
        KMP_DEBUG_ASSERT(sh);
        status = __kmp_dispatch_next_algorithm<T>(
            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
        if (!status) {
          bool done = false;
          while (!done) {
            done = true;
            kmp_int32 uid;
            __kmp_type_convert(unit_id, &uid);
            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                  p_st, uid, 0);
            if (status == 1) {
              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                            parent->get_next_lb(tdata->index),
                                            parent->get_next_ub(tdata->index),
                                            parent->get_next_st(tdata->index),
#if USE_ITT_BUILD
                                            NULL,
#endif
                                            pr->u.p.parm1, nproc, unit_id);
              sh->u.s.iteration = 0;
              status = __kmp_dispatch_next_algorithm<T>(
                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
                  unit_id);
              if (!status) {
                KD_TRACE(10,
                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                          "after next_pr_sh()"
                          "trying again.\n",
                          gtid));
                done = false;
              }
            } else if (status == 2) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                            "trying again.\n",
                            gtid));
              done = false;
            }
          }
        }
        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
      } // if primary thread of lowest unit level
      parent->barrier(pr->get_hier_id(), tdata);
      if (unit_id != 0) {
        *p_lb = parent->get_curr_lb(tdata->index);
        *p_ub = parent->get_curr_ub(tdata->index);
        *p_st = parent->get_curr_st(tdata->index);
        status = parent->get_curr_status(tdata->index);
      }
    } else {
      // Normal implementation
      // Each thread grabs an iteration chunk and executes it (no cooperation)
      auto sh = parent->get_curr_sh(tdata->index);
      KMP_DEBUG_ASSERT(sh);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
      if (!status) {
        bool done = false;
        while (!done) {
          done = true;
          kmp_int32 uid;
          __kmp_type_convert(unit_id, &uid);
          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                p_st, uid, 0);
          if (status == 1) {
            sh = parent->get_curr_sh(tdata->index);
            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                          parent->get_curr_lb(tdata->index),
                                          parent->get_curr_ub(tdata->index),
                                          parent->get_curr_st(tdata->index),
#if USE_ITT_BUILD
                                          NULL,
#endif
                                          pr->u.p.parm1, nproc, unit_id);
            status = __kmp_dispatch_next_algorithm<T>(
                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
            if (!status) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                            "after next_pr_sh()"
                            "trying again.\n",
                            gtid));
              done = false;
            }
          } else if (status == 2) {
            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                          "trying again.\n",
                          gtid));
            done = false;
          }
        }
      }
    }
    if (contains_last && !parent->hier_pr.flags.contains_last) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                    "contains_last to FALSE\n",
                    gtid));
      contains_last = FALSE;
    }
    if (p_last)
      *p_last = contains_last;
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
                  status));
    return status;
  }
  // These functions probe the layer info structure
  // Returns the type of topology unit given level
  kmp_hier_layer_e get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].type;
  }
  // Returns the schedule type at given level
  enum sched_type get_sched(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].sched;
  }
  // Returns the chunk size at given level
  ST get_chunk(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].chunk;
  }
  // Returns the number of active threads at given level
  int get_num_active(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].num_active;
  }
  // Returns the length of topology unit array at given level
  int get_length(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].length;
  }
  // Returns the topology unit given the level and index
  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    KMP_DEBUG_ASSERT(index >= 0);
    KMP_DEBUG_ASSERT(index < get_length(level));
    return &(layers[level][index]);
  }
  // Returns the number of layers in the hierarchy
  int get_num_layers() const { return num_layers; }
  // Returns the number of threads in the top layer
  // This is necessary because we don't store a topology unit as
  // the very top level and the scheduling algorithms need this information
  int get_top_level_nproc() const { return top_level_nproc; }
  // Return whether this hierarchy is valid or not
  bool is_valid() const { return valid; }
#ifdef KMP_DEBUG
  // Print the hierarchy
  void print() {
    KD_TRACE(10, ("kmp_hier_t:\n"));
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Info[%d] = ", i));
      info[i].print();
    }
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Layer[%d] =\n", i));
      for (int j = 0; j < info[i].length; ++j) {
        layers[i][j].print();
      }
    }
  }
#endif
};

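// Set up hierarchical scheduling for the current team's loop: the team's
// primary thread allocates (or reuses) the kmp_hier_t stored on the shared
// dispatch buffer, each thread then registers with the units it belongs to
// (walking up the layers until it reaches a unit another thread has already
// activated), barrier data is initialized, and the top layer's private buffer
// is seeded with the full [lb, ub] iteration range.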
template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  unsigned int my_buffer_index;
  kmp_info_t *th;
  kmp_team_t *team;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;
  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
#endif // KMP_DEBUG
  KMP_DEBUG_ASSERT(n > 0);
  KMP_DEBUG_ASSERT(new_layers);
  KMP_DEBUG_ASSERT(new_scheds);
  KMP_DEBUG_ASSERT(new_chunks);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    KMP_DEBUG_ASSERT(pr);
    pr->flags.use_hier = FALSE;
    pr->flags.contains_last = FALSE;
    return;
  }
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  pr->flags.use_hier = TRUE;
  pr->u.p.tc = 0;
  // Have primary thread allocate the hierarchy
  if (__kmp_tid_from_gtid(gtid) == 0) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                  "hierarchy\n",
                  gtid, pr, sh));
    if (sh->hier == NULL) {
      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
    }
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    sh->u.s.iteration = 0;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  // Check to make sure the hierarchy is valid
  kmp_hier_t<T> *hier = sh->hier;
  if (!sh->hier->is_valid()) {
    pr->flags.use_hier = FALSE;
    return;
  }
  // Have threads allocate their thread-private barrier data if it hasn't
  // already been allocated
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Have threads "register" themselves by modifying the active count for each
  // level they are involved in. The active count will act as nthreads for that
  // level regarding the scheduling algorithms
  for (int i = 0; i < n; ++i) {
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Setup the thread's private dispatch buffer's hierarchy pointers
    if (i == 0)
      pr->hier_parent = my_unit;
    // If this unit is already active, then increment active count and wait
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
    // Flag that this unit is active
    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
      // Do not setup parent pointer for top level unit since it has no parent
      if (i < n - 1) {
        // Setup middle layer pointers to parents
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 hier->get_type(i + 1));
        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
      } else {
        // Setup top layer information (no parent pointers are set)
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 kmp_hier_layer_e::LAYER_LOOP);
        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
        my_unit->hier_parent = nullptr;
      }
      // Set trip count to 0 so that next() operation will initially climb up
      // the hierarchy to get more iterations (early exit in next() for tc == 0)
      my_unit->get_my_pr()->u.p.tc = 0;
      // Increment this layer's number of active units
      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "incrementing num_active\n",
                    gtid, my_unit));
    } else {
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
  }
  // Set this thread's id
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // For oversubscribed threads, increment their index within the lowest unit
  // This is done to prevent having two or more threads with id 0, id 1, etc.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));

  pr->flags.contains_last = FALSE;
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Now that the number of active threads at each level is determined,
  // the barrier data for each unit can be initialized and the last layer's
  // loop information can be initialized.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Only primary threads of this unit within the hierarchy do initialization
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    my_unit->reset_shared_barrier();
    my_unit->hier_pr.flags.contains_last = FALSE;
    // Last layer, initialize the private buffers with entire loop information
    // Now the next next_algorithm() call will get the first chunk of
    // iterations properly
    if (i == n - 1) {
      __kmp_dispatch_init_algorithm<T>(
          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
#if USE_ITT_BUILD
          NULL,
#endif
          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    }
    prev_id = my_unit->get_hier_id();
  }
  // Initialize each layer of the thread's private barrier data
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

#ifdef KMP_DEBUG
  if (__kmp_tid_from_gtid(gtid) == 0) {
    for (int i = 0; i < n; ++i) {
      KD_TRACE(10,
               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
                gtid, i, hier->get_num_active(i)));
    }
    hier->print();
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#endif // KMP_DEBUG
}
#endif