Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h
/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() ||
            (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
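// A minimal usage sketch for kmp_hw_attr_t, assuming the hybrid core-type
// enumerators defined in kmp.h (e.g. KMP_HW_CORE_TYPE_CORE on x86); the
// snippet is illustrative only:
//
//   kmp_hw_attr_t request; // "any performance core, any efficiency"
//   request.set_core_type(KMP_HW_CORE_TYPE_CORE);
//
//   kmp_hw_attr_t hw; // attributes recorded for one hardware core
//   hw.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   hw.set_core_eff(1);
//
//   bool ok = hw.contains(request); // true: core type matches and the
//                                   // request leaves core_eff unspecified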
class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemA's per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. If they are, return true;
  // otherwise return false.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf(" num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;
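// A minimal sketch of how a parsed KMP_HW_SUBSET string such as "2c,1t" flows
// through kmp_hw_subset_t, assuming __kmp_topology has already been built with
// socket, core, and thread levels (canonicalize() requires them):
//
//   kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
//   subset->push_back(2, KMP_HW_CORE, 0, kmp_hw_attr_t{});   // "2c"
//   subset->push_back(1, KMP_HW_THREAD, 0, kmp_hw_attr_t{}); // "1t"
//   subset->canonicalize(__kmp_topology); // now *s,2c,1t and marked absolute
//   // ... the topology is then filtered against the canonicalized subset ...
//   kmp_hw_subset_t::deallocate(subset);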
/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when a non-OpenMP thread repeatedly launches/joins a thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
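  // Worked example for the numPerLevel comment above: with numPerLevel =
  // {2, 4, 4, 1, 1, 1, 1} (4 packages, 4 cores/package, 2 threads/core),
  // init() computes depth = 4 and skipPerLevel = {1, 2, 8, 32, 64, 128, 256}.
  // For i < depth, skipPerLevel[i] is the product of numPerLevel[0..i-1], the
  // number of leaf threads under one node at level i; the remaining entries
  // are successively doubled to leave room for oversubscription.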
  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H