GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h

/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
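
// Illustrative sketch (not part of this header): typical client use of the
// KMPAffinity::Mask interface above, independent of which backend (hwloc or
// native) was selected; `affinity` is a hypothetical pointer to the active
// KMPAffinity implementation.
//
//   KMPAffinity::Mask *m = affinity->allocate_mask();
//   m->zero();
//   m->set(3); // request logical CPU 3
//   m->set_system_affinity(/*abort_on_error=*/true);
//   affinity->deallocate_mask(m);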

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
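
// Illustrative sketch (not from this file): on Linux the runtime invokes the
// affinity syscalls directly with the numbers pinned above, along these
// lines (buffer size chosen arbitrarily for the sketch):
//
//   unsigned long buf[16] = {0};
//   long r = syscall(__NR_sched_getaffinity, 0 /* calling thread */,
//                    sizeof(buf), buf);
//   // r < 0: error code is in errno; r >= 0: buf now holds the CPU bitmap.
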
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
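    // Worked example of the bit indexing above, assuming 64-bit mask_t:
    // CPU 70 lives in word 70 / 64 = 1 at bit 70 % 64 = 6, so set(70)
    // performs mask[1] |= (ONE << 6).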
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
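    // Example: with more than one group, a mask whose set bits all fall in
    // processor group 2 yields 2; a mask spanning two groups yields -1, since
    // a Windows thread affinity mask cannot cross processor-group boundaries.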
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
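
// Illustrative sketch (not part of this header) of the contains() semantics
// above; KMP_HW_CORE_TYPE_CORE is the x86-only enumerator for a performance
// core, used here purely as an example:
//
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_eff(1);
//   // a.contains(b) == false: b constrains core_eff, which a leaves unset.
//   // b.contains(a) == true: every attribute a specifies, b also satisfies.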

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
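  // Worked example: with types = [socket, core, thread] and ratio = {4, 6, 2}
  // (6 cores per socket, 2 threads per core), calculate_ratio(thread_level,
  // socket_level) returns 2 * 6 = 12 hardware threads per socket.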
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
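  // Illustrative sketch (not part of this header): a setting such as
  // KMP_HW_SUBSET=2s,4c,2t is parsed elsewhere in the runtime into one
  // push_back() call per layer, along these lines:
  //
  //   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});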
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
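  /** skipPerLevel[i] is the number of leaves spanned by one subtree rooted at
      level i: skipPerLevel[0] = 1 and skipPerLevel[i] = numPerLevel[i - 1] *
      skipPerLevel[i - 1] (see init() below). For the example above,
      numPerLevel = {2, 4, 4, 1, 1} yields skipPerLevel = {1, 2, 8, 32, 32}. */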

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H