GitHub Repository: official-stockfish/Stockfish
Path: blob/master/src/numa.h

/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef NUMA_H_INCLUDED
#define NUMA_H_INCLUDED

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "memory.h"

// We support Linux very well, but we explicitly do NOT support Android,
// because there are no affected systems and it is not worth maintaining.
#if defined(__linux__) && !defined(__ANDROID__)
#if !defined(_GNU_SOURCE)
#define _GNU_SOURCE
#endif
#include <sched.h>
#elif defined(_WIN64)

#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0601  // Force to include needed API prototypes
#endif

// On Windows each processor group can have up to 64 processors.
// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64;
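
// Note (illustrative, derived from how the indices are used below): a global
// CpuIndex on Windows is formed as group * WIN_PROCESSOR_GROUP_SIZE + index
// within the group, so e.g. processor 3 of processor group 2 maps to
// CpuIndex 2 * 64 + 3 == 131.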

#if !defined(NOMINMAX)
#define NOMINMAX
#endif
#include <windows.h>
#if defined small
#undef small
#endif

// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks
using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT);

// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);

#endif

#include "misc.h"

namespace Stockfish {

using CpuIndex  = size_t;
using NumaIndex = size_t;

inline CpuIndex get_hardware_concurrency() {
    CpuIndex concurrency = std::thread::hardware_concurrency();

    // Get all processors across all processor groups on Windows, since
    // hardware_concurrency() only returns the number of processors in
    // the first group, because only these are available to std::thread.
#ifdef _WIN64
    concurrency = std::max<CpuIndex>(concurrency, GetActiveProcessorCount(ALL_PROCESSOR_GROUPS));
#endif

    return concurrency;
}

inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());

#if defined(_WIN64)

struct WindowsAffinity {
    std::optional<std::set<CpuIndex>> oldApi;
    std::optional<std::set<CpuIndex>> newApi;

    // We also provide a diagnostic for when the affinity is set to nullopt:
    // whether it was due to being indeterminate. If the affinity is indeterminate
    // it is best to assume it is not set at all, consistent with the meaning
    // of the nullopt affinity.
    bool isNewDeterminate = true;
    bool isOldDeterminate = true;

    std::optional<std::set<CpuIndex>> get_combined() const {
        if (!oldApi.has_value())
            return newApi;
        if (!newApi.has_value())
            return oldApi;

        std::set<CpuIndex> intersect;
        std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(),
                              std::inserter(intersect, intersect.begin()));
        return intersect;
    }

    // Since Windows 11 and Windows Server 2022 thread affinities can span
    // processor groups and can be set as such by a new WinAPI function. However,
    // we may need to force using the old API if we detect that the process has
    // affinity set by the old API already and we want to override that. Due to the
    // limitations of the old API we cannot detect its use reliably. There will be
    // cases where we detect no use even though it has actually been used, and vice versa.
    bool likely_used_old_api() const { return oldApi.has_value() || !isOldDeterminate; }
};

inline std::pair<BOOL, std::vector<USHORT>> get_process_group_affinity() {

    // GetProcessGroupAffinity requires the GroupArray argument to be
    // aligned to 4 bytes instead of just 2.
    static constexpr size_t GroupArrayMinimumAlignment = 4;
    static_assert(GroupArrayMinimumAlignment >= alignof(USHORT));

    // The function should succeed the second time, but it may fail if the group
    // affinity has changed between GetProcessGroupAffinity calls. In such a case
    // we consider this a hard error, as we cannot work with unstable affinities
    // anyway.
    static constexpr int MAX_TRIES  = 2;
    USHORT               GroupCount = 1;
    for (int i = 0; i < MAX_TRIES; ++i)
    {
        auto GroupArray = std::make_unique<USHORT[]>(
          GroupCount + (GroupArrayMinimumAlignment / alignof(USHORT) - 1));

        USHORT* GroupArrayAligned = align_ptr_up<GroupArrayMinimumAlignment>(GroupArray.get());

        const BOOL status =
          GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, GroupArrayAligned);

        if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        {
            break;
        }

        if (status != 0)
        {
            return std::make_pair(status,
                                  std::vector(GroupArrayAligned, GroupArrayAligned + GroupCount));
        }
    }

    return std::make_pair(0, std::vector<USHORT>());
}

// On Windows there are two ways to set affinity, and therefore two ways to get it.
// These are not consistent, so we have to check both. In some cases it is actually
// not possible to determine affinity. For example when two different threads have
// affinity on different processor groups, set using SetThreadAffinityMask, we cannot
// retrieve the actual affinities.
// From documentation on GetProcessAffinityMask:
// > If the calling process contains threads in multiple groups,
// > the function returns zero for both affinity masks.
// In such cases we just give up and assume we have affinity for all processors.
// nullopt means no affinity is set, that is, all processors are allowed.
inline WindowsAffinity get_process_affinity() {
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto    GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
      (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));

    BOOL status = 0;

    WindowsAffinity affinity;

    if (GetThreadSelectedCpuSetMasks_f != nullptr)
    {
        USHORT RequiredMaskCount;
        status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);

        // We expect ERROR_INSUFFICIENT_BUFFER from GetThreadSelectedCpuSetMasks,
        // but any other failure is an actual error.
        if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        {
            affinity.isNewDeterminate = false;
        }
        else if (RequiredMaskCount > 0)
        {
            // If RequiredMaskCount == 0 then these affinities were never set, but it's
            // not consistent, so GetProcessAffinityMask may still return some affinity.
            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);

            status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
                                                    RequiredMaskCount, &RequiredMaskCount);

            if (status == 0)
            {
                affinity.isNewDeterminate = false;
            }
            else
            {
                std::set<CpuIndex> cpus;

                for (USHORT i = 0; i < RequiredMaskCount; ++i)
                {
                    const size_t procGroupIndex = groupAffinities[i].Group;

                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                    {
                        if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                    }
                }

                affinity.newApi = std::move(cpus);
            }
        }
    }

    // NOTE: There is no way to determine full affinity using the old API if
    //       individual threads set affinity on different processor groups.

    DWORD_PTR proc, sys;
    status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys);

    // If proc == 0 then we cannot determine affinity because it spans processor groups.
    // On Windows 11 and Server 2022 it will instead do the following:
    // > If, however, hHandle specifies a handle to the current process, the function
    // > always uses the calling thread's primary group (which by default is the same
    // > as the process' primary group) in order to set the
    // > lpProcessAffinityMask and lpSystemAffinityMask.
    // So it will never be indeterminate here. We can only make assumptions later.
    if (status == 0 || proc == 0)
    {
        affinity.isOldDeterminate = false;
        return affinity;
    }

    // If SetProcessAffinityMask was never called the affinity must span
    // all processor groups, but if it was called it must only span one.

    std::vector<USHORT> groupAffinity;  // We need to capture this later, and capturing
                                        // from structured bindings requires C++20.

    std::tie(status, groupAffinity) = get_process_group_affinity();
    if (status == 0)
    {
        affinity.isOldDeterminate = false;
        return affinity;
    }

    if (groupAffinity.size() == 1)
    {
        // We detect the case when affinity is set to all processors and correctly
        // leave affinity.oldApi as nullopt.
        if (GetActiveProcessorGroupCount() != 1 || proc != sys)
        {
            std::set<CpuIndex> cpus;

            const size_t procGroupIndex = groupAffinity[0];

            const uint64_t mask = static_cast<uint64_t>(proc);
            for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
            {
                if (mask & (KAFFINITY(1) << j))
                    cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
            }

            affinity.oldApi = std::move(cpus);
        }
    }
    else
    {
        // If we got here it means that either SetProcessAffinityMask was never called
        // or we're on Windows 11/Server 2022.

        // Since Windows 11 and Windows Server 2022 the behaviour of
        // GetProcessAffinityMask changed:
        // > If, however, hHandle specifies a handle to the current process,
        // > the function always uses the calling thread's primary group
        // > (which by default is the same as the process' primary group)
        // > in order to set the lpProcessAffinityMask and lpSystemAffinityMask.
        // In which case we can actually retrieve the full affinity.

        if (GetThreadSelectedCpuSetMasks_f != nullptr)
        {
            std::thread th([&]() {
                std::set<CpuIndex> cpus;
                bool               isAffinityFull = true;

                for (auto procGroupIndex : groupAffinity)
                {
                    const int numActiveProcessors =
                      GetActiveProcessorCount(static_cast<WORD>(procGroupIndex));

                    // We have to schedule to two different processors
                    // and AND the affinities we get. Otherwise our processor
                    // choice could influence the resulting affinity.
                    // We assume the processor IDs within the group are
                    // filled sequentially from 0.
                    uint64_t procCombined = std::numeric_limits<uint64_t>::max();
                    uint64_t sysCombined  = std::numeric_limits<uint64_t>::max();

                    for (int i = 0; i < std::min(numActiveProcessors, 2); ++i)
                    {
                        GROUP_AFFINITY GroupAffinity;
                        std::memset(&GroupAffinity, 0, sizeof(GROUP_AFFINITY));
                        GroupAffinity.Group = static_cast<WORD>(procGroupIndex);

                        GroupAffinity.Mask = static_cast<KAFFINITY>(1) << i;

                        status =
                          SetThreadGroupAffinity(GetCurrentThread(), &GroupAffinity, nullptr);
                        if (status == 0)
                        {
                            affinity.isOldDeterminate = false;
                            return;
                        }

                        SwitchToThread();

                        DWORD_PTR proc2, sys2;
                        status = GetProcessAffinityMask(GetCurrentProcess(), &proc2, &sys2);
                        if (status == 0)
                        {
                            affinity.isOldDeterminate = false;
                            return;
                        }

                        procCombined &= static_cast<uint64_t>(proc2);
                        sysCombined &= static_cast<uint64_t>(sys2);
                    }

                    if (procCombined != sysCombined)
                        isAffinityFull = false;

                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                    {
                        if (procCombined & (KAFFINITY(1) << j))
                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                    }
                }

                // We have to detect the case where the affinity was not set,
                // or is set to all processors, so that we correctly produce a
                // std::nullopt result.
                if (!isAffinityFull)
                {
                    affinity.oldApi = std::move(cpus);
                }
            });

            th.join();
        }
    }

    return affinity;
}

#endif

#if defined(__linux__) && !defined(__ANDROID__)

inline std::set<CpuIndex> get_process_affinity() {

    std::set<CpuIndex> cpus;

    // For unsupported systems, or in case of a soft error, we may assume
    // all processors are available for use.
    [[maybe_unused]] auto set_to_all_cpus = [&]() {
        for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
            cpus.insert(c);
    };

    // cpu_set_t by default holds 1024 entries. This may not be enough soon,
    // but there is no easy way to determine how many processors there actually
    // are. In this case we just choose a reasonable upper bound.
    static constexpr CpuIndex MaxNumCpus = 1024 * 64;

    cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
    if (mask == nullptr)
        std::exit(EXIT_FAILURE);

    const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);

    CPU_ZERO_S(masksize, mask);

    const int status = sched_getaffinity(0, masksize, mask);

    if (status != 0)
    {
        CPU_FREE(mask);
        std::exit(EXIT_FAILURE);
    }

    for (CpuIndex c = 0; c < MaxNumCpus; ++c)
        if (CPU_ISSET_S(c, masksize, mask))
            cpus.insert(c);

    CPU_FREE(mask);

    return cpus;
}

#endif

#if defined(__linux__) && !defined(__ANDROID__)

inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();

#elif defined(_WIN64)

inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();
inline static const auto STARTUP_USE_OLD_AFFINITY_API =
  STARTUP_PROCESSOR_AFFINITY.likely_used_old_api();

#endif

// We want to abstract the purpose of storing the numa node index somewhat.
// Whoever is using this does not need to know the specifics of the replication
// machinery to be able to access NUMA replicated memory.
class NumaReplicatedAccessToken {
   public:
    NumaReplicatedAccessToken() :
        n(0) {}

    explicit NumaReplicatedAccessToken(NumaIndex idx) :
        n(idx) {}

    NumaIndex get_numa_index() const { return n; }

   private:
    NumaIndex n;
};

// Designed as immutable, because there is no good reason to alter an already
// existing config in a way that doesn't require recreating it completely, and
// it would be complex and expensive to maintain class invariants.
// The CPU (processor) numbers always correspond to the actual numbering used
// by the system. The NUMA node numbers MAY NOT correspond to the system's
// numbering of the NUMA nodes. In particular, empty nodes may be removed, or
// the user may create custom nodes. It is guaranteed that NUMA nodes are NOT
// empty: every node exposed by NumaConfig has at least one processor assigned.
//
// We use the affinities captured at startup so that the engine does not modify
// its own behaviour over time.
//
// Since Stockfish doesn't support exceptions, all places where an exception
// should be thrown are replaced by std::exit.
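//
// A minimal usage sketch (illustrative only; the thread count and the worker
// loop are hypothetical, not code from this file):
//
//     NumaConfig cfg = NumaConfig::from_system();
//     CpuIndex   requestedThreads = 32;
//     if (cfg.suggests_binding_threads(requestedThreads))
//     {
//         std::vector<NumaIndex> ns = cfg.distribute_threads_among_numa_nodes(requestedThreads);
//         // Each worker thread i then calls, from its own thread,
//         //     NumaReplicatedAccessToken tok = cfg.bind_current_thread_to_numa_node(ns[i]);
//         // and uses the token to access NUMA-replicated data.
//     }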
class NumaConfig {
   public:
    NumaConfig() :
        highestCpuIndex(0),
        customAffinity(false) {
        const auto numCpus = SYSTEM_THREADS_NB;
        add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1);
    }

    // This function queries the system for the mapping of processors to NUMA nodes.
    // On Linux we read from the standardized kernel sysfs, with a fallback to a single
    // NUMA node. On Windows we utilize GetNumaProcessorNodeEx, which has its quirks; see
    // the comment for the Windows implementation of get_process_affinity.
    static NumaConfig from_system([[maybe_unused]] bool respectProcessAffinity = true) {
        NumaConfig cfg = empty();

#if defined(__linux__) && !defined(__ANDROID__)

        std::set<CpuIndex> allowedCpus;

        if (respectProcessAffinity)
            allowedCpus = STARTUP_PROCESSOR_AFFINITY;

        auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) {
            return !respectProcessAffinity || allowedCpus.count(c) == 1;
        };

        // On Linux things are straightforward, since there are no processor groups and
        // any thread can be scheduled on all processors.
        // We try to gather this information from the sysfs first
        // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node

        bool useFallback = false;
        auto fallback    = [&]() {
            useFallback = true;
            cfg         = empty();
        };

        // /sys/devices/system/node/online contains information about active NUMA nodes
        auto nodeIdsStr = read_file_to_string("/sys/devices/system/node/online");
        if (!nodeIdsStr.has_value() || nodeIdsStr->empty())
        {
            fallback();
        }
        else
        {
            remove_whitespace(*nodeIdsStr);
            for (size_t n : indices_from_shortened_string(*nodeIdsStr))
            {
                // /sys/devices/system/node/node.../cpulist
                std::string path =
                  std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist";
                auto cpuIdsStr = read_file_to_string(path);
                // Now, we only bail if the file does not exist. Some nodes may be
                // empty, and that's fine. An empty node still has a file that appears
                // to have some whitespace, so we need to handle that.
                if (!cpuIdsStr.has_value())
                {
                    fallback();
                    break;
                }
                else
                {
                    remove_whitespace(*cpuIdsStr);
                    for (size_t c : indices_from_shortened_string(*cpuIdsStr))
                    {
                        if (is_cpu_allowed(c))
                            cfg.add_cpu_to_node(n, c);
                    }
                }
            }
        }

        if (useFallback)
        {
            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
                if (is_cpu_allowed(c))
                    cfg.add_cpu_to_node(NumaIndex{0}, c);
        }

#elif defined(_WIN64)

        std::optional<std::set<CpuIndex>> allowedCpus;

        if (respectProcessAffinity)
            allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined();

        // The affinity cannot be determined in all cases on Windows,
        // but we at least guarantee that the number of allowed processors
        // is >= the number of processors in the affinity mask. In case the user
        // is not satisfied they must set the processor numbers explicitly.
        auto is_cpu_allowed = [&allowedCpus](CpuIndex c) {
            return !allowedCpus.has_value() || allowedCpus->count(c) == 1;
        };

        WORD numProcGroups = GetActiveProcessorGroupCount();
        for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup)
        {
            for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number)
            {
                PROCESSOR_NUMBER procnum;
                procnum.Group    = procGroup;
                procnum.Number   = number;
                procnum.Reserved = 0;
                USHORT nodeNumber;

                const BOOL     status = GetNumaProcessorNodeEx(&procnum, &nodeNumber);
                const CpuIndex c      = static_cast<CpuIndex>(procGroup) * WIN_PROCESSOR_GROUP_SIZE
                                 + static_cast<CpuIndex>(number);
                if (status != 0 && nodeNumber != std::numeric_limits<USHORT>::max()
                    && is_cpu_allowed(c))
                {
                    cfg.add_cpu_to_node(nodeNumber, c);
                }
            }
        }

        // Split the NUMA nodes to be contained within a group if necessary.
        // This is needed between Windows 10 Build 20348 and Windows 11, because
        // the new NUMA allocation behaviour was introduced while there was
        // still no way to set thread affinity spanning multiple processor groups.
        // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
        // We also do this if we need to force the old API for some reason.
        //
        // 2024-08-26: It appears that we need to actually always force this behaviour.
        // While Windows allows this to work now, such assignments have a bad interaction
        // with the scheduler - in particular it still prefers scheduling on the thread's
        // "primary" node, even if it means scheduling SMT processors first.
        // See https://github.com/official-stockfish/Stockfish/issues/5551
        // See https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
        //
        // Each process is assigned a primary group at creation, and by default all
        // of its threads' primary group is the same. Each thread's ideal processor
        // is in the thread's primary group, so threads will preferentially be
        // scheduled to processors on their primary group, but they are able to
        // be scheduled to processors on any other group.
        //
        // used to be guarded by if (STARTUP_USE_OLD_AFFINITY_API)
        {
            NumaConfig splitCfg = empty();

            NumaIndex splitNodeIndex = 0;
            for (const auto& cpus : cfg.nodes)
            {
                if (cpus.empty())
                    continue;

                size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE;
                for (CpuIndex c : cpus)
                {
                    const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;
                    if (procGroupIndex != lastProcGroupIndex)
                    {
                        splitNodeIndex += 1;
                        lastProcGroupIndex = procGroupIndex;
                    }
                    splitCfg.add_cpu_to_node(splitNodeIndex, c);
                }
                splitNodeIndex += 1;
            }

            cfg = std::move(splitCfg);
        }

#else

        // Fallback for unsupported systems.
        for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
            cfg.add_cpu_to_node(NumaIndex{0}, c);

#endif

        // We have to ensure no empty NUMA nodes persist.
        cfg.remove_empty_numa_nodes();

        // If the user explicitly opts out from respecting the current process affinity
        // then it may be inconsistent with the current affinity (obviously), so we
        // consider it custom.
        if (!respectProcessAffinity)
            cfg.customAffinity = true;

        return cfg;
    }

    // ':'-separated numa nodes
    // ','-separated cpu indices
    // supports "first-last" range syntax for cpu indices
    // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191"
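    //
    // Illustrative sketch (not part of the engine): parsing and round-tripping a
    // hypothetical two-node layout.
    //
    //     NumaConfig cfg = NumaConfig::from_string("0-7,16-23:8-15,24-31");
    //     assert(cfg.num_numa_nodes() == 2);
    //     assert(cfg.num_cpus() == 32);
    //     assert(cfg.to_string() == "0-7,16-23:8-15,24-31");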
    static NumaConfig from_string(const std::string& s) {
        NumaConfig cfg = empty();

        NumaIndex n = 0;
        for (auto&& nodeStr : split(s, ":"))
        {
            auto indices = indices_from_shortened_string(std::string(nodeStr));
            if (!indices.empty())
            {
                for (auto idx : indices)
                {
                    if (!cfg.add_cpu_to_node(n, CpuIndex(idx)))
                        std::exit(EXIT_FAILURE);
                }

                n += 1;
            }
        }

        cfg.customAffinity = true;

        return cfg;
    }

    NumaConfig(const NumaConfig&)            = delete;
    NumaConfig(NumaConfig&&)                 = default;
    NumaConfig& operator=(const NumaConfig&) = delete;
    NumaConfig& operator=(NumaConfig&&)      = default;

    bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; }

    NumaIndex num_numa_nodes() const { return nodes.size(); }

    CpuIndex num_cpus_in_numa_node(NumaIndex n) const {
        assert(n < nodes.size());
        return nodes[n].size();
    }

    CpuIndex num_cpus() const { return nodeByCpu.size(); }

    bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; }

    std::string to_string() const {
        std::string str;

        bool isFirstNode = true;
        for (auto&& cpus : nodes)
        {
            if (!isFirstNode)
                str += ":";

            bool isFirstSet = true;
            auto rangeStart = cpus.begin();
            for (auto it = cpus.begin(); it != cpus.end(); ++it)
            {
                auto next = std::next(it);
                if (next == cpus.end() || *next != *it + 1)
                {
                    // cpus[i] is at the end of a range (which may be of size 1)
                    if (!isFirstSet)
                        str += ",";

                    const CpuIndex last = *it;

                    if (it != rangeStart)
                    {
                        const CpuIndex first = *rangeStart;

                        str += std::to_string(first);
                        str += "-";
                        str += std::to_string(last);
                    }
                    else
                        str += std::to_string(last);

                    rangeStart = next;
                    isFirstSet = false;
                }
            }

            isFirstNode = false;
        }

        return str;
    }

    bool suggests_binding_threads(CpuIndex numThreads) const {
        // If we can reasonably determine that the threads cannot be contained
        // by the OS within the first NUMA node then we advise distributing
        // and binding threads. When the threads are not bound we can only use
        // NUMA memory replicated objects from the first node, so when the OS
        // has to schedule on other nodes we lose performance. We also suggest
        // binding if there are enough threads to distribute among nodes with minimal
        // disparity. We try to ignore small nodes, in particular the empty ones.

        // If the affinity set by the user does not match the affinity given by
        // the OS then binding is necessary to ensure the threads are running on
        // the correct processors.
        if (customAffinity)
            return true;

        // We obviously cannot distribute a single thread, so a single thread
        // should never be bound.
        if (numThreads <= 1)
            return false;

        size_t largestNodeSize = 0;
        for (auto&& cpus : nodes)
            if (cpus.size() > largestNodeSize)
                largestNodeSize = cpus.size();

        auto is_node_small = [largestNodeSize](const std::set<CpuIndex>& node) {
            static constexpr double SmallNodeThreshold = 0.6;
            return static_cast<double>(node.size()) / static_cast<double>(largestNodeSize)
                <= SmallNodeThreshold;
        };

        size_t numNotSmallNodes = 0;
        for (auto&& cpus : nodes)
            if (!is_node_small(cpus))
                numNotSmallNodes += 1;

        return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4)
            && nodes.size() > 1;
    }
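
    // Assigns each of numThreads prospective threads to a NUMA node, greedily
    // picking the node with the lowest prospective fill ratio (occupancy divided
    // by node size). For example (illustrative numbers only): with two nodes of
    // 8 processors each and numThreads == 4, the result is {0, 1, 0, 1}.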
    std::vector<NumaIndex> distribute_threads_among_numa_nodes(CpuIndex numThreads) const {
        std::vector<NumaIndex> ns;

        if (nodes.size() == 1)
        {
            // Special case for when there is effectively no NUMA (a single node).
            // This doesn't buy us much, but let's keep the default path simple.
            ns.resize(numThreads, NumaIndex{0});
        }
        else
        {
            std::vector<size_t> occupation(nodes.size(), 0);
            for (CpuIndex c = 0; c < numThreads; ++c)
            {
                NumaIndex bestNode{0};
                float     bestNodeFill = std::numeric_limits<float>::max();
                for (NumaIndex n = 0; n < nodes.size(); ++n)
                {
                    float fill =
                      static_cast<float>(occupation[n] + 1) / static_cast<float>(nodes[n].size());
                    // NOTE: Do we want to perhaps fill the first available node
                    //       up to 50% first before considering other nodes?
                    //       Probably not, because it would interfere with running
                    //       multiple instances. We basically shouldn't favor any
                    //       particular node.
                    if (fill < bestNodeFill)
                    {
                        bestNode     = n;
                        bestNodeFill = fill;
                    }
                }
                ns.emplace_back(bestNode);
                occupation[bestNode] += 1;
            }
        }

        return ns;
    }

    NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const {
        if (n >= nodes.size() || nodes[n].size() == 0)
            std::exit(EXIT_FAILURE);

#if defined(__linux__) && !defined(__ANDROID__)

        cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1);
        if (mask == nullptr)
            std::exit(EXIT_FAILURE);

        const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1);

        CPU_ZERO_S(masksize, mask);

        for (CpuIndex c : nodes[n])
            CPU_SET_S(c, masksize, mask);

        const int status = sched_setaffinity(0, masksize, mask);

        CPU_FREE(mask);

        if (status != 0)
            std::exit(EXIT_FAILURE);

        // We yield this thread just to be sure it gets rescheduled.
        // This is defensive, allowed because this code is not performance critical.
        sched_yield();

#elif defined(_WIN64)

        // Requires Windows 11. There is no good way to set thread affinity spanning
        // processor groups before that.
        HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
        auto    SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
          (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));

        // We ALWAYS set affinity with the new API if available, because
        // there are no downsides, and we forcibly keep it consistent with
        // the old API should we need to use it. I.e. we always keep this
        // as a superset of what we set with SetThreadGroupAffinity.
        if (SetThreadSelectedCpuSetMasks_f != nullptr)
        {
            // Only available on Windows 11 and Windows Server 2022 onwards.
            const USHORT numProcGroups = USHORT(
              ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE);
            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(numProcGroups);
            std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups);
            for (WORD i = 0; i < numProcGroups; ++i)
                groupAffinities[i].Group = i;

            for (CpuIndex c : nodes[n])
            {
                const size_t procGroupIndex     = c / WIN_PROCESSOR_GROUP_SIZE;
                const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
                groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup;
            }

            HANDLE hThread = GetCurrentThread();

            const BOOL status =
              SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups);
            if (status == 0)
                std::exit(EXIT_FAILURE);

            // We yield this thread just to be sure it gets rescheduled.
            // This is defensive, allowed because this code is not performance critical.
            SwitchToThread();
        }

        // Sometimes we need to force the old API, but we do not use it unless necessary.
        if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API)
        {
            // On earlier Windows versions (since Windows 7) we cannot run a single thread
            // on multiple processor groups, so we need to restrict the group.
            // We assume the group of the first processor listed for this node.
            // Processors from outside this group will not be assigned for this thread.
            // Normally this won't be an issue because Windows used to assign NUMA nodes
            // such that they cannot span processor groups. However, since Windows 10
            // Build 20348 the behaviour changed, so there's a small window of versions
            // between this and Windows 11 that might exhibit problems with not all
            // processors being utilized.
            //
            // We handle this in NumaConfig::from_system by manually splitting the
            // nodes when we detect that there is no function to set affinity spanning
            // processor groups. This is required because otherwise our thread distribution
            // code may produce suboptimal results.
            //
            // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
            GROUP_AFFINITY affinity;
            std::memset(&affinity, 0, sizeof(GROUP_AFFINITY));
            // We use an ordered set to be sure to get the smallest cpu number here.
            const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE;
            affinity.Group                    = static_cast<WORD>(forcedProcGroupIndex);
            for (CpuIndex c : nodes[n])
            {
                const size_t procGroupIndex     = c / WIN_PROCESSOR_GROUP_SIZE;
                const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
                // We skip processors that are not in the same processor group.
                // If everything was set up correctly this will never be an issue,
                // but we have to account for bad NUMA node specifications.
                if (procGroupIndex != forcedProcGroupIndex)
                    continue;

                affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup;
            }

            HANDLE hThread = GetCurrentThread();

            const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr);
            if (status == 0)
                std::exit(EXIT_FAILURE);

            // We yield this thread just to be sure it gets rescheduled. This is
            // defensive, allowed because this code is not performance critical.
            SwitchToThread();
        }

#endif

        return NumaReplicatedAccessToken(n);
    }

    template<typename FuncT>
    void execute_on_numa_node(NumaIndex n, FuncT&& f) const {
        std::thread th([this, &f, n]() {
            bind_current_thread_to_numa_node(n);
            std::forward<FuncT>(f)();
        });

        th.join();
    }

   private:
    std::vector<std::set<CpuIndex>> nodes;
    std::map<CpuIndex, NumaIndex>   nodeByCpu;
    CpuIndex                        highestCpuIndex;

    bool customAffinity;

    static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); }

    struct EmptyNodeTag {};

    NumaConfig(EmptyNodeTag) :
        highestCpuIndex(0),
        customAffinity(false) {}

    void remove_empty_numa_nodes() {
        std::vector<std::set<CpuIndex>> newNodes;
        for (auto&& cpus : nodes)
            if (!cpus.empty())
                newNodes.emplace_back(std::move(cpus));
        nodes = std::move(newNodes);
    }

    // Returns true if successful.
    // Returns false if it failed, i.e. when the cpu is already present.
    // Strong guarantee: on failure the structure remains unmodified.
    bool add_cpu_to_node(NumaIndex n, CpuIndex c) {
        if (is_cpu_assigned(c))
            return false;

        while (nodes.size() <= n)
            nodes.emplace_back();

        nodes[n].insert(c);
        nodeByCpu[c] = n;

        if (c > highestCpuIndex)
            highestCpuIndex = c;

        return true;
    }

    // Returns true if successful.
    // Returns false if it failed, i.e. when any of the cpus is already present.
    // Strong guarantee: on failure the structure remains unmodified.
    bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) {
        for (CpuIndex c = cfirst; c <= clast; ++c)
            if (is_cpu_assigned(c))
                return false;

        while (nodes.size() <= n)
            nodes.emplace_back();

        for (CpuIndex c = cfirst; c <= clast; ++c)
        {
            nodes[n].insert(c);
            nodeByCpu[c] = n;
        }

        if (clast > highestCpuIndex)
            highestCpuIndex = clast;

        return true;
    }

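    // Parses a comma-separated list of indices and "first-last" index ranges.
    // For example (illustrative input), "0-2,5" yields {0, 1, 2, 5}. Parts with
    // more than one '-' are ignored.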
    static std::vector<size_t> indices_from_shortened_string(const std::string& s) {
        std::vector<size_t> indices;

        if (s.empty())
            return indices;

        for (const auto& ss : split(s, ","))
        {
            if (ss.empty())
                continue;

            auto parts = split(ss, "-");
            if (parts.size() == 1)
            {
                const CpuIndex c = CpuIndex{str_to_size_t(std::string(parts[0]))};
                indices.emplace_back(c);
            }
            else if (parts.size() == 2)
            {
                const CpuIndex cfirst = CpuIndex{str_to_size_t(std::string(parts[0]))};
                const CpuIndex clast  = CpuIndex{str_to_size_t(std::string(parts[1]))};
                for (size_t c = cfirst; c <= clast; ++c)
                {
                    indices.emplace_back(c);
                }
            }
        }

        return indices;
    }
};

class NumaReplicationContext;

// Instances of this class are tracked by the NumaReplicationContext instance.
// NumaReplicationContext informs all tracked instances when NUMA configuration changes.
class NumaReplicatedBase {
   public:
    NumaReplicatedBase(NumaReplicationContext& ctx);

    NumaReplicatedBase(const NumaReplicatedBase&) = delete;
    NumaReplicatedBase(NumaReplicatedBase&& other) noexcept;

    NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete;
    NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept;

    virtual void on_numa_config_changed() = 0;
    virtual ~NumaReplicatedBase();

    const NumaConfig& get_numa_config() const;

   private:
    NumaReplicationContext* context;
};

// We force boxing with a unique_ptr. If this becomes an issue due to added
// indirection we may need to add an option for a custom boxing type. When the
// NUMA config changes the value stored at the index 0 is replicated to other nodes.
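//
// Sketch of intended use (illustrative; 'EvalTable' is a made-up payload type,
// and 'tok' is a token returned by NumaConfig::bind_current_thread_to_numa_node):
//
//     NumaReplicationContext ctx(NumaConfig::from_system());
//     NumaReplicated<EvalTable> table(ctx);
//
//     // On a search thread already bound to its NUMA node:
//     //     const EvalTable& local = table[tok];  // copy living on that thread's node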
template<typename T>
class NumaReplicated: public NumaReplicatedBase {
   public:
    using ReplicatorFuncType = std::function<T(const T&)>;

    NumaReplicated(NumaReplicationContext& ctx) :
        NumaReplicatedBase(ctx) {
        replicate_from(T{});
    }

    NumaReplicated(NumaReplicationContext& ctx, T&& source) :
        NumaReplicatedBase(ctx) {
        replicate_from(std::move(source));
    }

    NumaReplicated(const NumaReplicated&) = delete;
    NumaReplicated(NumaReplicated&& other) noexcept :
        NumaReplicatedBase(std::move(other)),
        instances(std::exchange(other.instances, {})) {}

    NumaReplicated& operator=(const NumaReplicated&) = delete;
    NumaReplicated& operator=(NumaReplicated&& other) noexcept {
        NumaReplicatedBase::operator=(std::move(other));
        instances = std::exchange(other.instances, {});

        return *this;
    }

    NumaReplicated& operator=(T&& source) {
        replicate_from(std::move(source));

        return *this;
    }

    ~NumaReplicated() override = default;

    const T& operator[](NumaReplicatedAccessToken token) const {
        assert(token.get_numa_index() < instances.size());
        return *(instances[token.get_numa_index()]);
    }

    const T& operator*() const { return *(instances[0]); }

    const T* operator->() const { return instances[0].get(); }

    template<typename FuncT>
    void modify_and_replicate(FuncT&& f) {
        auto source = std::move(instances[0]);
        std::forward<FuncT>(f)(*source);
        replicate_from(std::move(*source));
    }

    void on_numa_config_changed() override {
        // Use the first one as the source. It doesn't matter which one we use,
        // because they all must be identical, but the first one is guaranteed to exist.
        auto source = std::move(instances[0]);
        replicate_from(std::move(*source));
    }

   private:
    std::vector<std::unique_ptr<T>> instances;

    void replicate_from(T&& source) {
        instances.clear();

        const NumaConfig& cfg = get_numa_config();
        if (cfg.requires_memory_replication())
        {
            for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n)
            {
                cfg.execute_on_numa_node(
                  n, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
            }
        }
        else
        {
            assert(cfg.num_numa_nodes() == 1);
            // We take advantage of the fact that replication is not required
            // and reuse the source value, avoiding one copy operation.
            instances.emplace_back(std::make_unique<T>(std::move(source)));
        }
    }
};

// We force boxing with a unique_ptr. If this becomes an issue due to added
// indirection we may need to add an option for a custom boxing type.
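// Unlike NumaReplicated, copies for nodes other than node 0 are only materialized
// on first access through operator[] (see ensure_present below), under a mutex,
// so nodes that are never accessed never pay for a copy.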
template<typename T>
class LazyNumaReplicated: public NumaReplicatedBase {
   public:
    using ReplicatorFuncType = std::function<T(const T&)>;

    LazyNumaReplicated(NumaReplicationContext& ctx) :
        NumaReplicatedBase(ctx) {
        prepare_replicate_from(T{});
    }

    LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) :
        NumaReplicatedBase(ctx) {
        prepare_replicate_from(std::move(source));
    }

    LazyNumaReplicated(const LazyNumaReplicated&) = delete;
    LazyNumaReplicated(LazyNumaReplicated&& other) noexcept :
        NumaReplicatedBase(std::move(other)),
        instances(std::exchange(other.instances, {})) {}

    LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete;
    LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept {
        NumaReplicatedBase::operator=(std::move(other));
        instances = std::exchange(other.instances, {});

        return *this;
    }

    LazyNumaReplicated& operator=(T&& source) {
        prepare_replicate_from(std::move(source));

        return *this;
    }

    ~LazyNumaReplicated() override = default;

    const T& operator[](NumaReplicatedAccessToken token) const {
        assert(token.get_numa_index() < instances.size());
        ensure_present(token.get_numa_index());
        return *(instances[token.get_numa_index()]);
    }

    const T& operator*() const { return *(instances[0]); }

    const T* operator->() const { return instances[0].get(); }

    template<typename FuncT>
    void modify_and_replicate(FuncT&& f) {
        auto source = std::move(instances[0]);
        std::forward<FuncT>(f)(*source);
        prepare_replicate_from(std::move(*source));
    }

    void on_numa_config_changed() override {
        // Use the first one as the source. It doesn't matter which one we use,
        // because they all must be identical, but the first one is guaranteed to exist.
        auto source = std::move(instances[0]);
        prepare_replicate_from(std::move(*source));
    }

   private:
    mutable std::vector<std::unique_ptr<T>> instances;
    mutable std::mutex                      mutex;

    void ensure_present(NumaIndex idx) const {
        assert(idx < instances.size());

        if (instances[idx] != nullptr)
            return;

        assert(idx != 0);

        std::unique_lock<std::mutex> lock(mutex);
        // Check again, to account for races.
        if (instances[idx] != nullptr)
            return;

        const NumaConfig& cfg = get_numa_config();
        cfg.execute_on_numa_node(
          idx, [this, idx]() { instances[idx] = std::make_unique<T>(*instances[0]); });
    }

    void prepare_replicate_from(T&& source) {
        instances.clear();

        const NumaConfig& cfg = get_numa_config();
        if (cfg.requires_memory_replication())
        {
            assert(cfg.num_numa_nodes() > 0);

            // We just need to make sure the first instance is there.
            // Note that we cannot move here, as we need to reallocate the data
            // on the correct NUMA node.
            cfg.execute_on_numa_node(
              0, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });

            // Prepare the others for lazy initialization.
            instances.resize(cfg.num_numa_nodes());
        }
        else
        {
            assert(cfg.num_numa_nodes() == 1);
            // We take advantage of the fact that replication is not required
            // and reuse the source value, avoiding one copy operation.
            instances.emplace_back(std::make_unique<T>(std::move(source)));
        }
    }
};

class NumaReplicationContext {
   public:
    NumaReplicationContext(NumaConfig&& cfg) :
        config(std::move(cfg)) {}

    NumaReplicationContext(const NumaReplicationContext&) = delete;
    NumaReplicationContext(NumaReplicationContext&&)      = delete;

    NumaReplicationContext& operator=(const NumaReplicationContext&) = delete;
    NumaReplicationContext& operator=(NumaReplicationContext&&)      = delete;

    ~NumaReplicationContext() {
        // The context must outlive the replicated objects.
        if (!trackedReplicatedObjects.empty())
            std::exit(EXIT_FAILURE);
    }

    void attach(NumaReplicatedBase* obj) {
        assert(trackedReplicatedObjects.count(obj) == 0);
        trackedReplicatedObjects.insert(obj);
    }

    void detach(NumaReplicatedBase* obj) {
        assert(trackedReplicatedObjects.count(obj) == 1);
        trackedReplicatedObjects.erase(obj);
    }

    // oldObj may be invalid at this point
    void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) {
        assert(trackedReplicatedObjects.count(oldObj) == 1);
        assert(trackedReplicatedObjects.count(newObj) == 0);
        trackedReplicatedObjects.erase(oldObj);
        trackedReplicatedObjects.insert(newObj);
    }

    void set_numa_config(NumaConfig&& cfg) {
        config = std::move(cfg);
        for (auto&& obj : trackedReplicatedObjects)
            obj->on_numa_config_changed();
    }

    const NumaConfig& get_numa_config() const { return config; }

   private:
    NumaConfig config;

    // std::set uses std::less by default, which is required for pointer comparison.
    std::set<NumaReplicatedBase*> trackedReplicatedObjects;
};

inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) :
    context(&ctx) {
    context->attach(this);
}

inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept :
    context(std::exchange(other.context, nullptr)) {
    context->move_attached(&other, this);
}

inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept {
    context = std::exchange(other.context, nullptr);

    context->move_attached(&other, this);

    return *this;
}

inline NumaReplicatedBase::~NumaReplicatedBase() {
    if (context != nullptr)
        context->detach(this);
}

inline const NumaConfig& NumaReplicatedBase::get_numa_config() const {
    return context->get_numa_config();
}

}  // namespace Stockfish


#endif  // #ifndef NUMA_H_INCLUDED