Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
official-stockfish
GitHub Repository: official-stockfish/Stockfish
Path: blob/master/src/numa.h
634 views
1
/*
2
Stockfish, a UCI chess playing engine derived from Glaurung 2.1
3
Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
4
5
Stockfish is free software: you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation, either version 3 of the License, or
8
(at your option) any later version.
9
10
Stockfish is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
14
15
You should have received a copy of the GNU General Public License
16
along with this program. If not, see <http://www.gnu.org/licenses/>.
17
*/
18
19
#ifndef NUMA_H_INCLUDED
20
#define NUMA_H_INCLUDED
21
22
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <variant>
#include <vector>

#include "shm.h"
41
42
// We support Linux very well, but we explicitly do NOT support Android,
// because there are no affected systems and it is not worth maintaining.
44
#if defined(__linux__) && !defined(__ANDROID__)
45
#if !defined(_GNU_SOURCE)
46
#define _GNU_SOURCE
47
#endif
48
#include <sched.h>
49
#elif defined(_WIN64)
50
51
#if _WIN32_WINNT < 0x0601
52
#undef _WIN32_WINNT
53
#define _WIN32_WINNT 0x0601 // Force to include needed API prototypes
54
#endif
55
56
// On Windows each processor group can have up to 64 processors.
57
// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
58
static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64;
59
60
#if !defined(NOMINMAX)
61
#define NOMINMAX
62
#endif
63
#include <windows.h>
64
#if defined small
65
#undef small
66
#endif
67
68
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks
69
using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT);
70
71
// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
72
using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);
73
74
#endif
75
76
#include "misc.h"
77
78
namespace Stockfish {
79
80
using CpuIndex  = size_t;
using NumaIndex = size_t;

// Returns the number of logical processors the system exposes.
// std::thread::hardware_concurrency() alone is not sufficient on Windows,
// where it only counts the processors of the first processor group (the
// only ones available to std::thread), so there the count is widened with
// GetActiveProcessorCount over all processor groups.
inline CpuIndex get_hardware_concurrency() {
    auto count = static_cast<CpuIndex>(std::thread::hardware_concurrency());

#ifdef _WIN64
    const CpuIndex allGroups = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    if (allGroups > count)
        count = allGroups;
#endif

    return count;
}

// Total number of usable processors, clamped to at least 1 because
// hardware_concurrency() may legitimately report 0 when it cannot tell.
inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());
97
98
#if defined(_WIN64)
99
100
// Holds the process affinity as seen through the two Windows affinity APIs,
// which are not guaranteed to agree with each other. nullopt means "no
// affinity restriction", i.e. all processors allowed.
struct WindowsAffinity {
    // Affinity as determined through the legacy API (GetProcessAffinityMask).
    std::optional<std::set<CpuIndex>> oldApi;
    // Affinity as determined through GetThreadSelectedCpuSetMasks
    // (Windows 11 / Server 2022 onwards).
    std::optional<std::set<CpuIndex>> newApi;

    // We also provide diagnostic for when the affinity is set to nullopt
    // whether it was due to being indeterminate. If affinity is indeterminate
    // it is best to assume it is not set at all, so consistent with the meaning
    // of the nullopt affinity.
    bool isNewDeterminate = true;
    bool isOldDeterminate = true;

    // Combines both views: if only one API reported an affinity, that one is
    // used; if both did, the intersection is taken (a processor is only
    // usable if both APIs allow it).
    std::optional<std::set<CpuIndex>> get_combined() const {
        if (!oldApi.has_value())
            return newApi;
        if (!newApi.has_value())
            return oldApi;

        std::set<CpuIndex> intersect;
        std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(),
                              std::inserter(intersect, intersect.begin()));
        return intersect;
    }

    // Since Windows 11 and Windows Server 2022 thread affinities can span
    // processor groups and can be set as such by a new WinAPI function. However,
    // we may need to force using the old API if we detect that the process has
    // affinity set by the old API already and we want to override that. Due to the
    // limitations of the old API we cannot detect its use reliably. There will be
    // cases where we detect no use but it has actually been used and vice versa.
    bool likely_used_old_api() const { return oldApi.has_value() || !isOldDeterminate; }
};
132
133
// Returns, as (status, groups), the processor groups the current process has
// affinity on. A zero status indicates failure.
inline std::pair<BOOL, std::vector<USHORT>> get_process_group_affinity() {

    // GetProcessGroupAffinity requires the GroupArray argument to be
    // aligned to 4 bytes instead of just 2.
    static constexpr size_t GroupArrayMinimumAlignment = 4;
    static_assert(GroupArrayMinimumAlignment >= alignof(USHORT));

    // The function should succeed the second time, but it may fail if the group
    // affinity has changed between GetProcessGroupAffinity calls. In such case
    // we consider this a hard error, as we cannot work with unstable affinities
    // anyway.
    static constexpr int MAX_TRIES = 2;
    USHORT GroupCount = 1;
    for (int i = 0; i < MAX_TRIES; ++i)
    {
        // Over-allocate by one USHORT so the pointer can be bumped up to the
        // required 4-byte alignment without running out of storage.
        auto GroupArray = std::make_unique<USHORT[]>(
          GroupCount + (GroupArrayMinimumAlignment / alignof(USHORT) - 1));

        USHORT* GroupArrayAligned = align_ptr_up<GroupArrayMinimumAlignment>(GroupArray.get());

        const BOOL status =
          GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, GroupArrayAligned);

        // Any failure other than an undersized buffer is a hard error.
        if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        {
            break;
        }

        if (status != 0)
        {
            return std::make_pair(status,
                                  std::vector(GroupArrayAligned, GroupArrayAligned + GroupCount));
        }

        // Otherwise GroupCount was updated to the required size; retry once.
    }

    return std::make_pair(0, std::vector<USHORT>());
}
170
171
// On Windows there are two ways to set affinity, and therefore 2 ways to get it.
// These are not consistent, so we have to check both. In some cases it is actually
// not possible to determine affinity. For example when two different threads have
// affinity on different processor groups, set using SetThreadAffinityMask, we cannot
// retrieve the actual affinities.
// From documentation on GetProcessAffinityMask:
// > If the calling process contains threads in multiple groups,
// > the function returns zero for both affinity masks.
// In such cases we just give up and assume we have affinity for all processors.
// nullopt means no affinity is set, that is, all processors are allowed
inline WindowsAffinity get_process_affinity() {
    // The new API only exists on Windows 11 / Server 2022 onwards, so it is
    // resolved dynamically rather than linked against.
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
      (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));

    BOOL status = 0;

    WindowsAffinity affinity;

    if (GetThreadSelectedCpuSetMasks_f != nullptr)
    {
        // First call with a null buffer queries the required mask count.
        USHORT RequiredMaskCount;
        status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);

        // We expect ERROR_INSUFFICIENT_BUFFER from GetThreadSelectedCpuSetMasks,
        // but other failure is an actual error.
        if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        {
            affinity.isNewDeterminate = false;
        }
        else if (RequiredMaskCount > 0)
        {
            // A RequiredMaskCount of 0 would mean these affinities were never
            // set. That is not consistent with the old API, though:
            // GetProcessAffinityMask may still return some affinity.
            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);

            status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
                                                    RequiredMaskCount, &RequiredMaskCount);

            if (status == 0)
            {
                affinity.isNewDeterminate = false;
            }
            else
            {
                // Flatten the per-group masks into global CPU indices,
                // group g covering indices [g*64, g*64+63].
                std::set<CpuIndex> cpus;

                for (USHORT i = 0; i < RequiredMaskCount; ++i)
                {
                    const size_t procGroupIndex = groupAffinities[i].Group;

                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                    {
                        if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                    }
                }

                affinity.newApi = std::move(cpus);
            }
        }
    }

    // NOTE: There is no way to determine full affinity using the old API if
    // individual threads set affinity on different processor groups.

    DWORD_PTR proc, sys;
    status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys);

    // If proc == 0 then we cannot determine affinity because it spans processor groups.
    // On Windows 11 and Server 2022 it will instead
    // > If, however, hHandle specifies a handle to the current process, the function
    // > always uses the calling thread's primary group (which by default is the same
    // > as the process' primary group) in order to set the
    // > lpProcessAffinityMask and lpSystemAffinityMask.
    // So it will never be indeterminate here. We can only make assumptions later.
    if (status == 0 || proc == 0)
    {
        affinity.isOldDeterminate = false;
        return affinity;
    }

    // If SetProcessAffinityMask was never called the affinity must span
    // all processor groups, but if it was called it must only span one.

    std::vector<USHORT> groupAffinity;  // We need to capture this later and capturing
                                        // from structured bindings requires c++20.

    std::tie(status, groupAffinity) = get_process_group_affinity();
    if (status == 0)
    {
        affinity.isOldDeterminate = false;
        return affinity;
    }

    if (groupAffinity.size() == 1)
    {
        // We detect the case when affinity is set to all processors and correctly
        // leave affinity.oldApi as nullopt.
        if (GetActiveProcessorGroupCount() != 1 || proc != sys)
        {
            std::set<CpuIndex> cpus;

            const size_t procGroupIndex = groupAffinity[0];

            const uint64_t mask = static_cast<uint64_t>(proc);
            for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
            {
                if (mask & (KAFFINITY(1) << j))
                    cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
            }

            affinity.oldApi = std::move(cpus);
        }
    }
    else
    {
        // If we got here it means that either SetProcessAffinityMask was never set
        // or we're on Windows 11/Server 2022.

        // Since Windows 11 and Windows Server 2022 the behaviour of
        // GetProcessAffinityMask changed:
        // > If, however, hHandle specifies a handle to the current process,
        // > the function always uses the calling thread's primary group
        // > (which by default is the same as the process' primary group)
        // > in order to set the lpProcessAffinityMask and lpSystemAffinityMask.
        // In which case we can actually retrieve the full affinity.

        if (GetThreadSelectedCpuSetMasks_f != nullptr)
        {
            // Probe from a scratch thread so rebinding does not disturb the
            // calling thread's own group affinity.
            std::thread th([&]() {
                std::set<CpuIndex> cpus;
                bool isAffinityFull = true;

                for (auto procGroupIndex : groupAffinity)
                {
                    const int numActiveProcessors =
                      GetActiveProcessorCount(static_cast<WORD>(procGroupIndex));

                    // We have to schedule to two different processors
                    // and & the affinities we get. Otherwise our processor
                    // choice could influence the resulting affinity.
                    // We assume the processor IDs within the group are
                    // filled sequentially from 0.
                    uint64_t procCombined = std::numeric_limits<uint64_t>::max();
                    uint64_t sysCombined  = std::numeric_limits<uint64_t>::max();

                    for (int i = 0; i < std::min(numActiveProcessors, 2); ++i)
                    {
                        GROUP_AFFINITY GroupAffinity;
                        std::memset(&GroupAffinity, 0, sizeof(GROUP_AFFINITY));
                        GroupAffinity.Group = static_cast<WORD>(procGroupIndex);

                        GroupAffinity.Mask = static_cast<KAFFINITY>(1) << i;

                        status =
                          SetThreadGroupAffinity(GetCurrentThread(), &GroupAffinity, nullptr);
                        if (status == 0)
                        {
                            affinity.isOldDeterminate = false;
                            return;
                        }

                        // Yield so the scheduler actually moves us onto the
                        // newly selected processor before we query the mask.
                        SwitchToThread();

                        DWORD_PTR proc2, sys2;
                        status = GetProcessAffinityMask(GetCurrentProcess(), &proc2, &sys2);
                        if (status == 0)
                        {
                            affinity.isOldDeterminate = false;
                            return;
                        }

                        procCombined &= static_cast<uint64_t>(proc2);
                        sysCombined &= static_cast<uint64_t>(sys2);
                    }

                    if (procCombined != sysCombined)
                        isAffinityFull = false;

                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
                    {
                        if (procCombined & (KAFFINITY(1) << j))
                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
                    }
                }

                // We have to detect the case where the affinity was not set,
                // or is set to all processors, so that we correctly produce a
                // std::nullopt result.
                if (!isAffinityFull)
                {
                    affinity.oldApi = std::move(cpus);
                }
            });

            th.join();
        }
    }

    return affinity;
}
373
374
// Type machinery used to emulate Cache->GroupCount

// Detects at compile time whether this SDK's cache-relationship structure
// exposes GroupCount/GroupMasks members (newer SDKs) or only the single
// GroupMask member (older SDKs).
template<typename T, typename = void>
struct HasGroupCount: std::false_type {};

template<typename T>
struct HasGroupCount<T, std::void_t<decltype(std::declval<T>().Cache.GroupCount)>>: std::true_type {
};

// Collects the allowed CPU indices sharing the cache described by `info`.
// This overload handles structures where the cache may span multiple
// processor groups (GroupCount/GroupMasks members present).
template<typename T, typename Pred, std::enable_if_t<HasGroupCount<T>::value, bool> = true>
std::set<CpuIndex> readCacheMembers(const T* info, Pred&& is_cpu_allowed) {
    std::set<CpuIndex> cpus;
    // On Windows 10 this will read a 0 because GroupCount doesn't exist
    int groupCount = std::max(info->Cache.GroupCount, WORD(1));
    for (WORD procGroup = 0; procGroup < groupCount; ++procGroup)
    {
        for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number)
        {
            // Map (group, bit) to the flat CPU index used everywhere else.
            WORD groupNumber = info->Cache.GroupMasks[procGroup].Group;
            const CpuIndex c = static_cast<CpuIndex>(groupNumber) * WIN_PROCESSOR_GROUP_SIZE
                             + static_cast<CpuIndex>(number);
            if (!(info->Cache.GroupMasks[procGroup].Mask & (1ULL << number)) || !is_cpu_allowed(c))
                continue;
            cpus.insert(c);
        }
    }
    return cpus;
}

// Fallback overload for structures where the cache is described by a single
// GroupMask, i.e. it cannot span processor groups.
template<typename T, typename Pred, std::enable_if_t<!HasGroupCount<T>::value, bool> = true>
std::set<CpuIndex> readCacheMembers(const T* info, Pred&& is_cpu_allowed) {
    std::set<CpuIndex> cpus;
    for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number)
    {
        WORD groupNumber = info->Cache.GroupMask.Group;
        const CpuIndex c = static_cast<CpuIndex>(groupNumber) * WIN_PROCESSOR_GROUP_SIZE
                         + static_cast<CpuIndex>(number);
        if (!(info->Cache.GroupMask.Mask & (1ULL << number)) || !is_cpu_allowed(c))
            continue;
        cpus.insert(c);
    }
    return cpus;
}
417
418
#endif
419
420
#if defined(__linux__) && !defined(__ANDROID__)
421
422
// Returns the set of CPU indices the current process is allowed to run on,
// as reported by sched_getaffinity(). Exits the process on hard failure.
inline std::set<CpuIndex> get_process_affinity() {

    std::set<CpuIndex> cpus;

    // For unsupported systems, or in case of a soft error, we may assume
    // all processors are available for use.
    [[maybe_unused]] auto set_to_all_cpus = [&]() {
        for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
            cpus.insert(c);
    };

    // cpu_set_t by default holds 1024 entries. This may not be enough soon,
    // but there is no easy way to determine how many threads there actually
    // is. In this case we just choose a reasonable upper bound.
    static constexpr CpuIndex MaxNumCpus = 1024 * 64;

    // Use the dynamically-sized CPU set API (CPU_ALLOC/CPU_*_S) so we can
    // query up to MaxNumCpus processors.
    cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
    if (mask == nullptr)
        std::exit(EXIT_FAILURE);

    const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);

    CPU_ZERO_S(masksize, mask);

    const int status = sched_getaffinity(0, masksize, mask);

    if (status != 0)
    {
        CPU_FREE(mask);
        std::exit(EXIT_FAILURE);
    }

    // Flatten the mask into a set of CPU indices.
    for (CpuIndex c = 0; c < MaxNumCpus; ++c)
        if (CPU_ISSET_S(c, masksize, mask))
            cpus.insert(c);

    CPU_FREE(mask);

    return cpus;
}
462
463
#endif
464
465
#if defined(__linux__) && !defined(__ANDROID__)

// Snapshot of the process affinity taken at startup, so that affinity
// changes made later by this process do not alter our own behaviour.
inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();

#elif defined(_WIN64)

// Snapshot of the process affinity taken at startup, so that affinity
// changes made later by this process do not alter our own behaviour.
inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();
// Whether the legacy (single processor group) affinity API appears to have
// been used on this process; if so we must keep it consistent when binding.
inline static const auto STARTUP_USE_OLD_AFFINITY_API =
  STARTUP_PROCESSOR_AFFINITY.likely_used_old_api();

#endif
476
477
// We want to abstract the purpose of storing the numa node index somewhat.
478
// Whoever is using this does not need to know the specifics of the replication
479
// machinery to be able to access NUMA replicated memory.
480
class NumaReplicatedAccessToken {
481
public:
482
NumaReplicatedAccessToken() :
483
n(0) {}
484
485
explicit NumaReplicatedAccessToken(NumaIndex idx) :
486
n(idx) {}
487
488
NumaIndex get_numa_index() const { return n; }
489
490
private:
491
NumaIndex n;
492
};
493
494
// Describes one L3 cache domain: the system NUMA node it belongs to and the
// set of CPUs sharing that L3 cache.
struct L3Domain {
    NumaIndex systemNumaIndex{};
    std::set<CpuIndex> cpus{};
};
498
499
// Use system NUMA nodes
struct SystemNumaPolicy {};
// Use system-reported L3 domains
struct L3DomainsPolicy {};
// Group system-reported L3 domains until they reach bundleSize
struct BundledL3Policy {
    // Target size of a bundled node. NOTE(review): presumably measured in
    // CPUs per bundle — confirm against try_get_l3_aware_config.
    size_t bundleSize;
};

// Selects how NumaConfig::from_system partitions the machine into logical nodes.
using NumaAutoPolicy = std::variant<SystemNumaPolicy, L3DomainsPolicy, BundledL3Policy>;
509
510
// Designed as immutable, because there is no good reason to alter an already
511
// existing config in a way that doesn't require recreating it completely, and
512
// it would be complex and expensive to maintain class invariants.
513
// The CPU (processor) numbers always correspond to the actual numbering used
514
// by the system. The NUMA node numbers MAY NOT correspond to the system's
515
// numbering of the NUMA nodes. In particular, by default, if the processor has
516
// non-uniform cache access within a NUMA node (i.e., a non-unified L3 cache structure),
517
// then L3 domains within a system NUMA node will be used to subdivide it
518
// into multiple logical NUMA nodes in the config. Additionally, empty nodes may
519
// be removed, or the user may create custom nodes.
520
//
521
// As a special case, when performing system-wide replication of read-only data
522
// (i.e., LazyNumaReplicatedSystemWide), the system NUMA node is used, rather than
523
// custom or L3-aware nodes. See that class's get_discriminator() function.
524
//
525
// It is guaranteed that NUMA nodes are NOT empty: every node exposed by NumaConfig
526
// has at least one processor assigned.
527
//
528
// We use startup affinities so as not to modify its own behaviour in time.
529
//
530
// Since Stockfish doesn't support exceptions all places where an exception
531
// should be thrown are replaced by std::exit.
532
class NumaConfig {
533
public:
534
// Default configuration: all CPUs the system reports, assigned to a single
// logical node 0 (via the range helper; presumably [0, numCpus-1] inclusive).
NumaConfig() :
    highestCpuIndex(0),
    customAffinity(false) {
    const auto numCpus = SYSTEM_THREADS_NB;
    add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, numCpus - 1);
}
540
541
// This function gets a NumaConfig based on the system's provided information.
// The available policies are documented above.
// When respectProcessAffinity is true (default), only CPUs present in the
// startup affinity snapshot are considered; opting out marks the resulting
// config as a custom affinity.
static NumaConfig from_system([[maybe_unused]] const NumaAutoPolicy& policy,
                              bool respectProcessAffinity = true) {
    NumaConfig cfg = empty();

#if !((defined(__linux__) && !defined(__ANDROID__)) || defined(_WIN64))
    // Fallback for unsupported systems.
    for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
        cfg.add_cpu_to_node(NumaIndex{0}, c);
#else

#if defined(_WIN64)

    std::optional<std::set<CpuIndex>> allowedCpus;

    if (respectProcessAffinity)
        allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined();

    // The affinity cannot be determined in all cases on Windows,
    // but we at least guarantee that the number of allowed processors
    // is >= number of processors in the affinity mask. In case the user
    // is not satisfied they must set the processor numbers explicitly.
    auto is_cpu_allowed = [&allowedCpus](CpuIndex c) {
        return !allowedCpus.has_value() || allowedCpus->count(c) == 1;
    };

#elif defined(__linux__) && !defined(__ANDROID__)

    std::set<CpuIndex> allowedCpus;

    if (respectProcessAffinity)
        allowedCpus = STARTUP_PROCESSOR_AFFINITY;

    auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) {
        return !respectProcessAffinity || allowedCpus.count(c) == 1;
    };

#endif

    // Try the L3-aware layout first unless the user asked for plain system
    // NUMA nodes; fall back to system NUMA nodes if it is unavailable.
    bool l3Success = false;
    if (!std::holds_alternative<SystemNumaPolicy>(policy))
    {
        size_t l3BundleSize = 0;
        if (const auto* v = std::get_if<BundledL3Policy>(&policy))
        {
            l3BundleSize = v->bundleSize;
        }
        if (auto l3Cfg =
              try_get_l3_aware_config(respectProcessAffinity, l3BundleSize, is_cpu_allowed))
        {
            cfg = std::move(*l3Cfg);
            l3Success = true;
        }
    }
    if (!l3Success)
        cfg = from_system_numa(respectProcessAffinity, is_cpu_allowed);

#if defined(_WIN64)
    // Split the NUMA nodes to be contained within a group if necessary.
    // This is needed between Windows 10 Build 20348 and Windows 11, because
    // the new NUMA allocation behaviour was introduced while there was
    // still no way to set thread affinity spanning multiple processor groups.
    // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
    // We also do this if we need to force the old API for some reason.
    //
    // 2024-08-26: It appears that we need to actually always force this behaviour.
    // While Windows allows this to work now, such assignments have bad interaction
    // with the scheduler - in particular it still prefers scheduling on the thread's
    // "primary" node, even if it means scheduling SMT processors first.
    // See https://github.com/official-stockfish/Stockfish/issues/5551
    // See https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
    //
    // Each process is assigned a primary group at creation, and by default all
    // of its threads' primary group is the same. Each thread's ideal processor
    // is in the thread's primary group, so threads will preferentially be
    // scheduled to processors on their primary group, but they are able to
    // be scheduled to processors on any other group.
    //
    // used to be guarded by if (STARTUP_USE_OLD_AFFINITY_API)
    {
        NumaConfig splitCfg = empty();

        NumaIndex splitNodeIndex = 0;
        for (const auto& cpus : cfg.nodes)
        {
            if (cpus.empty())
                continue;

            // cpus is an ordered set, so iteration walks processor groups in
            // ascending order; start a fresh logical node at each boundary.
            size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE;
            for (CpuIndex c : cpus)
            {
                const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;
                if (procGroupIndex != lastProcGroupIndex)
                {
                    splitNodeIndex += 1;
                    lastProcGroupIndex = procGroupIndex;
                }
                splitCfg.add_cpu_to_node(splitNodeIndex, c);
            }
            splitNodeIndex += 1;
        }

        cfg = std::move(splitCfg);
    }
#endif

#endif

    // We have to ensure no empty NUMA nodes persist.
    cfg.remove_empty_numa_nodes();

    // If the user explicitly opts out from respecting the current process affinity
    // then it may be inconsistent with the current affinity (obviously), so we
    // consider it custom.
    if (!respectProcessAffinity)
        cfg.customAffinity = true;

    return cfg;
}
661
662
// ':'-separated numa nodes
663
// ','-separated cpu indices
664
// supports "first-last" range syntax for cpu indices
665
// For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191"
666
static NumaConfig from_string(const std::string& s) {
667
NumaConfig cfg = empty();
668
669
NumaIndex n = 0;
670
for (auto&& nodeStr : split(s, ":"))
671
{
672
auto indices = indices_from_shortened_string(std::string(nodeStr));
673
if (!indices.empty())
674
{
675
for (auto idx : indices)
676
{
677
if (!cfg.add_cpu_to_node(n, CpuIndex(idx)))
678
std::exit(EXIT_FAILURE);
679
}
680
681
n += 1;
682
}
683
}
684
685
cfg.customAffinity = true;
686
687
return cfg;
688
}
689
690
// Movable but not copyable: the config is designed as immutable, so there
// is no reason to duplicate one, and copies would be wasteful.
NumaConfig(const NumaConfig&) = delete;
NumaConfig(NumaConfig&&) = default;
NumaConfig& operator=(const NumaConfig&) = delete;
NumaConfig& operator=(NumaConfig&&) = default;
694
695
// True if cpu n is assigned to some node in this config.
bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.count(n) == 1; }

// Number of logical NUMA nodes in this config.
NumaIndex num_numa_nodes() const { return nodes.size(); }

// Number of CPUs assigned to node n. Requires n < num_numa_nodes().
CpuIndex num_cpus_in_numa_node(NumaIndex n) const {
    assert(n < nodes.size());
    return nodes[n].size();
}

// Total number of CPUs assigned across all nodes.
CpuIndex num_cpus() const { return nodeByCpu.size(); }

// Replicated memory is only worthwhile when threads may be bound, i.e. with
// a custom affinity or more than one logical node.
bool requires_memory_replication() const { return customAffinity || nodes.size() > 1; }
707
708
// Produces the inverse of from_string: nodes separated by ':', CPUs by ',',
// with runs of consecutive CPUs compressed into "first-last" ranges.
std::string to_string() const {
    std::string str;

    bool isFirstNode = true;
    for (auto&& cpus : nodes)
    {
        if (!isFirstNode)
            str += ":";

        bool isFirstSet = true;
        auto rangeStart = cpus.begin();
        for (auto it = cpus.begin(); it != cpus.end(); ++it)
        {
            auto next = std::next(it);
            // A run ends here if this is the last CPU in the (ordered) set
            // or the next CPU is not consecutive.
            if (next == cpus.end() || *next != *it + 1)
            {
                // cpus[i] is at the end of the range (may be of size 1)
                if (!isFirstSet)
                    str += ",";

                const CpuIndex last = *it;

                if (it != rangeStart)
                {
                    // Multi-element run: emit "first-last".
                    const CpuIndex first = *rangeStart;

                    str += std::to_string(first);
                    str += "-";
                    str += std::to_string(last);
                }
                else
                    str += std::to_string(last);

                rangeStart = next;
                isFirstSet = false;
            }
        }

        isFirstNode = false;
    }

    return str;
}
751
752
// Heuristic: should numThreads threads be distributed among nodes and bound?
bool suggests_binding_threads(CpuIndex numThreads) const {
    // If we can reasonably determine that the threads cannot be contained
    // by the OS within the first NUMA node then we advise distributing
    // and binding threads. When the threads are not bound we can only use
    // NUMA memory replicated objects from the first node, so when the OS
    // has to schedule on other nodes we lose performance. We also suggest
    // binding if there's enough threads to distribute among nodes with minimal
    // disparity. We try to ignore small nodes, in particular the empty ones.

    // If the affinity set by the user does not match the affinity given by
    // the OS then binding is necessary to ensure the threads are running on
    // correct processors.
    if (customAffinity)
        return true;

    // We obviously cannot distribute a single thread, so a single thread
    // should never be bound.
    if (numThreads <= 1)
        return false;

    size_t largestNodeSize = 0;
    for (auto&& cpus : nodes)
        if (cpus.size() > largestNodeSize)
            largestNodeSize = cpus.size();

    // A node counts as "small" when it has at most 60% of the CPUs of the
    // largest node.
    auto is_node_small = [largestNodeSize](const std::set<CpuIndex>& node) {
        static constexpr double SmallNodeThreshold = 0.6;
        return static_cast<double>(node.size()) / static_cast<double>(largestNodeSize)
            <= SmallNodeThreshold;
    };

    size_t numNotSmallNodes = 0;
    for (auto&& cpus : nodes)
        if (!is_node_small(cpus))
            numNotSmallNodes += 1;

    // Bind only on a genuinely multi-node config, and only when the thread
    // count either overflows half the largest node or comfortably covers
    // the non-small nodes.
    return (numThreads > largestNodeSize / 2 || numThreads >= numNotSmallNodes * 4)
        && nodes.size() > 1;
}
791
792
// Returns, for each of numThreads threads, the node index it should be bound
// to, greedily assigning each thread to the node with the lowest relative
// fill (assigned threads / node capacity).
std::vector<NumaIndex> distribute_threads_among_numa_nodes(CpuIndex numThreads) const {
    std::vector<NumaIndex> ns;

    if (nodes.size() == 1)
    {
        // Special case for when there's no NUMA nodes. This doesn't buy us
        // much, but let's keep the default path simple.
        ns.resize(numThreads, NumaIndex{0});
    }
    else
    {
        // occupation[n] counts threads already assigned to node n.
        std::vector<size_t> occupation(nodes.size(), 0);
        for (CpuIndex c = 0; c < numThreads; ++c)
        {
            NumaIndex bestNode{0};
            float bestNodeFill = std::numeric_limits<float>::max();
            for (NumaIndex n = 0; n < nodes.size(); ++n)
            {
                // Fill ratio if this thread were placed on node n.
                float fill =
                  static_cast<float>(occupation[n] + 1) / static_cast<float>(nodes[n].size());
                // NOTE: Do we want to perhaps fill the first available node
                // up to 50% first before considering other nodes?
                // Probably not, because it would interfere with running
                // multiple instances. We basically shouldn't favor any
                // particular node.
                if (fill < bestNodeFill)
                {
                    bestNode = n;
                    bestNodeFill = fill;
                }
            }
            ns.emplace_back(bestNode);
            occupation[bestNode] += 1;
        }
    }

    return ns;
}
830
831
// Restricts the calling thread to the CPUs of logical node n and returns an
// access token for that node. Exits the process on an invalid node index or
// when the OS rejects the affinity change.
NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const {
    if (n >= nodes.size() || nodes[n].size() == 0)
        std::exit(EXIT_FAILURE);

#if defined(__linux__) && !defined(__ANDROID__)

    // Build a dynamically-sized mask large enough for the highest CPU index
    // ever assigned in this config.
    cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1);
    if (mask == nullptr)
        std::exit(EXIT_FAILURE);

    const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1);

    CPU_ZERO_S(masksize, mask);

    for (CpuIndex c : nodes[n])
        CPU_SET_S(c, masksize, mask);

    const int status = sched_setaffinity(0, masksize, mask);

    CPU_FREE(mask);

    if (status != 0)
        std::exit(EXIT_FAILURE);

    // We yield this thread just to be sure it gets rescheduled.
    // This is defensive, allowed because this code is not performance critical.
    sched_yield();

#elif defined(_WIN64)

    // Requires Windows 11. No good way to set thread affinity spanning
    // processor groups before that.
    HMODULE k32 = GetModuleHandle(TEXT("Kernel32.dll"));
    auto SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
      (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));

    // We ALWAYS set affinity with the new API if available, because
    // there's no downsides, and we forcibly keep it consistent with
    // the old API should we need to use it. I.e. we always keep this
    // as a superset of what we set with SetThreadGroupAffinity.
    if (SetThreadSelectedCpuSetMasks_f != nullptr)
    {
        // Only available on Windows 11 and Windows Server 2022 onwards
        const USHORT numProcGroups = USHORT(
          ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE);
        auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(numProcGroups);
        std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups);
        for (WORD i = 0; i < numProcGroups; ++i)
            groupAffinities[i].Group = i;

        // Translate flat CPU indices back to (group, bit) positions.
        for (CpuIndex c : nodes[n])
        {
            const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;
            const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
            groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup;
        }

        HANDLE hThread = GetCurrentThread();

        const BOOL status =
          SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups);
        if (status == 0)
            std::exit(EXIT_FAILURE);

        // We yield this thread just to be sure it gets rescheduled.
        // This is defensive, allowed because this code is not performance critical.
        SwitchToThread();
    }

    // Sometimes we need to force the old API, but do not use it unless necessary.
    if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API)
    {
        // On earlier windows version (since windows 7) we cannot run a single thread
        // on multiple processor groups, so we need to restrict the group.
        // We assume the group of the first processor listed for this node.
        // Processors from outside this group will not be assigned for this thread.
        // Normally this won't be an issue because windows used to assign NUMA nodes
        // such that they cannot span processor groups. However, since Windows 10
        // Build 20348 the behaviour changed, so there's a small window of versions
        // between this and Windows 11 that might exhibit problems with not all
        // processors being utilized.
        //
        // We handle this in NumaConfig::from_system by manually splitting the
        // nodes when we detect that there is no function to set affinity spanning
        // processor nodes. This is required because otherwise our thread distribution
        // code may produce suboptimal results.
        //
        // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
        GROUP_AFFINITY affinity;
        std::memset(&affinity, 0, sizeof(GROUP_AFFINITY));
        // We use an ordered set to be sure to get the smallest cpu number here.
        const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE;
        affinity.Group = static_cast<WORD>(forcedProcGroupIndex);
        for (CpuIndex c : nodes[n])
        {
            const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;
            const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
            // We skip processors that are not in the same processor group.
            // If everything was set up correctly this will never be an issue,
            // but we have to account for bad NUMA node specification.
            if (procGroupIndex != forcedProcGroupIndex)
                continue;

            affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup;
        }

        HANDLE hThread = GetCurrentThread();

        const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr);
        if (status == 0)
            std::exit(EXIT_FAILURE);

        // We yield this thread just to be sure it gets rescheduled. This is
        // defensive, allowed because this code is not performance critical.
        SwitchToThread();
    }

#endif

    return NumaReplicatedAccessToken(n);
}
952
953
template<typename FuncT>
954
void execute_on_numa_node(NumaIndex n, FuncT&& f) const {
955
std::thread th([this, &f, n]() {
956
bind_current_thread_to_numa_node(n);
957
std::forward<FuncT>(f)();
958
});
959
960
th.join();
961
}
962
963
    // Per-node CPU assignment; nodes[n] is the set of CPU indices on NUMA node n.
    std::vector<std::set<CpuIndex>> nodes;
    // Reverse lookup: maps a CPU index to the NUMA node it is assigned to.
    std::map<CpuIndex, NumaIndex> nodeByCpu;

   private:
    // Largest CPU index present in any node; maintained by add_cpu_to_node
    // and add_cpu_range_to_node.
    CpuIndex highestCpuIndex;

    // NOTE(review): set outside this view; presumably true when the node layout
    // was supplied by the user rather than discovered from the system — confirm.
    bool customAffinity;
    // Factory for a configuration with no nodes and no CPUs, used as the
    // starting point before CPUs are added one by one.
    static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); }

    // Tag type selecting the private "construct empty" constructor below.
    struct EmptyNodeTag {};

    NumaConfig(EmptyNodeTag) :
        highestCpuIndex(0),
        customAffinity(false) {}
void remove_empty_numa_nodes() {
980
std::vector<std::set<CpuIndex>> newNodes;
981
for (auto&& cpus : nodes)
982
if (!cpus.empty())
983
newNodes.emplace_back(std::move(cpus));
984
nodes = std::move(newNodes);
985
}
986
987
// Returns true if successful
988
// Returns false if failed, i.e. when the cpu is already present
989
// strong guarantee, the structure remains unmodified
990
bool add_cpu_to_node(NumaIndex n, CpuIndex c) {
991
if (is_cpu_assigned(c))
992
return false;
993
994
while (nodes.size() <= n)
995
nodes.emplace_back();
996
997
nodes[n].insert(c);
998
nodeByCpu[c] = n;
999
1000
if (c > highestCpuIndex)
1001
highestCpuIndex = c;
1002
1003
return true;
1004
}
1005
1006
// Returns true if successful
1007
// Returns false if failed, i.e. when any of the cpus is already present
1008
// strong guarantee, the structure remains unmodified
1009
bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) {
1010
for (CpuIndex c = cfirst; c <= clast; ++c)
1011
if (is_cpu_assigned(c))
1012
return false;
1013
1014
while (nodes.size() <= n)
1015
nodes.emplace_back();
1016
1017
for (CpuIndex c = cfirst; c <= clast; ++c)
1018
{
1019
nodes[n].insert(c);
1020
nodeByCpu[c] = n;
1021
}
1022
1023
if (clast > highestCpuIndex)
1024
highestCpuIndex = clast;
1025
1026
return true;
1027
}
1028
1029
static std::vector<size_t> indices_from_shortened_string(const std::string& s) {
1030
std::vector<size_t> indices;
1031
1032
if (s.empty())
1033
return indices;
1034
1035
for (const auto& ss : split(s, ","))
1036
{
1037
if (ss.empty())
1038
continue;
1039
1040
auto parts = split(ss, "-");
1041
if (parts.size() == 1)
1042
{
1043
const CpuIndex c = CpuIndex{str_to_size_t(std::string(parts[0]))};
1044
indices.emplace_back(c);
1045
}
1046
else if (parts.size() == 2)
1047
{
1048
const CpuIndex cfirst = CpuIndex{str_to_size_t(std::string(parts[0]))};
1049
const CpuIndex clast = CpuIndex{str_to_size_t(std::string(parts[1]))};
1050
for (size_t c = cfirst; c <= clast; ++c)
1051
{
1052
indices.emplace_back(c);
1053
}
1054
}
1055
}
1056
1057
return indices;
1058
}
1059
1060
    // This function queries the system for the mapping of processors to NUMA nodes.
    // On Linux we read from standardized kernel sysfs, with a fallback to single NUMA
    // node. On Windows we utilize GetNumaProcessorNodeEx, which has its quirks, see
    // comment for Windows implementation of get_process_affinity.
    //
    // Note: respectProcessAffinity is not consulted in this body; filtering is
    // done solely through the is_cpu_allowed predicate supplied by the caller.
    template<typename Pred>
    static NumaConfig from_system_numa([[maybe_unused]] bool respectProcessAffinity,
                                       [[maybe_unused]] Pred&& is_cpu_allowed) {
        NumaConfig cfg = empty();

#if defined(__linux__) && !defined(__ANDROID__)

        // On Linux things are straightforward, since there's no processor groups and
        // any thread can be scheduled on all processors.
        // We try to gather this information from the sysfs first
        // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node

        // On any sysfs read failure we discard partial results and later place
        // every allowed CPU on a single node 0 (see the useFallback branch).
        bool useFallback = false;
        auto fallback = [&]() {
            useFallback = true;
            cfg = empty();
        };

        // /sys/devices/system/node/online contains information about active NUMA nodes
        auto nodeIdsStr = read_file_to_string("/sys/devices/system/node/online");
        if (!nodeIdsStr.has_value() || nodeIdsStr->empty())
        {
            fallback();
        }
        else
        {
            remove_whitespace(*nodeIdsStr);
            for (size_t n : indices_from_shortened_string(*nodeIdsStr))
            {
                // /sys/devices/system/node/node.../cpulist
                std::string path =
                    std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist";
                auto cpuIdsStr = read_file_to_string(path);
                // Now, we only bail if the file does not exist. Some nodes may be
                // empty, that's fine. An empty node still has a file that appears
                // to have some whitespace, so we need to handle that.
                if (!cpuIdsStr.has_value())
                {
                    fallback();
                    break;
                }
                else
                {
                    remove_whitespace(*cpuIdsStr);
                    for (size_t c : indices_from_shortened_string(*cpuIdsStr))
                    {
                        if (is_cpu_allowed(c))
                            cfg.add_cpu_to_node(n, c);
                    }
                }
            }
        }

        if (useFallback)
        {
            // Single-node fallback: all allowed CPUs go to NUMA node 0.
            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
                if (is_cpu_allowed(c))
                    cfg.add_cpu_to_node(NumaIndex{0}, c);
        }

#elif defined(_WIN64)

        // Enumerate every (group, number) slot and ask the OS for its node.
        WORD numProcGroups = GetActiveProcessorGroupCount();
        for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup)
        {
            for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number)
            {
                PROCESSOR_NUMBER procnum;
                procnum.Group = procGroup;
                procnum.Number = number;
                procnum.Reserved = 0;
                USHORT nodeNumber;

                const BOOL status = GetNumaProcessorNodeEx(&procnum, &nodeNumber);
                // Flatten (group, number) into a single global CPU index.
                const CpuIndex c = static_cast<CpuIndex>(procGroup) * WIN_PROCESSOR_GROUP_SIZE
                                 + static_cast<CpuIndex>(number);
                // Per the GetNumaProcessorNodeEx API, 0xFFFF marks a processor
                // slot with no associated NUMA node, so skip those.
                if (status != 0 && nodeNumber != std::numeric_limits<USHORT>::max()
                    && is_cpu_allowed(c))
                {
                    cfg.add_cpu_to_node(nodeNumber, c);
                }
            }
        }

#else

        abort(); // should not reach here

#endif

        return cfg;
    }
    // Attempts to build a NumaConfig in which each L3 cache domain (possibly
    // bundled with neighbours, see from_l3_info) acts as its own virtual NUMA
    // node. Returns std::nullopt when no L3 topology information could be
    // obtained from the system.
    template<typename Pred>
    static std::optional<NumaConfig> try_get_l3_aware_config(
        bool respectProcessAffinity, size_t bundleSize, [[maybe_unused]] Pred&& is_cpu_allowed) {
        // Get the normal system configuration so we know to which NUMA node
        // each L3 domain belongs.
        NumaConfig systemConfig =
            NumaConfig::from_system(SystemNumaPolicy{}, respectProcessAffinity);
        std::vector<L3Domain> l3Domains;

#if defined(__linux__) && !defined(__ANDROID__)

        // Walk the CPUs in ascending order; each shared_cpu_list read covers a
        // whole L3 domain at once, so mark all its members as seen.
        std::set<CpuIndex> seenCpus;
        auto nextUnseenCpu = [&seenCpus]() {
            for (CpuIndex i = 0;; ++i)
                if (!seenCpus.count(i))
                    return i;
        };

        while (true)
        {
            CpuIndex next = nextUnseenCpu();
            auto siblingsStr =
                read_file_to_string("/sys/devices/system/cpu/cpu" + std::to_string(next)
                                    + "/cache/index3/shared_cpu_list");

            if (!siblingsStr.has_value() || siblingsStr->empty())
            {
                break; // we have read all available CPUs
            }

            L3Domain domain;
            for (size_t c : indices_from_shortened_string(*siblingsStr))
            {
                if (is_cpu_allowed(c))
                {
                    domain.systemNumaIndex = systemConfig.nodeByCpu.at(c);
                    domain.cpus.insert(c);
                }
                seenCpus.insert(c);
            }
            if (!domain.cpus.empty())
            {
                l3Domains.emplace_back(std::move(domain));
            }
        }

#elif defined(_WIN64)

        // Two-call idiom: first query the required buffer size, then fetch the
        // variable-length RelationCache records.
        DWORD bufSize = 0;
        GetLogicalProcessorInformationEx(RelationCache, nullptr, &bufSize);
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
            return std::nullopt;

        std::vector<char> buffer(bufSize);
        auto info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
        if (!GetLogicalProcessorInformationEx(RelationCache, info, &bufSize))
            return std::nullopt;

        while (reinterpret_cast<char*>(info) < buffer.data() + bufSize)
        {
            info = std::launder(info);
            if (info->Relationship == RelationCache && info->Cache.Level == 3)
            {
                L3Domain domain{};
                domain.cpus = readCacheMembers(info, is_cpu_allowed);
                if (!domain.cpus.empty())
                {
                    // All members of an L3 domain share one system NUMA node,
                    // so any representative CPU determines it.
                    domain.systemNumaIndex = systemConfig.nodeByCpu.at(*domain.cpus.begin());
                    l3Domains.push_back(std::move(domain));
                }
            }
            // Variable length data structure, advance to next
            info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
                reinterpret_cast<char*>(info) + info->Size);
        }
#endif

        if (!l3Domains.empty())
            return {NumaConfig::from_l3_info(std::move(l3Domains), bundleSize)};

        return std::nullopt;
    }
    // Builds a NumaConfig in which every (possibly merged) L3 domain becomes
    // its own virtual NUMA node. Domains belonging to the same system NUMA
    // node are greedily merged pairwise as long as the combined CPU count does
    // not exceed bundleSize; domains from different system nodes never merge.
    static NumaConfig from_l3_info(std::vector<L3Domain>&& domains, size_t bundleSize) {
        assert(!domains.empty());

        // Group the domains by the system NUMA node they belong to.
        std::map<NumaIndex, std::vector<L3Domain>> list;
        for (auto& d : domains)
            list[d.systemNumaIndex].emplace_back(std::move(d));

        NumaConfig cfg = empty();
        NumaIndex n = 0;
        for (auto& [_, ds] : list)
        {
            bool changed;
            // Scan through pairs and merge them. With roughly equal L3 sizes, should give
            // a decent distribution.
            do
            {
                changed = false;
                for (size_t j = 0; j + 1 < ds.size(); ++j)
                {
                    if (ds[j].cpus.size() + ds[j + 1].cpus.size() <= bundleSize)
                    {
                        changed = true;
                        // The domains' CPU sets are disjoint, so merge moves
                        // every element of ds[j + 1] into ds[j].
                        ds[j].cpus.merge(ds[j + 1].cpus);
                        ds.erase(ds.begin() + j + 1);
                    }
                }
                // ds.size() has decreased if changed is true, so this loop will terminate
            } while (changed);
            // Each surviving domain gets the next virtual node index.
            for (const L3Domain& d : ds)
            {
                const NumaIndex dn = n++;
                for (CpuIndex cpu : d.cpus)
                {
                    cfg.add_cpu_to_node(dn, cpu);
                }
            }
        }
        return cfg;
    }
};
1281
1282
class NumaReplicationContext;
1283
1284
// Instances of this class are tracked by the NumaReplicationContext instance.
// NumaReplicationContext informs all tracked instances when NUMA configuration changes.
class NumaReplicatedBase {
   public:
    NumaReplicatedBase(NumaReplicationContext& ctx);

    // Copying is forbidden: the context tracks objects by address.
    NumaReplicatedBase(const NumaReplicatedBase&) = delete;
    NumaReplicatedBase(NumaReplicatedBase&& other) noexcept;

    NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete;
    NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept;

    // Invoked by the context from set_numa_config(); derived classes
    // re-replicate their data for the new node layout.
    virtual void on_numa_config_changed() = 0;
    virtual ~NumaReplicatedBase();

    const NumaConfig& get_numa_config() const;

   private:
    // Owning context; becomes nullptr after this object is moved-from.
    NumaReplicationContext* context;
};
// We force boxing with a unique_ptr. If this becomes an issue due to added
1306
// indirection we may need to add an option for a custom boxing type. When the
1307
// NUMA config changes the value stored at the index 0 is replicated to other nodes.
1308
template<typename T>
1309
class NumaReplicated: public NumaReplicatedBase {
1310
public:
1311
using ReplicatorFuncType = std::function<T(const T&)>;
1312
1313
NumaReplicated(NumaReplicationContext& ctx) :
1314
NumaReplicatedBase(ctx) {
1315
replicate_from(T{});
1316
}
1317
1318
NumaReplicated(NumaReplicationContext& ctx, T&& source) :
1319
NumaReplicatedBase(ctx) {
1320
replicate_from(std::move(source));
1321
}
1322
1323
NumaReplicated(const NumaReplicated&) = delete;
1324
NumaReplicated(NumaReplicated&& other) noexcept :
1325
NumaReplicatedBase(std::move(other)),
1326
instances(std::exchange(other.instances, {})) {}
1327
1328
NumaReplicated& operator=(const NumaReplicated&) = delete;
1329
NumaReplicated& operator=(NumaReplicated&& other) noexcept {
1330
NumaReplicatedBase::operator=(*this, std::move(other));
1331
instances = std::exchange(other.instances, {});
1332
1333
return *this;
1334
}
1335
1336
NumaReplicated& operator=(T&& source) {
1337
replicate_from(std::move(source));
1338
1339
return *this;
1340
}
1341
1342
~NumaReplicated() override = default;
1343
1344
const T& operator[](NumaReplicatedAccessToken token) const {
1345
assert(token.get_numa_index() < instances.size());
1346
return *(instances[token.get_numa_index()]);
1347
}
1348
1349
const T& operator*() const { return *(instances[0]); }
1350
1351
const T* operator->() const { return instances[0].get(); }
1352
1353
template<typename FuncT>
1354
void modify_and_replicate(FuncT&& f) {
1355
auto source = std::move(instances[0]);
1356
std::forward<FuncT>(f)(*source);
1357
replicate_from(std::move(*source));
1358
}
1359
1360
void on_numa_config_changed() override {
1361
// Use the first one as the source. It doesn't matter which one we use,
1362
// because they all must be identical, but the first one is guaranteed to exist.
1363
auto source = std::move(instances[0]);
1364
replicate_from(std::move(*source));
1365
}
1366
1367
private:
1368
std::vector<std::unique_ptr<T>> instances;
1369
1370
void replicate_from(T&& source) {
1371
instances.clear();
1372
1373
const NumaConfig& cfg = get_numa_config();
1374
if (cfg.requires_memory_replication())
1375
{
1376
for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n)
1377
{
1378
cfg.execute_on_numa_node(
1379
n, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
1380
}
1381
}
1382
else
1383
{
1384
assert(cfg.num_numa_nodes() == 1);
1385
// We take advantage of the fact that replication is not required
1386
// and reuse the source value, avoiding one copy operation.
1387
instances.emplace_back(std::make_unique<T>(std::move(source)));
1388
}
1389
}
1390
};
1391
1392
// We force boxing with a unique_ptr. If this becomes an issue due to added
1393
// indirection we may need to add an option for a custom boxing type.
1394
template<typename T>
1395
class LazyNumaReplicated: public NumaReplicatedBase {
1396
public:
1397
using ReplicatorFuncType = std::function<T(const T&)>;
1398
1399
LazyNumaReplicated(NumaReplicationContext& ctx) :
1400
NumaReplicatedBase(ctx) {
1401
prepare_replicate_from(T{});
1402
}
1403
1404
LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) :
1405
NumaReplicatedBase(ctx) {
1406
prepare_replicate_from(std::move(source));
1407
}
1408
1409
LazyNumaReplicated(const LazyNumaReplicated&) = delete;
1410
LazyNumaReplicated(LazyNumaReplicated&& other) noexcept :
1411
NumaReplicatedBase(std::move(other)),
1412
instances(std::exchange(other.instances, {})) {}
1413
1414
LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete;
1415
LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept {
1416
NumaReplicatedBase::operator=(*this, std::move(other));
1417
instances = std::exchange(other.instances, {});
1418
1419
return *this;
1420
}
1421
1422
LazyNumaReplicated& operator=(T&& source) {
1423
prepare_replicate_from(std::move(source));
1424
1425
return *this;
1426
}
1427
1428
~LazyNumaReplicated() override = default;
1429
1430
const T& operator[](NumaReplicatedAccessToken token) const {
1431
assert(token.get_numa_index() < instances.size());
1432
ensure_present(token.get_numa_index());
1433
return *(instances[token.get_numa_index()]);
1434
}
1435
1436
const T& operator*() const { return *(instances[0]); }
1437
1438
const T* operator->() const { return instances[0].get(); }
1439
1440
template<typename FuncT>
1441
void modify_and_replicate(FuncT&& f) {
1442
auto source = std::move(instances[0]);
1443
std::forward<FuncT>(f)(*source);
1444
prepare_replicate_from(std::move(*source));
1445
}
1446
1447
void on_numa_config_changed() override {
1448
// Use the first one as the source. It doesn't matter which one we use,
1449
// because they all must be identical, but the first one is guaranteed to exist.
1450
auto source = std::move(instances[0]);
1451
prepare_replicate_from(std::move(*source));
1452
}
1453
1454
private:
1455
mutable std::vector<std::unique_ptr<T>> instances;
1456
mutable std::mutex mutex;
1457
1458
void ensure_present(NumaIndex idx) const {
1459
assert(idx < instances.size());
1460
1461
if (instances[idx] != nullptr)
1462
return;
1463
1464
assert(idx != 0);
1465
1466
std::unique_lock<std::mutex> lock(mutex);
1467
// Check again for races.
1468
if (instances[idx] != nullptr)
1469
return;
1470
1471
const NumaConfig& cfg = get_numa_config();
1472
cfg.execute_on_numa_node(
1473
idx, [this, idx]() { instances[idx] = std::make_unique<T>(*instances[0]); });
1474
}
1475
1476
void prepare_replicate_from(T&& source) {
1477
instances.clear();
1478
1479
const NumaConfig& cfg = get_numa_config();
1480
if (cfg.requires_memory_replication())
1481
{
1482
assert(cfg.num_numa_nodes() > 0);
1483
1484
// We just need to make sure the first instance is there.
1485
// Note that we cannot move here as we need to reallocate the data
1486
// on the correct NUMA node.
1487
cfg.execute_on_numa_node(
1488
0, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
1489
1490
// Prepare others for lazy init.
1491
instances.resize(cfg.num_numa_nodes());
1492
}
1493
else
1494
{
1495
assert(cfg.num_numa_nodes() == 1);
1496
// We take advantage of the fact that replication is not required
1497
// and reuse the source value, avoiding one copy operation.
1498
instances.emplace_back(std::make_unique<T>(std::move(source)));
1499
}
1500
}
1501
};
1502
1503
// Utilizes shared memory.
1504
template<typename T>
1505
class LazyNumaReplicatedSystemWide: public NumaReplicatedBase {
1506
public:
1507
using ReplicatorFuncType = std::function<T(const T&)>;
1508
1509
LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx) :
1510
NumaReplicatedBase(ctx) {
1511
prepare_replicate_from(std::make_unique<T>());
1512
}
1513
1514
LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx, std::unique_ptr<T>&& source) :
1515
NumaReplicatedBase(ctx) {
1516
prepare_replicate_from(std::move(source));
1517
}
1518
1519
LazyNumaReplicatedSystemWide(const LazyNumaReplicatedSystemWide&) = delete;
1520
LazyNumaReplicatedSystemWide(LazyNumaReplicatedSystemWide&& other) noexcept :
1521
NumaReplicatedBase(std::move(other)),
1522
instances(std::exchange(other.instances, {})) {}
1523
1524
LazyNumaReplicatedSystemWide& operator=(const LazyNumaReplicatedSystemWide&) = delete;
1525
LazyNumaReplicatedSystemWide& operator=(LazyNumaReplicatedSystemWide&& other) noexcept {
1526
NumaReplicatedBase::operator=(*this, std::move(other));
1527
instances = std::exchange(other.instances, {});
1528
1529
return *this;
1530
}
1531
1532
LazyNumaReplicatedSystemWide& operator=(std::unique_ptr<T>&& source) {
1533
prepare_replicate_from(std::move(source));
1534
1535
return *this;
1536
}
1537
1538
~LazyNumaReplicatedSystemWide() override = default;
1539
1540
const T& operator[](NumaReplicatedAccessToken token) const {
1541
assert(token.get_numa_index() < instances.size());
1542
ensure_present(token.get_numa_index());
1543
return *(instances[token.get_numa_index()]);
1544
}
1545
1546
const T& operator*() const { return *(instances[0]); }
1547
1548
const T* operator->() const { return &*instances[0]; }
1549
1550
std::vector<std::pair<SystemWideSharedConstantAllocationStatus, std::optional<std::string>>>
1551
get_status_and_errors() const {
1552
std::vector<std::pair<SystemWideSharedConstantAllocationStatus, std::optional<std::string>>>
1553
status;
1554
status.reserve(instances.size());
1555
1556
for (const auto& instance : instances)
1557
{
1558
status.emplace_back(instance.get_status(), instance.get_error_message());
1559
}
1560
1561
return status;
1562
}
1563
1564
template<typename FuncT>
1565
void modify_and_replicate(FuncT&& f) {
1566
auto source = std::make_unique<T>(*instances[0]);
1567
std::forward<FuncT>(f)(*source);
1568
prepare_replicate_from(std::move(source));
1569
}
1570
1571
void on_numa_config_changed() override {
1572
// Use the first one as the source. It doesn't matter which one we use,
1573
// because they all must be identical, but the first one is guaranteed to exist.
1574
auto source = std::make_unique<T>(*instances[0]);
1575
prepare_replicate_from(std::move(source));
1576
}
1577
1578
private:
1579
mutable std::vector<SystemWideSharedConstant<T>> instances;
1580
mutable std::mutex mutex;
1581
1582
std::size_t get_discriminator(NumaIndex idx) const {
1583
const NumaConfig& cfg = get_numa_config();
1584
const NumaConfig& cfg_sys = NumaConfig::from_system(SystemNumaPolicy{}, false);
1585
// as a discriminator, locate the hardware/system numadomain this cpuindex belongs to
1586
CpuIndex cpu = *cfg.nodes[idx].begin(); // get a CpuIndex from NumaIndex
1587
NumaIndex sys_idx = cfg_sys.is_cpu_assigned(cpu) ? cfg_sys.nodeByCpu.at(cpu) : 0;
1588
std::string s = cfg_sys.to_string() + "$" + std::to_string(sys_idx);
1589
return static_cast<std::size_t>(hash_string(s));
1590
}
1591
1592
void ensure_present(NumaIndex idx) const {
1593
assert(idx < instances.size());
1594
1595
if (instances[idx] != nullptr)
1596
return;
1597
1598
assert(idx != 0);
1599
1600
std::unique_lock<std::mutex> lock(mutex);
1601
// Check again for races.
1602
if (instances[idx] != nullptr)
1603
return;
1604
1605
const NumaConfig& cfg = get_numa_config();
1606
cfg.execute_on_numa_node(idx, [this, idx]() {
1607
instances[idx] = SystemWideSharedConstant<T>(*instances[0], get_discriminator(idx));
1608
});
1609
}
1610
1611
void prepare_replicate_from(std::unique_ptr<T>&& source) {
1612
instances.clear();
1613
1614
const NumaConfig& cfg = get_numa_config();
1615
// We just need to make sure the first instance is there.
1616
// Note that we cannot move here as we need to reallocate the data
1617
// on the correct NUMA node.
1618
// Even in the case of a single NUMA node we have to copy since it's shared memory.
1619
if (cfg.requires_memory_replication())
1620
{
1621
assert(cfg.num_numa_nodes() > 0);
1622
1623
cfg.execute_on_numa_node(0, [this, &source]() {
1624
instances.emplace_back(SystemWideSharedConstant<T>(*source, get_discriminator(0)));
1625
});
1626
1627
// Prepare others for lazy init.
1628
instances.resize(cfg.num_numa_nodes());
1629
}
1630
else
1631
{
1632
assert(cfg.num_numa_nodes() == 1);
1633
instances.emplace_back(SystemWideSharedConstant<T>(*source, get_discriminator(0)));
1634
}
1635
}
1636
};
1637
1638
class NumaReplicationContext {
1639
public:
1640
NumaReplicationContext(NumaConfig&& cfg) :
1641
config(std::move(cfg)) {}
1642
1643
NumaReplicationContext(const NumaReplicationContext&) = delete;
1644
NumaReplicationContext(NumaReplicationContext&&) = delete;
1645
1646
NumaReplicationContext& operator=(const NumaReplicationContext&) = delete;
1647
NumaReplicationContext& operator=(NumaReplicationContext&&) = delete;
1648
1649
~NumaReplicationContext() {
1650
// The context must outlive replicated objects
1651
if (!trackedReplicatedObjects.empty())
1652
std::exit(EXIT_FAILURE);
1653
}
1654
1655
void attach(NumaReplicatedBase* obj) {
1656
assert(trackedReplicatedObjects.count(obj) == 0);
1657
trackedReplicatedObjects.insert(obj);
1658
}
1659
1660
void detach(NumaReplicatedBase* obj) {
1661
assert(trackedReplicatedObjects.count(obj) == 1);
1662
trackedReplicatedObjects.erase(obj);
1663
}
1664
1665
// oldObj may be invalid at this point
1666
void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) {
1667
assert(trackedReplicatedObjects.count(oldObj) == 1);
1668
assert(trackedReplicatedObjects.count(newObj) == 0);
1669
trackedReplicatedObjects.erase(oldObj);
1670
trackedReplicatedObjects.insert(newObj);
1671
}
1672
1673
void set_numa_config(NumaConfig&& cfg) {
1674
config = std::move(cfg);
1675
for (auto&& obj : trackedReplicatedObjects)
1676
obj->on_numa_config_changed();
1677
}
1678
1679
const NumaConfig& get_numa_config() const { return config; }
1680
1681
private:
1682
NumaConfig config;
1683
1684
// std::set uses std::less by default, which is required for pointer comparison
1685
std::set<NumaReplicatedBase*> trackedReplicatedObjects;
1686
};
1687
1688
// Registers this object with the context so it receives notifications when
// the NUMA configuration changes.
inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) :
    context(&ctx) {
    context->attach(this);
}
// Move construction: steal the context pointer (leaving other detached, with
// context == nullptr) and re-point the context's registry entry at this object.
inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept :
    context(std::exchange(other.context, nullptr)) {
    context->move_attached(&other, this);
}
// Move assignment: take over other's context and registry entry.
// NOTE(review): the previous context of *this is not detached here, so
// move-assigning into an object that is still attached elsewhere would leave
// a stale registry entry in its old context — presumably callers only assign
// into fresh or moved-from objects; confirm.
inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept {
    context = std::exchange(other.context, nullptr);

    context->move_attached(&other, this);

    return *this;
}
// Deregisters from the context unless this object was moved-from (in which
// case the registry entry already belongs to the move target).
inline NumaReplicatedBase::~NumaReplicatedBase() {
    if (context != nullptr)
        context->detach(this);
}
// Forwards to the owning context's current configuration.
inline const NumaConfig& NumaReplicatedBase::get_numa_config() const {
    return context->get_numa_config();
}
} // namespace Stockfish
1716
1717
1718
#endif // #ifndef NUMA_H_INCLUDED
1719
1720