CoCalc -- nnue_feature

GitHub Repository: official-stockfish/Stockfish
Path: blob/master/src/nnue/nnue_feature_transformer.h
³⁷⁵ views
1
/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
18

19
// A class that converts the input features of the NNUE evaluation function
20

21
#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
22
#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
23

24
#include <algorithm>
25
#include <cstdint>
26
#include <cstring>
27
#include <iosfwd>
28

29
#include "../position.h"
30
#include "../types.h"
31
#include "nnue_accumulator.h"
32
#include "nnue_architecture.h"
33
#include "nnue_common.h"
34
#include "simd.h"
35

36
namespace Stockfish::Eval::NNUE {
37

38
// Returns the inverse of a permutation: for every index i of `order`, the
// result satisfies inverse[order[i]] == i.
//
// `order` is expected to contain each value in [0, Len) exactly once. Its
// entries are used directly as indices into the result, without a range
// check, so an out-of-range entry is an out-of-bounds write (and a compile
// error when the call is evaluated in a constant expression).
template<std::size_t Len>
constexpr std::array<std::size_t, Len>
invert_permutation(const std::array<std::size_t, Len>& order) {
    std::array<std::size_t, Len> inverse{};
    for (std::size_t i = 0; i < order.size(); i++)
        inverse[order[i]] = i;
    return inverse;
}
47

48
// Treats `data` as a byte region of TotalSize = N * sizeof(T) bytes and walks
// it in consecutive groups of OrderSize blocks, each block being BlockSize
// bytes long. Within every group the blocks are rearranged in place so that
// block j of the result is the former block order[j] of that same group.
//
// TotalSize must be a multiple of BlockSize * OrderSize (checked at compile
// time). `order` is expected to be a permutation of [0, OrderSize); its
// entries are used as block indices without a range check.
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
    // A zero-sized group would otherwise only show up as a modulo-by-zero
    // diagnostic in the divisibility check below.
    static_assert(BlockSize > 0 && OrderSize > 0, "BlockSize and OrderSize must be non-zero");

    constexpr std::size_t TotalSize        = N * sizeof(T);
    constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;

    static_assert(TotalSize % ProcessChunkSize == 0,
                  "BlockSize * OrderSize must perfectly divide TotalSize");

    // Scratch space for one permuted group; every byte is overwritten before
    // it is copied back.
    std::array<std::byte, ProcessChunkSize> buffer{};

    std::byte* const bytes = reinterpret_cast<std::byte*>(data);

    for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
    {
        std::byte* const values = &bytes[i];

        // Gather the blocks of this group into the buffer in their new order
        for (std::size_t j = 0; j < OrderSize; j++)
        {
            auto* const buffer_chunk = &buffer[j * BlockSize];
            auto* const value_chunk  = &values[order[j] * BlockSize];

            std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
        }

        // Write the permuted group back over the original bytes
        std::copy(buffer.begin(), buffer.end(), values);
    }
}
78

79
// Input feature converter
80
template<IndexType TransformedFeatureDimensions>
81
class FeatureTransformer {
82

83
    // Number of output dimensions for one side
84
    static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
85

86
   public:
87
    // Output type
88
    using OutputType = TransformedFeatureType;
89

90
    // Number of input/output dimensions
91
    static constexpr IndexType InputDimensions  = FeatureSet::Dimensions;
92
    static constexpr IndexType OutputDimensions = HalfDimensions;
93

94
    // Size of forward propagation buffer
95
    static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);
96

97
    // Store the order by which 128-bit blocks of a 1024-bit data must
98
    // be permuted so that calling packus on adjacent vectors of 16-bit
99
    // integers loaded from the data results in the pre-permutation order
100
    static constexpr auto PackusEpi16Order = []() -> std::array<std::size_t, 8> {
101
#if defined(USE_AVX512)
102
        // _mm512_packus_epi16 after permutation:
103
        // |   0   |   2   |   4   |   6   | // Vector 0
104
        // |   1   |   3   |   5   |   7   | // Vector 1
105
        // | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result
106
        return {0, 2, 4, 6, 1, 3, 5, 7};
107
#elif defined(USE_AVX2)
108
        // _mm256_packus_epi16 after permutation:
109
        // |   0   |   2   |  |   4   |   6   | // Vector 0, 2
110
        // |   1   |   3   |  |   5   |   7   | // Vector 1, 3
111
        // | 0 | 1 | 2 | 3 |  | 4 | 5 | 6 | 7 | // Packed Result
112
        return {0, 2, 1, 3, 4, 6, 5, 7};
113
#else
114
        return {0, 1, 2, 3, 4, 5, 6, 7};
115
#endif
116
    }();
117

118
    static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order);
119

120
    // Hash value embedded in the evaluation file
121
    static constexpr std::uint32_t get_hash_value() {
122
        return FeatureSet::HashValue ^ (OutputDimensions * 2);
123
    }
124

125
    void permute_weights() {
126
        permute<16>(biases, PackusEpi16Order);
127
        permute<16>(weights, PackusEpi16Order);
128
    }
129

130
    void unpermute_weights() {
131
        permute<16>(biases, InversePackusEpi16Order);
132
        permute<16>(weights, InversePackusEpi16Order);
133
    }
134

135
    inline void scale_weights(bool read) {
136
        for (IndexType j = 0; j < InputDimensions; ++j)
137
        {
138
            WeightType* w = &weights[j * HalfDimensions];
139
            for (IndexType i = 0; i < HalfDimensions; ++i)
140
                w[i] = read ? w[i] * 2 : w[i] / 2;
141
        }
142

143
        for (IndexType i = 0; i < HalfDimensions; ++i)
144
            biases[i] = read ? biases[i] * 2 : biases[i] / 2;
145
    }
146

147
    // Read network parameters
148
    bool read_parameters(std::istream& stream) {
149

150
        read_leb_128<BiasType>(stream, biases, HalfDimensions);
151
        read_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
152
        read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
153

154
        permute_weights();
155
        scale_weights(true);
156
        return !stream.fail();
157
    }
158

159
    // Write network parameters
160
    bool write_parameters(std::ostream& stream) {
161

162
        unpermute_weights();
163
        scale_weights(false);
164

165
        write_leb_128<BiasType>(stream, biases, HalfDimensions);
166
        write_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
167
        write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
168

169
        permute_weights();
170
        scale_weights(true);
171
        return !stream.fail();
172
    }
173

174
    // Convert input features
175
    std::int32_t transform(const Position&                           pos,
176
                           AccumulatorStack&                         accumulatorStack,
177
                           AccumulatorCaches::Cache<HalfDimensions>* cache,
178
                           OutputType*                               output,
179
                           int                                       bucket) const {
180

181
        using namespace SIMD;
182

183
        accumulatorStack.evaluate(pos, *this, *cache);
184
        const auto& accumulatorState = accumulatorStack.latest();
185

186
        const Color perspectives[2]  = {pos.side_to_move(), ~pos.side_to_move()};
187
        const auto& psqtAccumulation = (accumulatorState.acc<HalfDimensions>()).psqtAccumulation;
188
        const auto  psqt =
189
          (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket])
190
          / 2;
191

192
        const auto& accumulation = (accumulatorState.acc<HalfDimensions>()).accumulation;
193

194
        for (IndexType p = 0; p < 2; ++p)
195
        {
196
            const IndexType offset = (HalfDimensions / 2) * p;
197

198
#if defined(VECTOR)
199

200
            constexpr IndexType OutputChunkSize = MaxChunkSize;
201
            static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
202
            constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
203

204
            const vec_t Zero = vec_zero();
205
            const vec_t One  = vec_set_16(127 * 2);
206

207
            const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
208
            const vec_t* in1 =
209
              reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
210
            vec_t* out = reinterpret_cast<vec_t*>(output + offset);
211

212
            // Per the NNUE architecture, here we want to multiply pairs of
213
            // clipped elements and divide the product by 128. To do this,
214
            // we can naively perform min/max operation to clip each of the
215
            // four int16 vectors, mullo pairs together, then pack them into
216
            // one int8 vector. However, there exists a faster way.
217

218
            // The idea here is to use the implicit clipping from packus to
219
            // save us two vec_max_16 instructions. This clipping works due
220
            // to the fact that any int16 integer below zero will be zeroed
221
            // on packus.
222

223
            // Consider the case where the second element is negative.
224
            // If we do standard clipping, that element will be zero, which
225
            // means our pairwise product is zero. If we perform packus and
226
            // remove the lower-side clip for the second element, then our
227
            // product before packus will be negative, and is zeroed on pack.
228
            // The two operation produce equivalent results, but the second
229
            // one (using packus) saves one max operation per pair.
230

231
            // But here we run into a problem: mullo does not preserve the
232
            // sign of the multiplication. We can get around this by doing
233
            // mulhi, which keeps the sign. But that requires an additional
234
            // tweak.
235

236
            // mulhi cuts off the last 16 bits of the resulting product,
237
            // which is the same as performing a rightward shift of 16 bits.
238
            // We can use this to our advantage. Recall that we want to
239
            // divide the final product by 128, which is equivalent to a
240
            // 7-bit right shift. Intuitively, if we shift the clipped
241
            // value left by 9, and perform mulhi, which shifts the product
242
            // right by 16 bits, then we will net a right shift of 7 bits.
243
            // However, this won't work as intended. Since we clip the
244
            // values to have a maximum value of 127, shifting it by 9 bits
245
            // might occupy the signed bit, resulting in some positive
246
            // values being interpreted as negative after the shift.
247

248
            // There is a way, however, to get around this limitation. When
249
            // loading the network, scale accumulator weights and biases by
250
            // 2. To get the same pairwise multiplication result as before,
251
            // we need to divide the product by 128 * 2 * 2 = 512, which
252
            // amounts to a right shift of 9 bits. So now we only have to
253
            // shift left by 7 bits, perform mulhi (shifts right by 16 bits)
254
            // and net a 9 bit right shift. Since we scaled everything by
255
            // two, the values are clipped at 127 * 2 = 254, which occupies
256
            // 8 bits. Shifting it by 7 bits left will no longer occupy the
257
            // signed bit, so we are safe.
258

259
            // Note that on NEON processors, we shift left by 6 instead
260
            // because the instruction "vqdmulhq_s16" also doubles the
261
            // return value after the multiplication, adding an extra shift
262
            // to the left by 1, so we compensate by shifting less before
263
            // the multiplication.
264

265
            constexpr int shift =
266
    #if defined(USE_SSE2)
267
              7;
268
    #else
269
              6;
270
    #endif
271

272
            for (IndexType j = 0; j < NumOutputChunks; ++j)
273
            {
274
                const vec_t sum0a =
275
                  vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift);
276
                const vec_t sum0b =
277
                  vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift);
278
                const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One);
279
                const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One);
280

281
                const vec_t pa = vec_mulhi_16(sum0a, sum1a);
282
                const vec_t pb = vec_mulhi_16(sum0b, sum1b);
283

284
                out[j] = vec_packus_16(pa, pb);
285
            }
286

287
#else
288

289
            for (IndexType j = 0; j < HalfDimensions / 2; ++j)
290
            {
291
                BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
292
                BiasType sum1 =
293
                  accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
294
                sum0               = std::clamp<BiasType>(sum0, 0, 127 * 2);
295
                sum1               = std::clamp<BiasType>(sum1, 0, 127 * 2);
296
                output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 512);
297
            }
298

299
#endif
300
        }
301

302
        return psqt;
303
    }  // end of function transform()
304

305
    alignas(CacheLineSize) BiasType biases[HalfDimensions];
306
    alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
307
    alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
308
};
309

310
}  // namespace Stockfish::Eval::NNUE
311

312
#endif  // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
313

314
Product

Resources

Company