GitHub Repository: official-stockfish/Stockfish
Path: blob/master/src/nnue/nnue_feature_transformer.h
/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2025 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

// A class that converts the input features of the NNUE evaluation function

#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED

#include <algorithm>
#include <array>    // std::array (used throughout below)
#include <cstddef>  // std::byte, std::size_t
#include <cstdint>
#include <cstring>
#include <iosfwd>

#include "../position.h"
#include "../types.h"
#include "nnue_accumulator.h"
#include "nnue_architecture.h"
#include "nnue_common.h"
#include "simd.h"

namespace Stockfish::Eval::NNUE {

// Returns the inverse of a permutation
template<std::size_t Len>
constexpr std::array<std::size_t, Len>
invert_permutation(const std::array<std::size_t, Len>& order) {
    std::array<std::size_t, Len> inverse{};
    for (std::size_t i = 0; i < order.size(); i++)
        inverse[order[i]] = i;
    return inverse;
}
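
// For example (editor's illustration): inverting the AVX-512 packus order
// used below, invert_permutation({0, 2, 4, 6, 1, 3, 5, 7}) yields
// {0, 4, 1, 5, 2, 6, 3, 7}; since order[1] == 2, the inverse maps index 2
// back to position 1.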

// Divide a byte region of size TotalSize into blocks of size
// BlockSize, and permute the blocks according to a given order
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
    constexpr std::size_t TotalSize = N * sizeof(T);

    static_assert(TotalSize % (BlockSize * OrderSize) == 0,
                  "BlockSize * OrderSize must perfectly divide TotalSize");

    constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;

    std::array<std::byte, ProcessChunkSize> buffer{};

    std::byte* const bytes = reinterpret_cast<std::byte*>(data);

    for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
    {
        std::byte* const values = &bytes[i];

        for (std::size_t j = 0; j < OrderSize; j++)
        {
            auto* const buffer_chunk = &buffer[j * BlockSize];
            auto* const value_chunk  = &values[order[j] * BlockSize];

            std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
        }

        std::copy(std::begin(buffer), std::end(buffer), values);
    }
}
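
// For example (editor's illustration): permute<16>(weights, {0, 2, 1, 3, 4, 6, 5, 7})
// rearranges each 128-byte chunk of `weights` 16 bytes at a time, so after
// the call the block that was at index order[j] sits at index j (blocks 1
// and 2 swap positions, as do blocks 5 and 6).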

// Input feature converter
template<IndexType TransformedFeatureDimensions>
class FeatureTransformer {

    // Number of output dimensions for one side
    static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;

   public:
    // Output type
    using OutputType = TransformedFeatureType;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions  = FeatureSet::Dimensions;
    static constexpr IndexType OutputDimensions = HalfDimensions;

    // Size of forward propagation buffer
    static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);

    // Stores the order in which the 128-bit blocks of a 1024-bit chunk of
    // data must be permuted so that calling packus on adjacent vectors of
    // 16-bit integers loaded from the data results in the pre-permutation order
    static constexpr auto PackusEpi16Order = []() -> std::array<std::size_t, 8> {
#if defined(USE_AVX512)
        // _mm512_packus_epi16 after permutation:
        // |  0  |  2  |  4  |  6  | // Vector 0
        // |  1  |  3  |  5  |  7  | // Vector 1
        // | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result
        return {0, 2, 4, 6, 1, 3, 5, 7};
#elif defined(USE_AVX2)
        // _mm256_packus_epi16 after permutation:
        // |  0  |  2  |   |  4  |  6  | // Vector 0, 2
        // |  1  |  3  |   |  5  |  7  | // Vector 1, 3
        // | 0 | 1 | 2 | 3 |  | 4 | 5 | 6 | 7 | // Packed Result
        return {0, 2, 1, 3, 4, 6, 5, 7};
#else
        return {0, 1, 2, 3, 4, 5, 6, 7};
#endif
    }();

    static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order);
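
    // Editor's note: the AVX2 order {0, 2, 1, 3, 4, 6, 5, 7} is its own
    // inverse (it is a product of disjoint transpositions), while the
    // AVX-512 order {0, 2, 4, 6, 1, 3, 5, 7} inverts to {0, 4, 1, 5, 2, 6, 3, 7}.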

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value() {
        return FeatureSet::HashValue ^ (OutputDimensions * 2);
    }

    void permute_weights() {
        permute<16>(biases, PackusEpi16Order);
        permute<16>(weights, PackusEpi16Order);
    }

    void unpermute_weights() {
        permute<16>(biases, InversePackusEpi16Order);
        permute<16>(weights, InversePackusEpi16Order);
    }

    inline void scale_weights(bool read) {
        for (IndexType j = 0; j < InputDimensions; ++j)
        {
            WeightType* w = &weights[j * HalfDimensions];
            for (IndexType i = 0; i < HalfDimensions; ++i)
                w[i] = read ? w[i] * 2 : w[i] / 2;
        }

        for (IndexType i = 0; i < HalfDimensions; ++i)
            biases[i] = read ? biases[i] * 2 : biases[i] / 2;
    }
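
    // Editor's note: the doubling on read (and halving on write) enables the
    // mulhi-based pairwise-product trick used in transform() below; see the
    // long comment there for the full derivation.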

    // Read network parameters
    bool read_parameters(std::istream& stream) {

        read_leb_128<BiasType>(stream, biases, HalfDimensions);
        read_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
        read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);

        permute_weights();
        scale_weights(true);
        return !stream.fail();
    }

    // Write network parameters
    bool write_parameters(std::ostream& stream) {

        unpermute_weights();
        scale_weights(false);

        write_leb_128<BiasType>(stream, biases, HalfDimensions);
        write_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
        write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);

        permute_weights();
        scale_weights(true);
        return !stream.fail();
    }
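
    // Editor's note: write_parameters() undoes the in-memory permutation and
    // scaling before serializing, then reapplies them, so the on-disk format
    // is independent of the SIMD layout chosen at build time.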

    // Convert input features
    std::int32_t transform(const Position& pos,
                           AccumulatorStack& accumulatorStack,
                           AccumulatorCaches::Cache<HalfDimensions>* cache,
                           OutputType* output,
                           int bucket) const {

        using namespace SIMD;

        accumulatorStack.evaluate(pos, *this, *cache);
        const auto& accumulatorState = accumulatorStack.latest();

        const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
        const auto& psqtAccumulation = (accumulatorState.acc<HalfDimensions>()).psqtAccumulation;
        const auto  psqt =
          (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket])
          / 2;

        const auto& accumulation = (accumulatorState.acc<HalfDimensions>()).accumulation;

        for (IndexType p = 0; p < 2; ++p)
        {
            const IndexType offset = (HalfDimensions / 2) * p;

#if defined(VECTOR)

            constexpr IndexType OutputChunkSize = MaxChunkSize;
            static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
            constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;

            const vec_t Zero = vec_zero();
            const vec_t One  = vec_set_16(127 * 2);

            const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
            const vec_t* in1 =
              reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
            vec_t* out = reinterpret_cast<vec_t*>(output + offset);

            // Per the NNUE architecture, here we want to multiply pairs of
            // clipped elements and divide the product by 128. To do this,
            // we can naively perform min/max operations to clip each of the
            // four int16 vectors, mullo pairs together, then pack them into
            // one int8 vector. However, there exists a faster way.

            // The idea here is to use the implicit clipping from packus to
            // save us two vec_max_16 instructions. This clipping works due
            // to the fact that any int16 integer below zero will be zeroed
            // on packus.

            // Consider the case where the second element is negative.
            // If we do standard clipping, that element will be zero, which
            // means our pairwise product is zero. If we perform packus and
            // remove the lower-side clip for the second element, then our
            // product before packus will be negative, and is zeroed on pack.
            // The two operations produce equivalent results, but the second
            // one (using packus) saves one max operation per pair.

            // But here we run into a problem: mullo does not preserve the
            // sign of the multiplication. We can get around this by doing
            // mulhi, which keeps the sign. But that requires an additional
            // tweak.

            // mulhi cuts off the last 16 bits of the resulting product,
            // which is the same as performing a rightward shift of 16 bits.
            // We can use this to our advantage. Recall that we want to
            // divide the final product by 128, which is equivalent to a
            // 7-bit right shift. Intuitively, if we shift the clipped
            // value left by 9 and perform mulhi, which shifts the product
            // right by 16 bits, then we will net a right shift of 7 bits.
            // However, this won't work as intended. Since we clip the
            // values to a maximum of 127, shifting them left by 9 bits
            // might spill into the sign bit, resulting in some positive
            // values being interpreted as negative after the shift.

            // There is a way, however, to get around this limitation. When
            // loading the network, scale accumulator weights and biases by
            // 2. To get the same pairwise multiplication result as before,
            // we need to divide the product by 128 * 2 * 2 = 512, which
            // amounts to a right shift of 9 bits. So now we only have to
            // shift left by 7 bits, perform mulhi (which shifts right by
            // 16 bits) and net a 9-bit right shift. Since we scaled
            // everything by two, the values are clipped at 127 * 2 = 254,
            // which occupies 8 bits. Shifting left by 7 bits no longer
            // touches the sign bit, so we are safe.

            // Note that on NEON processors, we shift left by 6 instead,
            // because the instruction "vqdmulhq_s16" also doubles the
            // return value after the multiplication, adding an extra shift
            // to the left by 1, so we compensate by shifting less before
            // the multiplication.
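
            // Worked example (editor's addition): with both activations at
            // their pre-scale maximum of 127, the stored (doubled) values
            // are 254 each. 254 << 7 == 32512 still fits in a signed 16-bit
            // lane, and mulhi(32512, 254) == (32512 * 254) >> 16 == 126,
            // exactly (127 * 127) >> 7, the result the naive
            // clip-mullo-shift path would produce.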

            constexpr int shift =
#if defined(USE_SSE2)
              7;
#else
              6;
#endif

            for (IndexType j = 0; j < NumOutputChunks; ++j)
            {
                const vec_t sum0a =
                  vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift);
                const vec_t sum0b =
                  vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift);
                const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One);
                const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One);

                const vec_t pa = vec_mulhi_16(sum0a, sum1a);
                const vec_t pb = vec_mulhi_16(sum0b, sum1b);

                out[j] = vec_packus_16(pa, pb);
            }

#else

            for (IndexType j = 0; j < HalfDimensions / 2; ++j)
            {
                BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
                BiasType sum1 =
                  accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
                sum0 = std::clamp<BiasType>(sum0, 0, 127 * 2);
                sum1 = std::clamp<BiasType>(sum1, 0, 127 * 2);
                output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 512);
            }
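
            // Editor's note: the divisor 512 == 128 * 2 * 2 accounts for the
            // factor-of-two weight scaling applied to each operand on load,
            // matching the net 9-bit right shift of the vector path above.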

#endif
        }

        return psqt;
    }  // end of function transform()

    alignas(CacheLineSize) BiasType       biases[HalfDimensions];
    alignas(CacheLineSize) WeightType     weights[HalfDimensions * InputDimensions];
    alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
};

}  // namespace Stockfish::Eval::NNUE

#endif  // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED