GitHub Repository: stenzek/duckstation
Path: blob/master/dep/lzma/src/Sha256Opt.c

/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

// #define Z7_USE_HW_SHA_STUB // for debug

#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#endif
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1900
#endif
#if (_MSC_VER >= USE_VER_MIN)
#define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA256 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd

SSSE3:
  _mm_shuffle_epi8 / pshufb
  _mm_alignr_epi8
SHA:
  _mm_sha256*
*/
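
/*
Overview of the hardware path below: each 64-byte block is loaded as four
128-bit vectors and byte-swapped to host order; _mm_sha256rnds2_epu32 then
performs two rounds per call on state packed as {c,d,g,h}/{a,b,e,f}, while
_mm_sha256msg1_epu32 / _mm_sha256msg2_epu32 compute the message schedule
in parallel with the rounds.
*/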

// The K array must be aligned to at least 16 bytes.
// The compiler can see the align attribute and then selects:
//   movdqu - for code without the align attribute
//   movdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);


#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask); \

#define SM1(g0, g1, g2, g3) \
    SHA256_MSG1(g3, g0); \

#define SM2(g0, g1, g2, g3) \
    tmp = _mm_alignr_epi8(g1, g0, 4); \
    ADD_EPI32(g2, tmp) \
    SHA256_MSG2(g2, g1); \

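/*
Message-schedule sketch (standard Intel SHA-extension semantics): the
schedule is W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]).
SM1 folds the s0 terms into the oldest vector via _mm_sha256msg1_epu32;
in SM2, the alignr+add supplies the W[i-7] addends and
_mm_sha256msg2_epu32 completes the s1 terms, so each SM1/SM2 pair
produces the next four schedule words.
*/
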
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)


#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

#define RND2_0(m, k) \
    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E); \


#define RND2_1 \
    RND2(state1, state0); \

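/*
Each _mm_sha256rnds2_epu32 call performs two SHA-256 rounds, consuming
the two low 32-bit words of msg as the K+W values; RND2_0 pre-adds the
round constants, and the 0x0E shuffle moves the upper two words of msg
down so RND2_1 can run the next two rounds: four rounds per pair.
*/
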
// We use a scheme that runs 3 rounds ahead for SHA256_MSG1 and 2 rounds ahead for SHA256_MSG2.

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    RND2_0(g0, k) \
    OP0(g0, g1, g2, g3) \
    RND2_1 \
    OP1(g0, g1, g2, g3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \

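/*
PREPARE_STATE repacks the linear state {a,b,c,d | e,f,g,h} into the
{c,d,g,h} / {a,b,e,f} lane order expected by _mm_sha256rnds2_epu32.
The transform is its own inverse, so the same macro converts into the
hardware layout before the loop and back out after it.
*/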

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
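  /* This pshufb mask reverses the bytes within each 32-bit lane
     (byte indices 3,2,1,0 / 7,6,5,4 / ...), converting the block's
     big-endian words to host order in a single shuffle. */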
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    ADD_EPI32(state0, state0_save)
    ADD_EPI32(state1, state1_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
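
#if 0
/*
Usage sketch (illustrative only; excluded from the build). It hashes one
pre-padded 64-byte block starting from the standard SHA-256 initial state.
In upstream 7-Zip, callers normally reach this routine through the
function-pointer dispatch in Sha256.c rather than calling it directly.
*/
static void Example_HashOneBlock(const Byte block[64], UInt32 digest[8])
{
  // FIPS 180-4 initial hash values h0..h7.
  UInt32 st[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
  };
  unsigned i;
  Sha256_UpdateBlocks_HW(st, block, 1);
  // st now holds the updated state words in host order; emitting the
  // conventional big-endian digest bytes would need a per-word byte swap.
  for (i = 0; i < 8; i++)
    digest[i] = st[i];
}
#endif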

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

#if defined(__ARM_FEATURE_SHA2) \
 || defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
 || defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
    ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
   || defined(__GNUC__) && (__GNUC__ >= 6) \
    ) \
 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
 || !defined(Z7_CLANG_VERSION) \
 || defined(__ARM_NEON) && \
    (Z7_CLANG_VERSION < 170000 || \
     Z7_CLANG_VERSION > 170001)
#define USE_HW_SHA
#endif
#endif
#endif
#endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else
#if defined(__clang__) && (__clang_major__ >= 1)
#define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
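
/* On little-endian targets, vrev32q_u8 reverses the bytes within each
   32-bit lane, converting the block's big-endian words to host order;
   on big-endian targets the loaded words already match, so the macro
   expands to nothing. */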

#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)


#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3); \

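/*
vsha256hq_u32 / vsha256h2q_u32 each advance the hash by four rounds: the
first updates the {a,b,c,d} half and the second the {e,f,g,h} half, so a
single R4 covers four rounds and, unlike the x86 path, no PREPARE_STATE
repacking is needed.
*/
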
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \


void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// This file can also be compiled with another C compiler, or an asm
// version can be used, so that real code is generated instead of this
// stub function.
// #include "Sha256.h"
// #if defined(_MSC_VER)
#pragma message("Sha256 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif
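
/*
Caller-side dispatch sketch (illustrative; the real selection lives in
Sha256.c, and the helper shown here is an assumption about CpuArch.h):
the HW routine should only be selected when the CPU reports SHA support,
e.g.

  if (CPU_IsSupported_SHA())   // CPUID-based feature check
    updateBlocks = Sha256_UpdateBlocks_HW;
  else
    updateBlocks = Sha256_UpdateBlocks;
*/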


#undef K
#undef RND2
#undef RND2_0
#undef RND2_1

#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2

#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB