Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/nh-sse2.S
121833 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
* NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
4
*
5
* Copyright 2018 Google LLC
6
*
7
* Author: Eric Biggers <ebiggers@google.com>
8
*/
9
10
#include <linux/linkage.h>
11
12
/*
 * Register assignments.
 *
 * PASSn_SUMS	One 128-bit accumulator per NH pass.  Each holds two 64-bit
 *		partial sums (lanes A and B) that are added together just
 *		before the result is stored.
 * K0-K3	Window of four consecutive 16-byte key strides.  The roles of
 *		these registers rotate from one _nh_stride invocation to the
 *		next.
 * T0-T7	Scratch registers.
 */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/*
 * The four arguments of nh_sse2(), in the first four integer argument
 * registers of the System V AMD64 calling convention.
 */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx
32
33
/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride for all four NH passes.
 *
 * k0, k1, k2	Registers that already hold the key strides for passes 0-2.
 * k3		Register that receives the 16 key bytes loaded from
 *		offset(KEY); this is the key stride for pass 3.
 * offset	Byte displacement applied to both MESSAGE and KEY.
 *
 * On exit k0 and T1-T7 are clobbered, k1 and k2 are unchanged, and k3 still
 * holds the key stride that was just loaded.  That is why successive
 * invocations rotate the register list: the old k1, k2, k3 become the new
 * k0, k1, k2 and the clobbered k0 becomes the next load target.
 * The 64-bit products are accumulated into PASS0_SUMS-PASS3_SUMS.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words (32-bit lanes, wrapping)
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate.
	//
	// pmuludq only reads dwords 0 and 2 of each operand, so each sum
	// vector is split in two: the 0x10 shuffle puts words 0 and 1 into
	// those positions, the 0x32 shuffle puts words 2 and 3 there.  The
	// multiply then yields (word0 * word2, word1 * word3).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm
67
68
/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Reads message_len bytes from 'message' and message_len + 0x30 bytes from
 * 'key' (three key strides are preloaded, then one more is loaded per message
 * stride).  Writes 32 bytes, i.e. four 64-bit sums, to 'hash'.  All memory
 * accesses use unaligned moves.
 *
 * Clobbers %xmm0-%xmm15, KEY, MESSAGE, MESSAGE_LEN and the flags.
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides and step KEY past them, so that
	// offset N addresses both message stride N and the fourth key stride
	// that goes with it.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY

	// Zero the per-pass accumulators
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Main loop: four strides (0x40 bytes) per iteration.  MESSAGE_LEN is
	// kept biased by -0x40, so it goes negative once fewer than 0x40 bytes
	// remain.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// After four strides the key register roles are back where they began
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// The mask undoes the bias: MESSAGE_LEN = bytes left, one of
	// 0x00, 0x10, 0x20 or 0x30.  Handle up to three trailing strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0		// => hash[0], hash[1]
	paddq		PASS2_SUMS, T1		// => hash[2], hash[3]
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)
124
125