/* Source: GitHub repository torvalds/linux, blob/master/arch/x86/crypto/nh-sse2-x86_64.S */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

/*
 * Register aliases (GNU as, AT&T operand order: "op src, dst").
 *
 * PASSn_SUMS: one accumulator per NH pass; each holds two 64-bit partial
 *             sums that are folded together at the end of nh_sse2.
 */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

/*
 * K0-K3: sliding window of four 16-byte key strides.  The roles rotate from
 * one _nh_stride invocation to the next, so no register-to-register moves
 * are needed to advance the window.
 */
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

/*
 * T0-T7: scratch.  T1-T7 are used inside _nh_stride; T0 and T1 are used by
 * the final reduction in nh_sse2.
 */
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/*
 * Function arguments of nh_sse2(key, message, message_len, hash), in the
 * order the first four integer arguments are passed on x86_64 System V.
 */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride for all four NH passes.
 *
 *   \k0, \k1, \k2  registers already holding the key strides used by
 *                  passes 0, 1 and 2
 *   \k3            register that is loaded here with the key stride at
 *                  \offset(KEY); used by pass 3
 *   \offset        byte displacement applied to both MESSAGE and KEY
 *
 * For pass i, the four 32-bit message words are added (mod 2^32) to the
 * words of key stride \k<i>, giving (s0 s1 s2 s3).  The 64-bit products
 * s0*s2 and s1*s3 are then added to the two 64-bit lanes of PASS<i>_SUMS.
 *
 * Clobbers T1-T7 and \k0 (its key stride is consumed in place).  \k1 and
 * \k2 are only read and \k3 leaves holding the freshly loaded key stride,
 * which is why callers rotate the register arguments on each invocation.
 * Does not modify KEY, MESSAGE or any general-purpose register.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// pmuludq multiplies only the low dword of each 64-bit lane.
	// pshufd $0x10 places dwords 0 and 1 in those low positions and
	// pshufd $0x32 places dwords 2 and 3 there, so each pmuludq below
	// yields (s0 * s2, s1 * s3) for its pass.
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:     KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:    four 64-bit sums, one per pass, stored to hash[0..3]
 *         (two unaligned 16-byte stores)
 * Clobb:  %rdi, %rsi, %rdx, %xmm0-%xmm15, flags
 * Stack:  none
 *
 * All loads and stores use movdqu, so no pointer alignment is required.
 * The length checks use signed branches (jl/jge) after subtracting 0x40,
 * so message_len is treated as a signed quantity here.
 */
SYM_TYPED_FUNC_START(nh_sse2)

	// Preload the first three key strides.  KEY is then advanced by
	// 0x30 so that \offset(KEY) inside _nh_stride addresses the key
	// stride three positions ahead of the message stride at
	// \offset(MESSAGE).
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Main loop: 0x40 message bytes (four strides) per iteration.
	// MESSAGE_LEN is kept biased by -0x40 so the loop condition is
	// simply the sign of the subtraction result.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// The key registers rotate by one position per stride; after four
	// strides K0-K3 are back in their original roles.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// (x - 0x40) & 0x3f == x & 0x3f, so this removes the bias and
	// leaves the number of leftover bytes: 0x00, 0x10, 0x20 or 0x30.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//
	// Each PASSn_SUMS holds two 64-bit partial sums (A = low qword,
	// B = high qword).  The unpacks interleave them pairwise so a
	// single paddq produces A + B for two passes at once.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)		// hash[0], hash[1]
	movdqu		T1, 0x10(HASH)		// hash[2], hash[3]
	RET
SYM_FUNC_END(nh_sse2)