Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/crypto/nh-neon-core.S
26439 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
3
* NH - ε-almost-universal hash function, ARM64 NEON accelerated version
4
*
5
* Copyright 2018 Google LLC
6
*
7
* Author: Eric Biggers <[email protected]>
8
*/
9
10
#include <linux/linkage.h>
11
#include <linux/cfi_types.h>
12
13
/*
 * Register aliases.
 *
 * x0-x3 hold the four arguments of nh_neon() in declaration order.
 *
 * PASSn_SUMS each hold two 64-bit partial sums for pass n.
 * K0-K3 hold four consecutive 16-byte key strides; _nh_stride reloads one
 * of them on every invocation.
 * T0-T7 are scratch registers used inside _nh_stride and for the final
 * reduction.
 *
 * K0/K1 and T0/T1 must remain in consecutively numbered vector registers:
 * they are used as two-register lists in ld1/st1.
 *
 * NOTE(review): T0-T7 map to v8-v15, whose low 64 bits are callee-saved
 * under AAPCS64, and nothing in this file saves or restores them.
 * Presumably the caller's SIMD context handling makes this safe - confirm
 * before reusing this code outside that environment.
 */
	KEY		.req	x0
	MESSAGE		.req	x1
	MESSAGE_LEN	.req	x2
	HASH		.req	x3

	PASS0_SUMS	.req	v0
	PASS1_SUMS	.req	v1
	PASS2_SUMS	.req	v2
	PASS3_SUMS	.req	v3
	K0		.req	v4
	K1		.req	v5
	K2		.req	v6
	K3		.req	v7
	T0		.req	v8
	T1		.req	v9
	T2		.req	v10
	T3		.req	v11
	T4		.req	v12
	T5		.req	v13
	T6		.req	v14
	T7		.req	v15
34
35
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes.
 *
 * In:     k0, k1, k2  - vector registers holding the key strides for
 *                       passes 0, 1 and 2 (read only)
 *         MESSAGE     - pointer to the next 16 message bytes
 *         KEY         - pointer to the next 16 key bytes
 * Out:    k3          - loaded with the next key stride, used for pass 3
 *         PASSn_SUMS  - each 64-bit lane i (i = 0, 1) is increased by
 *                       (m[i] + k[i]) * (m[i+2] + k[i+2]), where m[] are
 *                       the four 32-bit message words, k[] the key words
 *                       for pass n, and the additions wrap at 32 bits
 *         MESSAGE, KEY - each advanced by 16
 * Clobb:  T0-T7
 */
.macro _nh_stride	k0, k1, k2, k3

	// Load next message stride
	ld1		{T3.16b}, [MESSAGE], #16

	// Load next key stride
	ld1		{\k3\().4s}, [KEY], #16

	// Add message words to key words (T3 is overwritten last because it
	// is also the source of the message words)
	add		T0.4s, T3.4s, \k0\().4s
	add		T1.4s, T3.4s, \k1\().4s
	add		T2.4s, T3.4s, \k2\().4s
	add		T3.4s, T3.4s, \k3\().4s

	// Multiply 32x32 => 64 and accumulate.  The upper two words of each
	// sum vector are first copied into the low half of a scratch register
	// so that umlal pairs word 0 with word 2 and word 1 with word 3.
	mov		T4.d[0], T0.d[1]
	mov		T5.d[0], T1.d[1]
	mov		T6.d[0], T2.d[1]
	mov		T7.d[0], T3.d[1]
	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
.endm
59
60
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * In:     x0 = key, x1 = message, x2 = message_len, x3 = hash
 * Out:    four 64-bit sums (32 bytes) stored at hash, one per pass
 * Clobb:  x0-x2, v0-v15, condition flags
 *
 * The key is consumed 16 bytes at a time alongside the message.  Three key
 * strides are preloaded and one more is loaded per message stride, so
 * message_len + 48 key bytes are read in total.  The K0-K3 arguments to
 * _nh_stride rotate by one position per stride so that the register
 * loaded by one stride becomes the pass-2 key of the next.
 */
SYM_TYPED_FUNC_START(nh_neon)

	// Preload the first three key strides and zero the accumulators
	ld1		{K0.4s,K1.4s}, [KEY], #32
	movi		PASS0_SUMS.2d, #0
	movi		PASS1_SUMS.2d, #0
	ld1		{K2.4s}, [KEY], #16
	movi		PASS2_SUMS.2d, #0
	movi		PASS3_SUMS.2d, #0

	// Main loop: 4 strides (64 bytes) per iteration.  MESSAGE_LEN is kept
	// biased by -64, so it is >= 0 exactly while a full 64-byte group
	// remains.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Lloop4_done
.Lloop4:
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is now in [-64, -1]; masking with 63 removes the bias
	// and leaves the bytes still to process: 0, 16, 32 or 48.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Ldone
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Ldone
	_nh_stride	K2, K3, K0, K1

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'.
	// addp folds the two 64-bit lanes of each source register, giving
	// T0 = { pass0, pass1 } and T1 = { pass2, pass3 }.
	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
	st1		{T0.16b,T1.16b}, [HASH]
	ret
SYM_FUNC_END(nh_neon)
105
106