/* Source: torvalds/linux — blob/master/lib/crypto/arm/ghash-neon-core.S */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON vmull.p8 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.fpu		neon
14
SHASH .req q0
15
T1 .req q1
16
XL .req q2
17
XM .req q3
18
XH .req q4
19
IN1 .req q4
20
21
SHASH_L .req d0
22
SHASH_H .req d1
23
T1_L .req d2
24
T1_H .req d3
25
XL_L .req d4
26
XL_H .req d5
27
XM_L .req d6
28
XM_H .req d7
29
XH_L .req d8
30
31
t0l .req d10
32
t0h .req d11
33
t1l .req d12
34
t1h .req d13
35
t2l .req d14
36
t2h .req d15
37
t3l .req d16
38
t3h .req d17
39
t4l .req d18
40
t4h .req d19
41
42
t0q .req q5
43
t1q .req q6
44
t2q .req q7
45
t3q .req q8
46
t4q .req q9
47
48
s1l .req d20
49
s1h .req d21
50
s2l .req d22
51
s2h .req d23
52
s3l .req d24
53
s3h .req d25
54
s4l .req d26
55
s4h .req d27
56
57
SHASH2_p8 .req d28
58
59
k16 .req d29
60
k32 .req d30
61
k48 .req d31
62
63
T2 .req q7
64
65
.text
66
67
/*
68
* This implementation of 64x64 -> 128 bit polynomial multiplication
69
* using vmull.p8 instructions (8x8 -> 16) is taken from the paper
70
* "Fast Software Polynomial Multiplication on ARM Processors Using
71
* the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
72
* Ricardo Dahab (https://hal.inria.fr/hal-01506572)
73
*
74
* It has been slightly tweaked for in-order performance, and to allow
75
* 'rq' to overlap with 'ad' or 'bd'.
76
*/
77
.macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
78
vext.8 t0l, \ad, \ad, #1 @ A1
79
.ifc \b1, t4l
80
vext.8 t4l, \bd, \bd, #1 @ B1
81
.endif
82
vmull.p8 t0q, t0l, \bd @ F = A1*B
83
vext.8 t1l, \ad, \ad, #2 @ A2
84
vmull.p8 t4q, \ad, \b1 @ E = A*B1
85
.ifc \b2, t3l
86
vext.8 t3l, \bd, \bd, #2 @ B2
87
.endif
88
vmull.p8 t1q, t1l, \bd @ H = A2*B
89
vext.8 t2l, \ad, \ad, #3 @ A3
90
vmull.p8 t3q, \ad, \b2 @ G = A*B2
91
veor t0q, t0q, t4q @ L = E + F
92
.ifc \b3, t4l
93
vext.8 t4l, \bd, \bd, #3 @ B3
94
.endif
95
vmull.p8 t2q, t2l, \bd @ J = A3*B
96
veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
97
veor t1q, t1q, t3q @ M = G + H
98
.ifc \b4, t3l
99
vext.8 t3l, \bd, \bd, #4 @ B4
100
.endif
101
vmull.p8 t4q, \ad, \b3 @ I = A*B3
102
veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
103
vmull.p8 t3q, \ad, \b4 @ K = A*B4
104
vand t0h, t0h, k48
105
vand t1h, t1h, k32
106
veor t2q, t2q, t4q @ N = I + J
107
veor t0l, t0l, t0h
108
veor t1l, t1l, t1h
109
veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
110
vand t2h, t2h, k16
111
veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
112
vmov.i64 t3h, #0
113
vext.8 t0q, t0q, t0q, #15
114
veor t2l, t2l, t2h
115
vext.8 t1q, t1q, t1q, #14
116
vmull.p8 \rq, \ad, \bd @ D = A*B
117
vext.8 t2q, t2q, t2q, #13
118
vext.8 t3q, t3q, t3q, #12
119
veor t0q, t0q, t1q
120
veor t2q, t2q, t3q
121
veor \rq, \rq, t0q
122
veor \rq, \rq, t2q
123
.endm
124
125
.macro __pmull_reduce_p8
126
veor XL_H, XL_H, XM_L
127
veor XH_L, XH_L, XM_H
128
129
vshl.i64 T1, XL, #57
130
vshl.i64 T2, XL, #62
131
veor T1, T1, T2
132
vshl.i64 T2, XL, #63
133
veor T1, T1, T2
134
veor XL_H, XL_H, T1_L
135
veor XH_L, XH_L, T1_H
136
137
vshr.u64 T1, XL, #1
138
veor XH, XH, XL
139
veor XL, XL, T1
140
vshr.u64 T1, T1, #6
141
vshr.u64 XL, XL, #1
142
.endm
143
144
.macro vrev64_if_be a
145
#ifdef CONFIG_CPU_BIG_ENDIAN
146
vrev64.8 \a, \a
147
#endif
148
.endm
149
150
.macro ghash_update
151
vld1.64 {XL}, [r1]
152
vrev64_if_be XL
153
154
0:
155
vld1.8 {T1}, [r2]!
156
subs r0, r0, #1
157
158
/* multiply XL by SHASH in GF(2^128) */
159
vrev64.8 T1, T1
160
161
vext.8 IN1, T1, T1, #8
162
veor T1_L, T1_L, XL_H
163
veor XL, XL, IN1
164
165
__pmull_p8 XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
166
veor T1, T1, XL
167
__pmull_p8 XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
168
__pmull_p8 XM, T1_L, SHASH2_p8 @ (a1+a0)(b1+b0)
169
170
veor T1, XL, XH
171
veor XM, XM, T1
172
173
__pmull_reduce_p8
174
175
veor T1, T1, XH
176
veor XL, XL, T1
177
178
bne 0b
179
.endm
180
181
/*
182
* void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
183
* const u8 *src,
184
* const struct polyval_elem *h)
185
*/
186
ENTRY(pmull_ghash_update_p8)
187
vld1.64 {SHASH}, [r3]
188
vrev64_if_be SHASH
189
veor SHASH2_p8, SHASH_L, SHASH_H
190
191
vext.8 s1l, SHASH_L, SHASH_L, #1
192
vext.8 s2l, SHASH_L, SHASH_L, #2
193
vext.8 s3l, SHASH_L, SHASH_L, #3
194
vext.8 s4l, SHASH_L, SHASH_L, #4
195
vext.8 s1h, SHASH_H, SHASH_H, #1
196
vext.8 s2h, SHASH_H, SHASH_H, #2
197
vext.8 s3h, SHASH_H, SHASH_H, #3
198
vext.8 s4h, SHASH_H, SHASH_H, #4
199
200
vmov.i64 k16, #0xffff
201
vmov.i64 k32, #0xffffffff
202
vmov.i64 k48, #0xffffffffffff
203
204
ghash_update
205
vrev64_if_be XL
206
vst1.64 {XL}, [r1]
207
208
bx lr
209
ENDPROC(pmull_ghash_update_p8)
210
211