Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/ghashv8-armx.S
39536 views
1
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
2
#include "arm_arch.h"
3
4
// Everything from here to the matching #endif at the end of the file is
// assembled only when __ARM_MAX_ARCH__ is at least 7.
// NOTE(review): the bare decimal lines interleaved with the code in this
// copy look like line-counter residue from a web rendering; confirm they
// are absent from the real generated file.
#if __ARM_MAX_ARCH__>=7
5
// Let the assembler accept the crypto extension; the routines in this
// file use pmull/pmull2 producing .1q results.
.arch armv8-a+crypto
6
// The routines that follow are emitted into the code section.
.text
7
//----------------------------------------------------------------------
// gcm_init_v8: precompute the table of powers of the hash key H that the
// multiply/hash routines in this file load.
//
// In:    x0 = Htable, destination; twelve 16-byte entries are written
//        x1 = H, 16-byte input (read with a single ld1 of 2 x 64 bits)
// Out:   Htable[0]  = twisted H
//        Htable[1]  = packed Karatsuba halves (lo^hi) of H and H^2
//        Htable[2]  = H^2
//        Htable[3]  = H^3
//        Htable[4]  = packed Karatsuba halves of H^3 and H^4
//        Htable[5]  = H^4
//        Htable[6]  = H^5
//        Htable[7]  = packed Karatsuba halves of H^5 and H^6
//        Htable[8]  = H^6
//        Htable[9]  = H^7
//        Htable[10] = packed Karatsuba halves of H^7 and H^8
//        Htable[11] = H^8
// Clobb: x0 (advanced by 16+32+48+48 = 144 bytes), v0-v7, v16-v31.
//        No stack is used and no calls are made.
// NOTE(review): presumably called from C as gcm_init_v8(Htable, H); the
// prototype is not visible in this file -- confirm.
//----------------------------------------------------------------------
.globl gcm_init_v8
8
.type gcm_init_v8,%function
9
.align 4
10
gcm_init_v8:
11
// Macro supplied by arm_arch.h (definition not visible here); presumably
// a landing pad for indirect calls -- confirm.
AARCH64_VALID_CALL_TARGET
12
ld1 {v17.2d},[x1] //load input H
13
// v19 = 0xc200000000000000 in both 64-bit lanes: 0xe1 in every byte,
// shifted left by 57 within each lane. It is the multiplier used by
// every "1st phase"/"2nd phase" reduction pmull below.
movi v19.16b,#0xe1
14
shl v19.2d,v19.2d,#57 //0xc2.0
15
// Twist H: swap the 64-bit halves (v3), shift the 128-bit value left by
// one bit with the carry moved from the low lane into the high lane, and
// xor in t0 when the bit shifted out of the top was set. The mask for
// that last step is the sign bit of word 1 of the loaded H, broadcast to
// all lanes by dup + sshr.
ext v3.16b,v17.16b,v17.16b,#8
16
ushr v18.2d,v19.2d,#63
17
dup v17.4s,v17.s[1]
18
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
19
ushr v18.2d,v3.2d,#63
20
sshr v17.4s,v17.4s,#31 //broadcast carry bit
21
and v18.16b,v18.16b,v16.16b
22
shl v3.2d,v3.2d,#1
23
ext v18.16b,v18.16b,v18.16b,#8
24
and v16.16b,v16.16b,v17.16b
25
orr v3.16b,v3.16b,v18.16b //H<<<=1
26
eor v20.16b,v3.16b,v16.16b //twisted H
27
st1 {v20.2d},[x0],#16 //store Htable[0]
28
29
//calculate H^2
// v20 = twisted H. v16 becomes H with lo^hi in both lanes. Three
// carry-less multiplies (lo*lo, hi*hi, (lo^hi)*(lo^hi)) form the square.
30
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
31
pmull v0.1q,v20.1d,v20.1d
32
eor v16.16b,v16.16b,v20.16b
33
pmull2 v2.1q,v20.2d,v20.2d
34
pmull v1.1q,v16.1d,v16.1d
35
36
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
37
eor v18.16b,v0.16b,v2.16b
38
eor v1.16b,v1.16b,v17.16b
39
eor v1.16b,v1.16b,v18.16b
40
pmull v18.1q,v0.1d,v19.1d //1st phase
41
42
ins v2.d[0],v1.d[1]
43
ins v1.d[1],v0.d[0]
44
eor v0.16b,v1.16b,v18.16b
45
46
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
47
pmull v0.1q,v0.1d,v19.1d
48
eor v18.16b,v18.16b,v2.16b
49
// v22 = H^2 (reduced).
eor v22.16b,v0.16b,v18.16b
50
51
// v17 = H^2 with lo^hi in both lanes. v21 takes the upper half of v16
// and the lower half of v17, so v21.d[0] is the folded H and v21.d[1]
// is the folded H^2.
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
52
eor v17.16b,v17.16b,v22.16b
53
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
54
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
55
//calculate H^3 and H^4
// Two products are computed side by side: H * H^2 in v0/v1/v2 and
// H^2 * H^2 in v5/v6/v7. v16/v17 still hold the folded H and folded H^2.
56
pmull v0.1q,v20.1d, v22.1d
57
pmull v5.1q,v22.1d,v22.1d
58
pmull2 v2.1q,v20.2d, v22.2d
59
pmull2 v7.1q,v22.2d,v22.2d
60
pmull v1.1q,v16.1d,v17.1d
61
pmull v6.1q,v17.1d,v17.1d
62
63
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
64
ext v17.16b,v5.16b,v7.16b,#8
65
eor v18.16b,v0.16b,v2.16b
66
eor v1.16b,v1.16b,v16.16b
67
eor v4.16b,v5.16b,v7.16b
68
eor v6.16b,v6.16b,v17.16b
69
eor v1.16b,v1.16b,v18.16b
70
pmull v18.1q,v0.1d,v19.1d //1st phase
71
eor v6.16b,v6.16b,v4.16b
72
pmull v4.1q,v5.1d,v19.1d
73
74
ins v2.d[0],v1.d[1]
75
ins v7.d[0],v6.d[1]
76
ins v1.d[1],v0.d[0]
77
ins v6.d[1],v5.d[0]
78
eor v0.16b,v1.16b,v18.16b
79
eor v5.16b,v6.16b,v4.16b
80
81
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
82
ext v4.16b,v5.16b,v5.16b,#8
83
pmull v0.1q,v0.1d,v19.1d
84
pmull v5.1q,v5.1d,v19.1d
85
eor v18.16b,v18.16b,v2.16b
86
eor v4.16b,v4.16b,v7.16b
87
eor v23.16b, v0.16b,v18.16b //H^3
88
eor v25.16b,v5.16b,v4.16b //H^4
89
90
// Fold H^3 (v16), H^4 (v17) and H^2 (v18); the folded H^2 is needed as a
// multiplicand for the next pair of powers.
ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing
91
ext v17.16b,v25.16b,v25.16b,#8
92
ext v18.16b,v22.16b,v22.16b,#8
93
eor v16.16b,v16.16b,v23.16b
94
eor v17.16b,v17.16b,v25.16b
95
eor v18.16b,v18.16b,v22.16b
96
ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
97
st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
98
99
//calculate H^5 and H^6
// H^5 = H^2 * H^3 (v22 x v23, folded v18 x v16) and
// H^6 = H^3 * H^3 (v23 x v23, folded v16 x v16).
100
pmull v0.1q,v22.1d, v23.1d
101
pmull v5.1q,v23.1d,v23.1d
102
pmull2 v2.1q,v22.2d, v23.2d
103
pmull2 v7.1q,v23.2d,v23.2d
104
pmull v1.1q,v16.1d,v18.1d
105
pmull v6.1q,v16.1d,v16.1d
106
107
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
108
ext v17.16b,v5.16b,v7.16b,#8
109
eor v18.16b,v0.16b,v2.16b
110
eor v1.16b,v1.16b,v16.16b
111
eor v4.16b,v5.16b,v7.16b
112
eor v6.16b,v6.16b,v17.16b
113
eor v1.16b,v1.16b,v18.16b
114
pmull v18.1q,v0.1d,v19.1d //1st phase
115
eor v6.16b,v6.16b,v4.16b
116
pmull v4.1q,v5.1d,v19.1d
117
118
ins v2.d[0],v1.d[1]
119
ins v7.d[0],v6.d[1]
120
ins v1.d[1],v0.d[0]
121
ins v6.d[1],v5.d[0]
122
eor v0.16b,v1.16b,v18.16b
123
eor v5.16b,v6.16b,v4.16b
124
125
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
126
ext v4.16b,v5.16b,v5.16b,#8
127
pmull v0.1q,v0.1d,v19.1d
128
pmull v5.1q,v5.1d,v19.1d
129
eor v18.16b,v18.16b,v2.16b
130
eor v4.16b,v4.16b,v7.16b
131
eor v26.16b,v0.16b,v18.16b //H^5
132
eor v28.16b,v5.16b,v4.16b //H^6
133
134
// Fold H^5 (v16) and H^6 (v17); v18 is rebuilt as the folded H^2 because
// the reduction above reused it as a temporary.
ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing
135
ext v17.16b,v28.16b,v28.16b,#8
136
ext v18.16b,v22.16b,v22.16b,#8
137
eor v16.16b,v16.16b,v26.16b
138
eor v17.16b,v17.16b,v28.16b
139
eor v18.16b,v18.16b,v22.16b
140
ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
141
st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
142
143
//calculate H^7 and H^8
// H^7 = H^2 * H^5 (v22 x v26, folded v18 x v16) and
// H^8 = H^2 * H^6 (v22 x v28, folded v18 x v17).
144
pmull v0.1q,v22.1d,v26.1d
145
pmull v5.1q,v22.1d,v28.1d
146
pmull2 v2.1q,v22.2d,v26.2d
147
pmull2 v7.1q,v22.2d,v28.2d
148
pmull v1.1q,v16.1d,v18.1d
149
pmull v6.1q,v17.1d,v18.1d
150
151
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
152
ext v17.16b,v5.16b,v7.16b,#8
153
eor v18.16b,v0.16b,v2.16b
154
eor v1.16b,v1.16b,v16.16b
155
eor v4.16b,v5.16b,v7.16b
156
eor v6.16b,v6.16b,v17.16b
157
eor v1.16b,v1.16b,v18.16b
158
pmull v18.1q,v0.1d,v19.1d //1st phase
159
eor v6.16b,v6.16b,v4.16b
160
pmull v4.1q,v5.1d,v19.1d
161
162
ins v2.d[0],v1.d[1]
163
ins v7.d[0],v6.d[1]
164
ins v1.d[1],v0.d[0]
165
ins v6.d[1],v5.d[0]
166
eor v0.16b,v1.16b,v18.16b
167
eor v5.16b,v6.16b,v4.16b
168
169
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
170
ext v4.16b,v5.16b,v5.16b,#8
171
pmull v0.1q,v0.1d,v19.1d
172
pmull v5.1q,v5.1d,v19.1d
173
eor v18.16b,v18.16b,v2.16b
174
eor v4.16b,v4.16b,v7.16b
175
eor v29.16b,v0.16b,v18.16b //H^7
176
eor v31.16b,v5.16b,v4.16b //H^8
177
178
ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing
179
ext v17.16b,v31.16b,v31.16b,#8
180
eor v16.16b,v16.16b,v29.16b
181
eor v17.16b,v17.16b,v31.16b
182
ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
183
// Last store has no post-increment, so x0 is left pointing at Htable[9].
st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11]
184
ret
185
.size gcm_init_v8,.-gcm_init_v8
186
//----------------------------------------------------------------------
// gcm_gmult_v8: multiply the 16-byte value Xi by the twisted H and store
// the reduced product back over Xi.
//
// In:    x0 = Xi, 16 bytes; read at entry and overwritten at exit
//        x1 = Htable; only its first 32 bytes are read (twisted H in
//             v20 and the entry that follows it in v21, whose low lane
//             is used as the folded H)
// Out:   [x0] = updated Xi
// Clobb: v0-v3, v17-v21. x0 and x1 are not modified.
//        No stack is used and no calls are made.
// NOTE(review): presumably called from C as gcm_gmult_v8(Xi, Htable);
// the prototype is not visible in this file -- confirm.
//----------------------------------------------------------------------
.globl gcm_gmult_v8
187
.type gcm_gmult_v8,%function
188
.align 4
189
gcm_gmult_v8:
190
// Macro supplied by arm_arch.h (definition not visible here); presumably
// a landing pad for indirect calls -- confirm.
AARCH64_VALID_CALL_TARGET
191
ld1 {v17.2d},[x0] //load Xi
192
// v19 becomes 0xc200000000000000 in both lanes after the shl below; it
// is the multiplier for both reduction phases.
movi v19.16b,#0xe1
193
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
194
shl v19.2d,v19.2d,#57
195
// Byte-reverse each 64-bit lane unless building for big-endian.
#ifndef __AARCH64EB__
196
rev64 v17.16b,v17.16b
197
#endif
198
// v3 = Xi with its 64-bit halves swapped.
ext v3.16b,v17.16b,v17.16b,#8
199
200
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
201
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
202
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
203
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
204
205
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
206
eor v18.16b,v0.16b,v2.16b
207
eor v1.16b,v1.16b,v17.16b
208
eor v1.16b,v1.16b,v18.16b
209
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
210
211
ins v2.d[0],v1.d[1]
212
ins v1.d[1],v0.d[0]
213
eor v0.16b,v1.16b,v18.16b
214
215
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
216
pmull v0.1q,v0.1d,v19.1d
217
eor v18.16b,v18.16b,v2.16b
218
eor v0.16b,v0.16b,v18.16b
219
220
// Undo the load-time transformation (lane byte reversal, then half swap)
// before writing the result back.
#ifndef __AARCH64EB__
221
rev64 v0.16b,v0.16b
222
#endif
223
ext v0.16b,v0.16b,v0.16b,#8
224
st1 {v0.2d},[x0] //write out Xi
225
226
ret
227
.size gcm_gmult_v8,.-gcm_gmult_v8
228
//----------------------------------------------------------------------
// gcm_ghash_v8: fold a run of 16-byte input blocks into Xi.
//
// In:    x0 = Xi, 16 bytes; read at entry and overwritten at exit
//        x1 = Htable; twisted H, the packed entry and H^2 are read here
//        x2 = input pointer
//        x3 = input length in bytes
// Out:   [x0] = updated Xi
// Flow:  x3 >= 64 (unsigned) branches straight to .Lgcm_ghash_v8_4x.
//        Otherwise blocks are consumed two at a time in .Loop_mod2x_v8
//        (first of the pair times H^2, second times H, one reduction per
//        pair) and a final single block is handled by .Lodd_tail_v8.
// Clobb: on the path that stays in this routine: x1 (advanced by 32),
//        x2, x3, x12, condition flags, v0-v7, v16-v22.
//        No stack is used and no calls are made.
// NOTE(review): assumes x3 is a nonzero multiple of 16 -- confirm with
// callers. There is no check for x3 == 0: the subs below borrows, b.lo
// is taken, and .Lodd_tail_v8 then processes the 16 bytes already loaded
// from [x2].
//----------------------------------------------------------------------
.globl gcm_ghash_v8
229
.type gcm_ghash_v8,%function
230
.align 4
231
gcm_ghash_v8:
232
// Macro supplied by arm_arch.h (definition not visible here); presumably
// a landing pad for indirect calls -- confirm.
AARCH64_VALID_CALL_TARGET
233
// Four or more blocks: hand over to the 4-blocks-per-iteration code.
cmp x3,#64
234
b.hs .Lgcm_ghash_v8_4x
235
ld1 {v0.2d},[x0] //load [rotated] Xi
236
//"[rotated]" means that
237
//loaded value would have
238
//to be rotated in order to
239
//make it appear as in
240
//algorithm specification
241
subs x3,x3,#32 //see if x3 is 32 or larger
242
mov x12,#16 //x12 is used as post-
243
//increment for input pointer;
244
//as loop is modulo-scheduled
245
//x12 is zeroed just in time
246
//to preclude overstepping
247
//inp[len], which means that
248
//last block[s] are actually
249
//loaded twice, but last
250
//copy is not processed
251
// v20 = twisted H, v21 = packed folded halves (H in d[0], H^2 in d[1]),
// v22 = H^2.
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
252
movi v19.16b,#0xe1
253
ld1 {v22.2d},[x1]
254
// Flags here are still those of the subs above (the moves and loads in
// between do not set flags): eq means the length was exactly 32.
csel x12,xzr,x12,eq //is it time to zero x12?
255
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
256
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
257
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
258
#ifndef __AARCH64EB__
259
rev64 v16.16b,v16.16b
260
rev64 v0.16b,v0.16b
261
#endif
262
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
263
b.lo .Lodd_tail_v8 //x3 was less than 32
264
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
265
#ifndef __AARCH64EB__
266
rev64 v17.16b,v17.16b
267
#endif
268
// Loop prologue: start the H * I[1] product (v4 low, v6 high) and fold
// I[1] into v17 so the loop body can finish it.
ext v7.16b,v17.16b,v17.16b,#8
269
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
270
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
271
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
272
pmull2 v6.1q,v20.2d,v7.2d
273
b .Loop_mod2x_v8
274
275
.align 4
276
// Two blocks per iteration. On entry v3 = current block xor Xi,
// v4/v6 = low/high products of H with the following block, and
// v17 = that following block folded (lo^hi). x3 counts remaining bytes
// minus 32 and goes below zero on the last pass.
.Loop_mod2x_v8:
277
ext v18.16b,v3.16b,v3.16b,#8
278
subs x3,x3,#32 //is there more data?
279
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
280
csel x12,xzr,x12,lo //is it time to zero x12?
281
282
// Middle term of the H product: folded H (v21 low lane) times folded block.
pmull v5.1q,v21.1d,v17.1d
283
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
284
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
285
eor v0.16b,v0.16b,v4.16b //accumulate
286
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
287
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
288
289
eor v2.16b,v2.16b,v6.16b
290
csel x12,xzr,x12,eq //is it time to zero x12?
291
eor v1.16b,v1.16b,v5.16b
292
293
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
294
eor v18.16b,v0.16b,v2.16b
295
eor v1.16b,v1.16b,v17.16b
296
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
297
#ifndef __AARCH64EB__
298
rev64 v16.16b,v16.16b
299
#endif
300
eor v1.16b,v1.16b,v18.16b
301
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
302
303
#ifndef __AARCH64EB__
304
rev64 v17.16b,v17.16b
305
#endif
306
ins v2.d[0],v1.d[1]
307
ins v1.d[1],v0.d[0]
308
ext v7.16b,v17.16b,v17.16b,#8
309
ext v3.16b,v16.16b,v16.16b,#8
310
eor v0.16b,v1.16b,v18.16b
311
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
312
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
313
314
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
315
pmull v0.1q,v0.1d,v19.1d
316
eor v3.16b,v3.16b,v18.16b
317
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
318
eor v3.16b,v3.16b,v0.16b
319
pmull2 v6.1q,v20.2d,v7.2d
320
// The branch still tests the flags of the subs at the top of the loop;
// nothing executed since then sets flags.
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
321
322
// Loop exit: the iteration above already mixed the new Xi into v3 piece
// by piece, so rebuild the standalone Xi in v0 and the plain rotated
// block in v3 before deciding whether one block is left.
eor v2.16b,v2.16b,v18.16b
323
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
324
adds x3,x3,#32 //re-construct x3
325
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
326
b.eq .Ldone_v8 //is x3 zero?
327
// Single remaining block: v16 = block (lane bytes reversed on
// little-endian), v3 = the same block with halves swapped, v0 = Xi.
// Multiplies (block xor Xi) by H and reduces.
.Lodd_tail_v8:
328
ext v18.16b,v0.16b,v0.16b,#8
329
eor v3.16b,v3.16b,v0.16b //inp^=Xi
330
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
331
332
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
333
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
334
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
335
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
336
337
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
338
eor v18.16b,v0.16b,v2.16b
339
eor v1.16b,v1.16b,v17.16b
340
eor v1.16b,v1.16b,v18.16b
341
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
342
343
ins v2.d[0],v1.d[1]
344
ins v1.d[1],v0.d[0]
345
eor v0.16b,v1.16b,v18.16b
346
347
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
348
pmull v0.1q,v0.1d,v19.1d
349
eor v18.16b,v18.16b,v2.16b
350
eor v0.16b,v0.16b,v18.16b
351
352
// Common exit: undo the load-time transformation and store Xi.
.Ldone_v8:
353
#ifndef __AARCH64EB__
354
rev64 v0.16b,v0.16b
355
#endif
356
ext v0.16b,v0.16b,v0.16b,#8
357
st1 {v0.2d},[x0] //write out Xi
358
359
ret
360
.size gcm_ghash_v8,.-gcm_ghash_v8
361
//----------------------------------------------------------------------
// gcm_ghash_v8_4x: four-blocks-per-iteration variant of the hash loop.
//
// Reached through the local label .Lgcm_ghash_v8_4x, which gcm_ghash_v8
// branches to when x3 >= 64. No .globl directive for this symbol is
// visible in this file, and it has no AARCH64_VALID_CALL_TARGET line.
//
// In:    x0 = Xi, 16 bytes; read at entry and overwritten at exit
//        x1 = Htable; entries 0..5 are read (H, packed, H^2, H^3,
//             packed, H^4 land in v20, v21, v22, v26, v27, v28)
//        x2 = input pointer
//        x3 = input length in bytes, at least 64 on the visible path
// Out:   [x0] = updated Xi
// Flow:  .Loop4x handles each full group of 64 bytes while another full
//        group follows; .Ltail4x finishes the last full group; then
//        .Lthree / .Ltwo / .Lone handle 48 / 32 / 16 leftover bytes and
//        .Ldone4x performs the final reduction and store.
// Clobb: x1 (advanced by 48), x2, x3, condition flags, v0-v7, v16-v31.
//        No stack is used and no calls are made.
// NOTE(review): assumes x3 is a multiple of 16 -- confirm with callers.
//----------------------------------------------------------------------
.type gcm_ghash_v8_4x,%function
362
.align 4
363
gcm_ghash_v8_4x:
364
.Lgcm_ghash_v8_4x:
365
ld1 {v0.2d},[x0] //load [rotated] Xi
366
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
367
movi v19.16b,#0xe1
368
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
369
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
370
371
// First group of four blocks: v4 = I[0] .. v7 = I[3].
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
372
#ifndef __AARCH64EB__
373
rev64 v0.16b,v0.16b
374
rev64 v5.16b,v5.16b
375
rev64 v6.16b,v6.16b
376
rev64 v7.16b,v7.16b
377
rev64 v4.16b,v4.16b
378
#endif
379
// Half-swapped copies of I[3], I[2], I[1]. I[0] is left alone for now
// because it must be xored with Xi before it is multiplied.
ext v25.16b,v7.16b,v7.16b,#8
380
ext v24.16b,v6.16b,v6.16b,#8
381
ext v23.16b,v5.16b,v5.16b,#8
382
383
// Partial products are accumulated unreduced in v29 (low), v31 (high)
// and v30 (Karatsuba middle term).
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
384
eor v7.16b,v7.16b,v25.16b
385
pmull2 v31.1q,v20.2d,v25.2d
386
pmull v30.1q,v21.1d,v7.1d
387
388
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
389
eor v6.16b,v6.16b,v24.16b
390
pmull2 v24.1q,v22.2d,v24.2d
391
pmull2 v6.1q,v21.2d,v6.2d
392
393
eor v29.16b,v29.16b,v16.16b
394
eor v31.16b,v31.16b,v24.16b
395
eor v30.16b,v30.16b,v6.16b
396
397
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
398
eor v5.16b,v5.16b,v23.16b
399
pmull2 v23.1q,v26.2d,v23.2d
400
pmull v5.1q,v27.1d,v5.1d
401
402
eor v29.16b,v29.16b,v7.16b
403
eor v31.16b,v31.16b,v23.16b
404
eor v30.16b,v30.16b,v5.16b
405
406
// x3 = length - 128: the 64 bytes already loaded plus the 64 bytes the
// loop body would load. Below zero means no second full group exists.
subs x3,x3,#128
407
b.lo .Ltail4x
408
409
b .Loop4x
410
411
.align 4
412
// Per iteration: finish the pending group (H^4 times (I[0] xor Xi) plus
// the accumulated v29/v30/v31), reduce it into Xi, and in between start
// the H, H^2, H^3 partial products of the group just loaded.
.Loop4x:
413
eor v16.16b,v4.16b,v0.16b
414
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
415
ext v3.16b,v16.16b,v16.16b,#8
416
#ifndef __AARCH64EB__
417
rev64 v5.16b,v5.16b
418
rev64 v6.16b,v6.16b
419
rev64 v7.16b,v7.16b
420
rev64 v4.16b,v4.16b
421
#endif
422
423
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
424
eor v16.16b,v16.16b,v3.16b
425
pmull2 v2.1q,v28.2d,v3.2d
426
ext v25.16b,v7.16b,v7.16b,#8
427
pmull2 v1.1q,v27.2d,v16.2d
428
429
eor v0.16b,v0.16b,v29.16b
430
eor v2.16b,v2.16b,v31.16b
431
ext v24.16b,v6.16b,v6.16b,#8
432
eor v1.16b,v1.16b,v30.16b
433
ext v23.16b,v5.16b,v5.16b,#8
434
435
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
436
eor v18.16b,v0.16b,v2.16b
437
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
438
eor v7.16b,v7.16b,v25.16b
439
eor v1.16b,v1.16b,v17.16b
440
pmull2 v31.1q,v20.2d,v25.2d
441
eor v1.16b,v1.16b,v18.16b
442
pmull v30.1q,v21.1d,v7.1d
443
444
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
445
ins v2.d[0],v1.d[1]
446
ins v1.d[1],v0.d[0]
447
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
448
eor v6.16b,v6.16b,v24.16b
449
pmull2 v24.1q,v22.2d,v24.2d
450
eor v0.16b,v1.16b,v18.16b
451
pmull2 v6.1q,v21.2d,v6.2d
452
453
eor v29.16b,v29.16b,v16.16b
454
eor v31.16b,v31.16b,v24.16b
455
eor v30.16b,v30.16b,v6.16b
456
457
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
458
pmull v0.1q,v0.1d,v19.1d
459
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
460
eor v5.16b,v5.16b,v23.16b
461
eor v18.16b,v18.16b,v2.16b
462
pmull2 v23.1q,v26.2d,v23.2d
463
pmull v5.1q,v27.1d,v5.1d
464
465
eor v0.16b,v0.16b,v18.16b
466
eor v29.16b,v29.16b,v7.16b
467
eor v31.16b,v31.16b,v23.16b
468
// Swap the halves of the reduced Xi so it lines up with the next I[0],
// which has only been lane-byte-reversed at this point.
ext v0.16b,v0.16b,v0.16b,#8
469
eor v30.16b,v30.16b,v5.16b
470
471
// Another full group of 64 bytes still to load? If so, go round again.
subs x3,x3,#64
472
b.hs .Loop4x
473
474
// Last full group: same H^4 multiply and accumulation as the loop, but
// without loading more data and without reducing yet.
.Ltail4x:
475
eor v16.16b,v4.16b,v0.16b
476
ext v3.16b,v16.16b,v16.16b,#8
477
478
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
479
eor v16.16b,v16.16b,v3.16b
480
pmull2 v2.1q,v28.2d,v3.2d
481
pmull2 v1.1q,v27.2d,v16.2d
482
483
eor v0.16b,v0.16b,v29.16b
484
eor v2.16b,v2.16b,v31.16b
485
eor v1.16b,v1.16b,v30.16b
486
487
// x3 was negative here; adding 64 back gives the leftover byte count.
adds x3,x3,#64
488
b.eq .Ldone4x
489
490
// Leftover below 32 -> one block, equal to 32 -> two blocks, otherwise
// fall through to three blocks.
cmp x3,#32
491
b.lo .Lone
492
b.eq .Ltwo
493
// Three leftover blocks: reduce the pending product into Xi, then form
// H^3 * (I[0] xor Xi) + H^2 * I[1] + H * I[2], left unreduced in
// v0/v1/v2 for .Ldone4x.
.Lthree:
494
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
495
eor v18.16b,v0.16b,v2.16b
496
eor v1.16b,v1.16b,v17.16b
497
ld1 {v4.2d,v5.2d,v6.2d},[x2]
498
eor v1.16b,v1.16b,v18.16b
499
#ifndef __AARCH64EB__
500
rev64 v5.16b,v5.16b
501
rev64 v6.16b,v6.16b
502
rev64 v4.16b,v4.16b
503
#endif
504
505
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
506
ins v2.d[0],v1.d[1]
507
ins v1.d[1],v0.d[0]
508
ext v24.16b,v6.16b,v6.16b,#8
509
ext v23.16b,v5.16b,v5.16b,#8
510
eor v0.16b,v1.16b,v18.16b
511
512
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
513
eor v6.16b,v6.16b,v24.16b
514
515
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
516
pmull v0.1q,v0.1d,v19.1d
517
eor v18.16b,v18.16b,v2.16b
518
pmull2 v31.1q,v20.2d,v24.2d
519
pmull v30.1q,v21.1d,v6.1d
520
eor v0.16b,v0.16b,v18.16b
521
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
522
eor v5.16b,v5.16b,v23.16b
523
ext v0.16b,v0.16b,v0.16b,#8
524
525
pmull2 v23.1q,v22.2d,v23.2d
526
eor v16.16b,v4.16b,v0.16b
527
pmull2 v5.1q,v21.2d,v5.2d
528
ext v3.16b,v16.16b,v16.16b,#8
529
530
eor v29.16b,v29.16b,v7.16b
531
eor v31.16b,v31.16b,v23.16b
532
eor v30.16b,v30.16b,v5.16b
533
534
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
535
eor v16.16b,v16.16b,v3.16b
536
pmull2 v2.1q,v26.2d,v3.2d
537
pmull v1.1q,v27.1d,v16.1d
538
539
eor v0.16b,v0.16b,v29.16b
540
eor v2.16b,v2.16b,v31.16b
541
eor v1.16b,v1.16b,v30.16b
542
b .Ldone4x
543
544
.align 4
545
// Two leftover blocks: reduce the pending product into Xi, then form
// H^2 * (I[0] xor Xi) + H * I[1], left unreduced for .Ldone4x.
.Ltwo:
546
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
547
eor v18.16b,v0.16b,v2.16b
548
eor v1.16b,v1.16b,v17.16b
549
ld1 {v4.2d,v5.2d},[x2]
550
eor v1.16b,v1.16b,v18.16b
551
#ifndef __AARCH64EB__
552
rev64 v5.16b,v5.16b
553
rev64 v4.16b,v4.16b
554
#endif
555
556
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
557
ins v2.d[0],v1.d[1]
558
ins v1.d[1],v0.d[0]
559
ext v23.16b,v5.16b,v5.16b,#8
560
eor v0.16b,v1.16b,v18.16b
561
562
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
563
pmull v0.1q,v0.1d,v19.1d
564
eor v18.16b,v18.16b,v2.16b
565
eor v0.16b,v0.16b,v18.16b
566
ext v0.16b,v0.16b,v0.16b,#8
567
568
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
569
eor v5.16b,v5.16b,v23.16b
570
571
eor v16.16b,v4.16b,v0.16b
572
ext v3.16b,v16.16b,v16.16b,#8
573
574
pmull2 v31.1q,v20.2d,v23.2d
575
pmull v30.1q,v21.1d,v5.1d
576
577
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
578
eor v16.16b,v16.16b,v3.16b
579
pmull2 v2.1q,v22.2d,v3.2d
580
pmull2 v1.1q,v21.2d,v16.2d
581
582
eor v0.16b,v0.16b,v29.16b
583
eor v2.16b,v2.16b,v31.16b
584
eor v1.16b,v1.16b,v30.16b
585
b .Ldone4x
586
587
.align 4
588
// One leftover block: reduce the pending product into Xi, then form
// H * (I[0] xor Xi) and fall through to .Ldone4x with it unreduced.
.Lone:
589
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
590
eor v18.16b,v0.16b,v2.16b
591
eor v1.16b,v1.16b,v17.16b
592
ld1 {v4.2d},[x2]
593
eor v1.16b,v1.16b,v18.16b
594
#ifndef __AARCH64EB__
595
rev64 v4.16b,v4.16b
596
#endif
597
598
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
599
ins v2.d[0],v1.d[1]
600
ins v1.d[1],v0.d[0]
601
eor v0.16b,v1.16b,v18.16b
602
603
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
604
pmull v0.1q,v0.1d,v19.1d
605
eor v18.16b,v18.16b,v2.16b
606
eor v0.16b,v0.16b,v18.16b
607
ext v0.16b,v0.16b,v0.16b,#8
608
609
eor v16.16b,v4.16b,v0.16b
610
ext v3.16b,v16.16b,v16.16b,#8
611
612
pmull v0.1q,v20.1d,v3.1d
613
eor v16.16b,v16.16b,v3.16b
614
pmull2 v2.1q,v20.2d,v3.2d
615
pmull v1.1q,v21.1d,v16.1d
616
617
// Final reduction of the unreduced product in v0 (low), v1 (middle),
// v2 (high), followed by the store of Xi.
.Ldone4x:
618
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
619
eor v18.16b,v0.16b,v2.16b
620
eor v1.16b,v1.16b,v17.16b
621
eor v1.16b,v1.16b,v18.16b
622
623
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
624
ins v2.d[0],v1.d[1]
625
ins v1.d[1],v0.d[0]
626
eor v0.16b,v1.16b,v18.16b
627
628
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
629
pmull v0.1q,v0.1d,v19.1d
630
eor v18.16b,v18.16b,v2.16b
631
eor v0.16b,v0.16b,v18.16b
632
ext v0.16b,v0.16b,v0.16b,#8
633
634
#ifndef __AARCH64EB__
635
rev64 v0.16b,v0.16b
636
#endif
637
st1 {v0.2d},[x0] //write out Xi
638
639
ret
640
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
641
// Identification string placed in read-only data. The bytes are the
// ASCII text "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// followed by a terminating NUL. No label is attached to it, so nothing
// in this file refers to it by name.
.section .rodata
642
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
643
// Pad to a 4-byte (2^2) boundary. The directive appears twice; the
// second occurrence adds nothing once the first has aligned the section.
.align 2
644
.align 2
645
#endif
646
647