// Source: GitHub repository freebsd/freebsd-src, branch main,
// path sys/crypto/openssl/aarch64/sm3-armv8.S
/* Do not modify. This file is auto-generated from sm3-armv8.pl. */
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// This module implements support for Armv8 SM3 instructions

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"

.text

// ---------------------------------------------------------------------
// ossl_hwsm3_block_data_order
//
// SM3 compression of one or more 64-byte blocks using the Armv8 SM3
// instructions. The SM3 instructions are emitted as raw .inst words so
// that the file assembles with toolchains that lack the mnemonics.
//
// NOTE(review): the matching C prototype is not visible in this file;
// presumably (state *c, const void *p, size_t num) - confirm at caller.
//
// In:    x0 = pointer to the 8-word (32-byte) chaining state,
//             read on entry and written back on exit
//        x1 = pointer to the input; advanced by 64 bytes per block
//        w2 = number of 64-byte blocks
// NOTE(review): only the low 32 bits of the count are used, and the
// count is tested after the first block has been processed, so a count
// of zero does not return immediately. Callers must pass 1 <= num < 2^32.
//
// Register roles:
//        v5, v6    working state (four 32-bit words each, lane order
//                  reversed relative to memory)
//        v18, v19  copy of the state at the start of the current block
//        v0-v4     sliding window of message words, four words per
//                  register; the register roles rotate every four rounds
//        v16, v17  round constants loaded from .Tj (lane 0, rest zero)
//        v20, v21  current round constant in lane 3, rotated left by
//                  one bit after every round (ping-pong pair)
//        v22, v23  temporaries
// Clobbers: x1, w2, x8, v0-v6, v16-v23 (no callee-saved SIMD register
//        v8-v15 is touched). No stack is used.
// ---------------------------------------------------------------------

// One SM3 round.
//   tj      number of the vector holding the round constant (20 or 21)
//   tjnext  number of the vector that receives the constant rotated
//           left by one bit for the following round
//   w       number of the vector holding the four message words W[j..j+3]
//   lane    which of those four words this round consumes (0..3)
//   ttb     0 selects sm3tt1a/sm3tt2a (rounds 0-15),
//           1 selects sm3tt1b/sm3tt2b (rounds 16-63)
.macro	sm3_round	tj, tjnext, w, lane, ttb
	// sm3ss1 v23.4s, v5.4s, v<tj>.4s, v6.4s
	.inst	0xce4018b7 | (\tj << 16)
	// v<tjnext> = v<tj> rotated left by 1 in every 32-bit lane
	shl	v\tjnext\().4s, v\tj\().4s, #1
	sri	v\tjnext\().4s, v\tj\().4s, #31
	// sm3tt1{a,b} v5.4s, v23.4s, v22.4s[lane]
	.inst	0xce5682e5 | (\ttb << 10) | (\lane << 12)
	// sm3tt2{a,b} v6.4s, v23.4s, v<w>.4s[lane]
	.inst	0xce408ae6 | (\w << 16) | (\ttb << 10) | (\lane << 12)
.endm

// Four consecutive rounds consuming the four words held in v<cur>.
// v22 = v<cur> ^ v<nx1>, i.e. W[j] ^ W[j+4] for each of the four words,
// is the operand of the sm3tt1 step. The constant starts and ends in v20.
.macro	sm3_qround	cur, nx1, ttb
	eor	v22.16b, v\cur\().16b, v\nx1\().16b
	sm3_round	20, 21, \cur, 0, \ttb
	sm3_round	21, 20, \cur, 1, \ttb
	sm3_round	20, 21, \cur, 2, \ttb
	sm3_round	21, 20, \cur, 3, \ttb
.endm

// Message expansion followed by four rounds.
// With v<cur>, v<nx1>, v<nx2>, v<nx3> holding sixteen consecutive message
// words W[j..j+15], compute the next four words W[j+16..j+19] into v<dst>
// and then run four rounds on v<cur>.
.macro	sm3_qround_expand	cur, nx1, nx2, nx3, dst, ttb
	// v<dst> = W[j+7]  | W[j+8]  | W[j+9]  | W[j+10]
	ext	v\dst\().16b, v\nx1\().16b, v\nx2\().16b, #12
	// v22    = W[j+3]  | W[j+4]  | W[j+5]  | W[j+6]
	ext	v22.16b, v\cur\().16b, v\nx1\().16b, #12
	// v23    = W[j+10] | W[j+11] | W[j+12] | W[j+13]
	ext	v23.16b, v\nx2\().16b, v\nx3\().16b, #8
	// sm3partw1 v<dst>.4s, v<cur>.4s, v<nx3>.4s
	.inst	0xce60c000 | (\nx3 << 16) | (\cur << 5) | (\dst)
	// sm3partw2 v<dst>.4s, v23.4s, v22.4s
	.inst	0xce76c6e0 | (\dst)
	sm3_qround	\cur, \nx1, \ttb
.endm

.globl	ossl_hwsm3_block_data_order
.type	ossl_hwsm3_block_data_order,%function
.align	5
ossl_hwsm3_block_data_order:
	AARCH64_VALID_CALL_TARGET
	// Load the state and reverse the order of the four 32-bit lanes in
	// each vector (rev64 swaps the words inside each 64-bit half, ext #8
	// swaps the two halves).
	ld1	{v5.4s,v6.4s}, [x0]
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	// s16 = first round constant, s17 = second; the scalar loads zero
	// the remaining lanes of v16 and v17.
	adrp	x8, .Tj
	add	x8, x8, #:lo12:.Tj
	ldp	s16, s17, [x8]

.Loop:
	// Load one 64-byte block (W[0..15]) and advance the input pointer.
	ld1	{v0.4s,v1.4s,v2.4s,v3.4s}, [x1], #64
	sub	w2, w2, #1

	// Keep the incoming state for the XOR at the end of the block.
	mov	v18.16b, v5.16b
	mov	v19.16b, v6.16b

#ifndef __AARCH64EB__
	// Message words are big-endian: byte-swap each 32-bit word on
	// little-endian builds.
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rev32	v2.16b, v2.16b
	rev32	v3.16b, v3.16b
#endif

	// Rounds 0-15: move the first constant from lane 0 to lane 3.
	ext	v20.16b, v16.16b, v16.16b, #4
	sm3_qround_expand	0, 1, 2, 3, 4, 0	// rounds  0- 3
	sm3_qround_expand	1, 2, 3, 4, 0, 0	// rounds  4- 7
	sm3_qround_expand	2, 3, 4, 0, 1, 0	// rounds  8-11
	sm3_qround_expand	3, 4, 0, 1, 2, 0	// rounds 12-15

	// Rounds 16-63: restart the rotating constant from the second value.
	ext	v20.16b, v17.16b, v17.16b, #4
	sm3_qround_expand	4, 0, 1, 2, 3, 1	// rounds 16-19
	sm3_qround_expand	0, 1, 2, 3, 4, 1	// rounds 20-23
	sm3_qround_expand	1, 2, 3, 4, 0, 1	// rounds 24-27
	sm3_qround_expand	2, 3, 4, 0, 1, 1	// rounds 28-31
	sm3_qround_expand	3, 4, 0, 1, 2, 1	// rounds 32-35
	sm3_qround_expand	4, 0, 1, 2, 3, 1	// rounds 36-39
	sm3_qround_expand	0, 1, 2, 3, 4, 1	// rounds 40-43
	sm3_qround_expand	1, 2, 3, 4, 0, 1	// rounds 44-47
	sm3_qround_expand	2, 3, 4, 0, 1, 1	// rounds 48-51
	// The last twelve rounds need no further message expansion.
	sm3_qround	3, 4, 1				// rounds 52-55
	sm3_qround	4, 0, 1				// rounds 56-59
	sm3_qround	0, 1, 1				// rounds 60-63

	// New state = round output XOR state at the start of the block.
	eor	v5.16b, v5.16b, v18.16b
	eor	v6.16b, v6.16b, v19.16b

	// any remaining blocks?
	cbnz	w2, .Loop

	// Undo the lane reversal and store the state back.
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	st1	{v5.4s,v6.4s}, [x0]
	ret
.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order

.section	.rodata

// SM3 round constants, read as a pair by ossl_hwsm3_block_data_order
// (ldp s16, s17 from .Tj). The code rotates the active constant left by
// one bit after every round, so only the starting values are stored.
.type	_sm3_consts,%object
.align	3				// 2^3 = 8-byte alignment for the 8-byte pair load
_sm3_consts:
.Tj:
	.word	0x79cc4519		// starting constant for rounds 0-15
	.word	0x9d8a7a87		// starting constant for rounds 16-63
					// (0x7a879d8a rotated left by 16)
.size	_sm3_consts,.-_sm3_consts
.previous
