/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <[email protected]>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash values in memory */
#define rKP	r24	/* pointer to round constants */
#define rWP	r4	/* pointer to input data */

#define rH0	r5	/* eight 32-bit hash values in 8 registers */
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64-bit registers: 16 message words in 8 registers */
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64-bit temporaries */
#define rT1	r23
#define rT2	r0	/* 32-bit temporaries */
#define rT3	r25

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;

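/*
 * CMP_KN_LOOP expands to nothing. CMP_KC_LOOP compares the round
 * constant pair most recently loaded into rT1 against zero; the low
 * word of the final pair in the K table (K[63] = 0xc67178f2) is
 * negative as a signed value, which is how the 16-round loop below
 * detects the end of the table.
 */
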
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame */	\
	evstdw		r14,8(r1);	/* We must save non volatile */	\
	evstdw		r15,16(r1);	/* registers. Take the chance */ \
	evstdw		r16,24(r1);	/* and save the SPE part too */	\
	evstdw		r17,32(r1);					\
	evstdw		r18,40(r1);					\
	evstdw		r19,48(r1);					\
	evstdw		r20,56(r1);					\
	evstdw		r21,64(r1);					\
	evstdw		r22,72(r1);					\
	evstdw		r23,80(r1);					\
	stw		r24,88(r1);	/* save normal registers */	\
	stw		r25,92(r1);

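/*
 * evstdw/evldw save and restore the full 64-bit SPE registers (upper
 * and lower word); plain stw/lwz suffice for r24/r25, which only
 * ever hold 32-bit values here.
 */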

#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers */	\
	evldw		r15,16(r1);					\
	evldw		r16,24(r1);					\
	evldw		r17,32(r1);					\
	evldw		r18,40(r1);					\
	evldw		r19,48(r1);					\
	evldw		r20,56(r1);					\
	evldw		r21,64(r1);					\
	evldw		r22,72(r1);					\
	evldw		r23,80(r1);					\
	lwz		r24,88(r1);	/* restore normal registers */	\
	lwz		r25,92(r1);					\
	xor		r0,r0,r0;					\
	stw		r0,8(r1);	/* Delete sensitive data */	\
	stw		r0,16(r1);	/* that we might have pushed */	\
	stw		r0,24(r1);	/* from other context that runs */ \
	stw		r0,32(r1);	/* the same code. Assume that */ \
	stw		r0,40(r1);	/* the lower part of the GPRs */ \
	stw		r0,48(r1);	/* was already overwritten on */ \
	stw		r0,56(r1);	/* the way down to here */	\
	stw		r0,64(r1);					\
	stw		r0,72(r1);					\
	stw		r0,80(r1);					\
	addi		r1,r1,128;	/* cleanup stack frame */

#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data */
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block */
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data */			\
	addi		rWP,rWP,4;	/* increment per word */
#define NEXT_BLOCK	/* nothing to do */
#endif
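
/*
 * SHA-256 interprets the input as big-endian 32-bit words. On big
 * endian kernels a plain lwz suffices and rWP advances once per
 * block; on little endian kernels lwbrx byte-reverses each word and
 * rWP advances word by word, so NEXT_BLOCK has nothing left to do.
 */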

#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W */			\
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6 */		\
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11 */	\
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25 */	\
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1' */	\
	and		rT3,e,f;	/* 1: ch = e and f */		\
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1" */	\
	andc		rT1,g,e;	/* 1: ch' = ~e and g */		\
	lwz		rT2,off(rKP);	/* 1: K */			\
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch' */	\
	add		h,h,rT0;	/* 1: temp1 = h + S1 */		\
	add		rT3,rT3,w;	/* 1: temp1' = ch + w */	\
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2 */		\
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1' */	\
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13 */	\
	add		h,h,rT2;	/* 1: temp1 = temp1 + K */	\
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22 */	\
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0' */	\
	add		d,d,h;		/* 1: d = d + temp1 */		\
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0" */	\
	evmergelo	w,w,w;		/* shift W */			\
	or		rT2,a,b;	/* 1: maj = a or b */		\
	and		rT1,a,b;	/* 1: maj' = a and b */		\
	and		rT2,rT2,c;	/* 1: maj = maj and c */	\
	LOAD_DATA(w, off+4)		/* 2: W */			\
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj' */	\
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6 */		\
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj */	\
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11 */	\
	add		h,h,rT3;	/* 1: h = temp1 + temp2 */	\
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25 */	\
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1' */	\
	and		rT3,d,e;	/* 2: ch = e and f */		\
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1" */	\
	andc		rT1,f,d;	/* 2: ch' = ~e and g */		\
	lwz		rT2,off+4(rKP);	/* 2: K */			\
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch' */	\
	add		g,g,rT0;	/* 2: temp1 = h + S1 */		\
	add		rT3,rT3,w;	/* 2: temp1' = ch + w */	\
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2 */		\
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1' */	\
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13 */	\
	add		g,g,rT2;	/* 2: temp1 = temp1 + K */	\
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22 */	\
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0' */	\
	or		rT2,h,a;	/* 2: maj = a or b */		\
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0" */	\
	and		rT1,h,a;	/* 2: maj' = a and b */		\
	and		rT2,rT2,b;	/* 2: maj = maj and c */	\
	add		c,c,g;		/* 2: d = d + temp1 */		\
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj' */	\
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj */	\
	add		g,g,rT3		/* 2: h = temp1 + temp2 */

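/*
 * Each R_LOAD_W invocation performs two of the first 16 rounds on
 * freshly loaded data words:
 *
 *	temp1 = h + S1(e) + ch(e,f,g) + K[t] + W[t]
 *	temp2 = S0(a) + maj(a,b,c)
 *	d += temp1; h = temp1 + temp2
 *
 * with S1(x) = (x rotr 6) ^ (x rotr 11) ^ (x rotr 25) and
 * S0(x) = (x rotr 2) ^ (x rotr 13) ^ (x rotr 22). The callers below
 * rotate the roles of a..h by two positions per invocation instead
 * of moving values between registers.
 */
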
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6 */		\
	evmergelohi	rT0,w0,w1;	/* w[-15] */			\
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11 */	\
	evsrwiu		rT1,rT0,3;	/* s0 = w[-15] >> 3 */		\
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1' */	\
	evrlwi		rT0,rT0,25;	/* s0' = w[-15] rotr 7 */	\
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25 */	\
	evxor		rT1,rT1,rT0;	/* s0 = s0 xor s0' */		\
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1" */	\
	evrlwi		rT0,rT0,21;	/* s0' = w[-15] rotr 18 */	\
	add		h,h,rT2;	/* 1: temp1 = h + S1 */		\
	evxor		rT0,rT0,rT1;	/* s0 = s0 xor s0' */		\
	and		rT2,e,f;	/* 1: ch = e and f */		\
	evaddw		w0,w0,rT0;	/* w = w[-16] + s0 */		\
	andc		rT3,g,e;	/* 1: ch' = ~e and g */		\
	evsrwiu		rT0,w7,10;	/* s1 = w[-2] >> 10 */		\
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch' */	\
	evrlwi		rT1,w7,15;	/* s1' = w[-2] rotr 17 */	\
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch */	\
	evxor		rT0,rT0,rT1;	/* s1 = s1 xor s1' */		\
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2 */		\
	evrlwi		rT1,w7,13;	/* s1' = w[-2] rotr 19 */	\
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13 */	\
	evxor		rT0,rT0,rT1;	/* s1 = s1 xor s1' */		\
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0' */	\
	evldw		rT1,off(rKP);	/* k */				\
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22 */	\
	evaddw		w0,w0,rT0;	/* w = w + s1 */		\
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0" */	\
	evmergelohi	rT0,w4,w5;	/* w[-7] */			\
	and		rT3,a,b;	/* 1: maj = a and b */		\
	evaddw		w0,w0,rT0;	/* w = w + w[-7] */		\
	CMP_K##k##_LOOP							\
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj */	\
	evaddw		rT1,rT1,w0;	/* wk = w + k */		\
	xor		rT3,a,b;	/* 1: maj = a xor b */		\
	evmergehi	rT0,rT1,rT1;	/* wk1/wk2 */			\
	and		rT3,rT3,c;	/* 1: maj = maj and c */	\
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk */	\
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj */	\
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk */	\
	add		d,d,h;		/* 1: d = d + temp1 */		\
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6 */		\
	add		h,h,rT2;	/* 1: h = temp1 + temp2 */	\
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11 */	\
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25 */	\
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1' */	\
	and		rT3,d,e;	/* 2: ch = e and f */		\
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1" */	\
	andc		rT1,f,d;	/* 2: ch' = ~e and g */		\
	add		g,g,rT0;	/* 2: temp1 = h + S1 */		\
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch' */	\
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2 */		\
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch */	\
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13 */	\
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22 */	\
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0' */	\
	or		rT2,h,a;	/* 2: maj = a or b */		\
	and		rT1,h,a;	/* 2: maj' = a and b */		\
	and		rT2,rT2,b;	/* 2: maj = maj and c */	\
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0" */	\
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj' */	\
	add		c,c,g;		/* 2: d = d + temp1 */		\
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj */	\
	add		g,g,rT3		/* 2: h = temp1 + temp2 */
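
/*
 * Each R_CALC_W invocation extends the message schedule by two words
 * and performs the next two rounds (rounds 16 to 63):
 *
 *	s0 = (w[-15] rotr 7) ^ (w[-15] rotr 18) ^ (w[-15] >> 3)
 *	s1 = (w[-2] rotr 17) ^ (w[-2] rotr 19) ^ (w[-2] >> 10)
 *	w  = w[-16] + s0 + w[-7] + s1
 *
 * The evrlwi amounts are rotate-left counts: rotl 25 = rotr 7,
 * rotl 15 = rotr 17 and rotl 13 = rotr 19. The second rotation of
 * w[-15] applies rotl 21 on top of the earlier rotl 25, and
 * 25 + 21 = 46 = 14 (mod 32), i.e. rotr 18. Each SPE instruction
 * processes both words of a 64-bit register pair at once.
 */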
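
/*
 * ppc_spe_sha256_transform: r3 (rHP) points to the eight 32-bit hash
 * words, r4 (rWP) to the input data and r5 holds the number of
 * 64-byte blocks to process; r5 is moved into CTR to drive the bdnz
 * loop at the bottom.
 */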
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5		/* CTR = number of blocks */
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds
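
/*
 * The 16-round loop runs three times, covering rounds 16 to 63. Only
 * the last R_CALC_W of a pass uses CMP_KC_LOOP; "bt gt" loops while
 * the tested constant is positive and falls through once the negative
 * K[63] (0xc67178f2) has been consumed.
 */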

	lwz		rW0,0(rHP)	/* reload previous hash values */
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0	/* add intermediate hash (feed-forward) */
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr
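
/*
 * SHA-256 round constants K[0..63]: the first 32 bits of the
 * fractional parts of the cube roots of the first 64 primes
 * (FIPS 180-4, section 4.2.2).
 */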
.data
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2