Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/powerpc/sha1-spe-asm.S
26292 views
1
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <[email protected]>
 */
12
13
#include <asm/ppc_asm.h>
14
#include <asm/asm-offsets.h>
15
16
/*
 * Register aliases used by the macros and the transform routine below.
 *
 * r14-r23 are non-volatile: INITIALIZE spills them and FINALIZE reloads
 * them. They are the registers that carry 64 bit SPE values here (the
 * round word pairs, the 64 bit temporary and the splatted constant).
 * The remaining aliases are only used as 32 bit values or pointers,
 * except rT1, whose upper half is written by evmergehi in the rounds.
 */
#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit constant in non-volatile register	*/
41
42
/*
 * LOAD_K<k>1 - optional reload of the round constant register rK.
 *
 * The round macros paste their "k" argument into LOAD_K##k##1:
 *   k = 0     expands to nothing (rK keeps its current value)
 *   k = 1..4  load the 32 bit word at offset 0/4/8/12 from rKP with
 *             evlwwsplat, i.e. the same word ends up in both halves
 *             of rK so one evaddw adds it to a pair of round words.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
55
56
/*
 * INITIALIZE - open a 128 byte stack frame and spill r14-r23 into it.
 *
 * evstdw stores the full 64 bit register, so the SPE upper halves are
 * saved together with the ordinary 32 bit GPR contents. One 8 byte slot
 * per register is used, at offsets 8, 16, ... 80 from the new r1.
 * Must be paired with FINALIZE, which relies on the same slot layout.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);
68
69
70
/*
 * FINALIZE - reload r14-r23 from the frame built by INITIALIZE, wipe the
 * spill slots and release the 128 byte frame.
 *
 * Only the first 32 bit word of every 8 byte slot is cleared. evstdw
 * places the upper (SPE) half of the register in that word; the word
 * holding the lower GPR half is left alone on the assumption stated
 * below that it was already overwritten.
 * Clobbers r0 (set to zero).
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	li		r0,0;		/* zero value used for wiping	*/ \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/
93
94
/*
 * LOAD_DATA(reg, off) - fetch one 32 bit message word into reg.
 * NEXT_BLOCK          - advance rWP to the next 64 byte block.
 *
 * Big endian:    words are read with lwz at the fixed displacement
 *                "off" from rWP; rWP itself moves only in NEXT_BLOCK.
 * Otherwise:     words are read byte-reversed with lwbrx, which has no
 *                displacement form, so rWP is bumped by 4 after every
 *                word ("off" is ignored) and NEXT_BLOCK has nothing
 *                left to do.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif
105
106
/*
 * R_00_15 - two interleaved SHA-1 rounds that read their message words
 * directly from the input, with F = (B and C) or (~B and D).
 *
 *   a..e   working variables; roles A,B,C,D,E for round "1". Round "2"
 *          sees them as (e,a,b,c,d), so the result of the pair lands in
 *          e and d while a and b are rotated by 30. The caller therefore
 *          shifts the argument list by two positions per invocation.
 *   w0     scratch register receiving the first message word
 *   w1     receives the second word; evmergelo then packs the pair into
 *          w1 (upper half = second word, lower half = first word)
 *   k      selects LOAD_K##k##1 (0 = keep rK)
 *   off    displacement handed to LOAD_DATA (off and off+4)
 *
 * Clobbers rT0, rT1, rT2. The comments tag each instruction with the
 * round (1: or 2:) it belongs to.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	LOAD_K##k##1 \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/* mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/
129
130
/*
 * R_16_19 - two interleaved SHA-1 rounds with F = (B and C) or (~B and D)
 * that also compute their two message schedule words with SPE vector
 * instructions: W = (W[-16] xor W[-3] xor W[-8] xor W[-14]) rotl 1.
 *
 *   a..e    working variables, same role convention as R_00_15
 *   w0      in: W[-16] pair, out: the two new schedule words
 *   w1      W[-14] pair
 *   w4      W[-8] pair
 *   w6, w7  neighbouring pairs from which evmergelohi builds W[-3]
 *   k       selects LOAD_K##k##1 (0 = keep rK)
 *
 * W + K is formed with the current rK before LOAD_K##k##1 runs, so a
 * reload only affects the following invocation. The lower half of the
 * sum feeds round 1; evmergehi moves the upper half into the lower half
 * of rT1 for round 2.
 * Clobbers rT0, rT1, rT2.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/
155
156
/*
 * R_20_39 - two interleaved SHA-1 rounds with the parity function
 * F = B xor C xor D, including the vectorised message schedule
 * W = (W[-16] xor W[-3] xor W[-8] xor W[-14]) rotl 1.
 *
 *   a..e    working variables; roles A,B,C,D,E in round 1 and
 *           (e,a,b,c,d) in round 2
 *   w0      in: W[-16] pair, out: the two new schedule words
 *   w1      W[-14] pair
 *   w4      W[-8] pair
 *   w6, w7  neighbouring pairs from which evmergelohi builds W[-3]
 *   k       selects LOAD_K##k##1 (0 = keep rK); the reload is placed
 *           after W + K has been formed with the current constant
 *
 * Clobbers rT0, rT1, rT2.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
179
180
/*
 * R_40_59 - two interleaved SHA-1 rounds with the majority function,
 * computed here as F = (B and C) or (D and (B or C)), including the
 * vectorised message schedule
 * W = (W[-16] xor W[-3] xor W[-8] xor W[-14]) rotl 1.
 *
 *   a..e    working variables; roles A,B,C,D,E in round 1 and
 *           (e,a,b,c,d) in round 2
 *   w0      in: W[-16] pair, out: the two new schedule words
 *   w1      W[-14] pair
 *   w4      W[-8] pair
 *   w6, w7  neighbouring pairs from which evmergelohi builds W[-3]
 *   k       selects LOAD_K##k##1 (0 = keep rK); the reload is placed
 *           after W + K has been formed with the current constant
 *
 * Round 2 reuses rT0 as a plain 32 bit scratch once the lower half of
 * WK has been added to e; rT1 keeps the round 2 WK until it is consumed.
 * Clobbers rT0, rT1, rT2.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1 \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
207
208
/*
 * R_60_79 - rounds 60..79 are a straight alias of R_20_39: the same
 * two-round body with F = B xor C xor D and the same parameters. The
 * round constant differs only through what the caller loaded into rK.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210
211
/*
 * ppc_spe_sha1_transform - run the SHA-1 compression function over a
 * number of consecutive 64 byte input blocks.
 *
 * In:   r3 (rHP) = pointer to the five 32 bit chaining words, read from
 *                  and written back to offsets 0, 4, 8, 12, 16
 *       r4 (rWP) = pointer to the input data; advanced by 64 per block
 *       r5       = number of blocks; copied to CTR, after which r5 is
 *                  reused as rKP (address of PPC_SPE_SHA1_K)
 * Out:  chaining words at rHP updated in place after every block
 *
 * A block count of zero returns at once without touching memory.
 * Without that check the loop body would run once and bdnz would then
 * wrap CTR around, processing 2^32 - 1 further blocks.
 *
 * Clobbers: r0, r4-r12, CTR, cr0. r14-r23 (all 64 bits) are preserved
 * through INITIALIZE/FINALIZE.
 * NOTE(review): the caller is presumably responsible for making the SPE
 * unit usable before the call - nothing here enables it.
 */
_GLOBAL(ppc_spe_sha1_transform)
	cmpwi		r5,0			/* nothing to hash?	*/
	beqlr					/* yes: plain return	*/

	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5			/* CTR = block count	*/
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h	/* r5 is free again	*/
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/*
	 * Rounds 0..15: load the block into the pairs rW0..rW7. The k = 1
	 * in the first invocation (re)loads the first constant for every
	 * block, because rK still holds the last constant on loop re-entry.
	 */
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* Rounds 16..19; k = 2 switches rK to the second constant */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* Rounds 20..39; k = 3 switches rK to the third constant */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* Rounds 40..59; k = 4 switches rK to the fourth constant */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/*
	 * Rounds 60..79. The previous chaining words are reloaded in
	 * between the last invocations; each load targets a register only
	 * after the round macro that last reads it, so the order of these
	 * lines must not be changed.
	 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* Add the block result to the previous chaining value and store */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* next block, if any	*/

	FINALIZE
	blr
290
291
/*
 * The four SHA-1 round constants, in the order in which LOAD_K11,
 * LOAD_K21, LOAD_K31 and LOAD_K41 fetch them (offsets 0, 4, 8, 12).
 * The table is only ever read, so it is placed in .rodata rather than
 * in writable .data.
 */
	.section .rodata
	.align 4
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
295
296