Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/crypto/blowfish-x86_64-asm_64.S
26451 views
1
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
 */
7
8
#include <linux/linkage.h>
9
10
.file "blowfish-x86_64-asm.S"
11
.text
12
13
/*
 * Structure of crypto context: byte offsets relative to CTX.
 *
 * p starts at offset 0.  s0..s3 begin after (16 + 2) 4-byte entries and
 * are spaced 256 4-byte entries apart; the F()/F4() macros index them with
 * a scale factor of 4.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
19
20
/*
 * Register macros.
 *
 *	CTX		base address of the crypto context
 *	RIO		pointer used by the block load/store macros
 *	RX0..RX3	block state, one register per block
 *	RXnd		32-bit view of RXn
 *	RXnbl / RXnbh	bits 0..7 / bits 8..15 of RXn
 *	RT0..RT3	temporaries (RTnd is the 32-bit view)
 *	RKEY		round key preloaded by the 4-way macros
 *
 * NOTE: RIO and RT1 are both %rsi.  Any macro that writes RT1 (F(), F4())
 * destroys RIO, so RIO has to be reloaded before it is used again.
 */
#define CTX %r12
#define RIO %rsi

#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

#define RKEY %r10
55
56
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/
59
/*
 * F(): one Feistel step on RX0.
 *
 * With b3..b0 being the bytes (most to least significant) of the low
 * 32 bits of RX0 on entry, this computes
 *
 *	RT0d = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]
 *
 * then swaps the two 32-bit halves of RX0 (net rotation by 32) and XORs
 * RT0 into it.  The 32-bit table operations leave the upper half of RT0
 * zero, so the final 64-bit xorq only modifies the new low half of RX0.
 *
 * Clobbers RT0, RT1 and RT2.  RT1 is %rsi, i.e. RIO is destroyed.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
72
73
/*
 * XOR the 8 bytes at byte offset p + 4*(n) of the context into RX0.
 * A single 64-bit xorq covers two consecutive 4-byte entries (n and n + 1).
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;
75
76
/*
 * Key addition for entries n and n + 1, followed by two F() steps.
 * Clobbers whatever F() clobbers (RT0, RT1/RIO, RT2).
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();
81
82
/*
 * Load the 8 bytes at byte offset p + 4*((n)-1) of the context, swap their
 * 32-bit halves and XOR the result into RX0.  Clobbers RT0.
 *
 * The argument is parenthesized so that an expression argument cannot
 * change the offset computation.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;
86
87
/*
 * Key addition via add_roundkey_dec(n), followed by two F() steps.
 * Clobbers whatever F() clobbers (RT0, RT1/RIO, RT2).
 *
 * The last line deliberately carries no continuation backslash, so the
 * macro ends here regardless of what follows it in the file.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
92
93
/*
 * Load 8 bytes from (RIO) into RX0.  The rotate by 32 followed by the
 * 64-bit byte swap byte-reverses each 32-bit half while leaving both
 * halves in their original positions within the register.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;
97
98
/*
 * Byte-reverse all 8 bytes of RX0 and store the result to (RIO).
 * RIO must have been reloaded beforehand if F() ran since it was set.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);
101
102
SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Reads one 8-byte block from src, applies round_enc(0..14) plus the
	 * final key addition, and writes 8 bytes to dst.
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 and dst is
	 * parked in %r10 for the duration of the rounds.
	 */
	movq %r12, %r11;		/* CTX lives in %r12; keep caller's value */

	movq %rdi, CTX;
	movq %rsi, %r10;		/* F() clobbers %rsi (RT1): stash dst */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %r12;		/* CTX no longer needed: restore %r12 */
	movq %r10, RIO;			/* RIO = dst */

	write_block();
	RET;
SYM_FUNC_END(blowfish_enc_blk)
132
133
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Reads one 8-byte block from src, applies round_dec(17..3) plus the
	 * final key addition, and writes 8 bytes to dst.
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 and dst is
	 * parked in %r10 for the duration of the rounds.
	 */
	movq %r12, %r11;		/* CTX lives in %r12; keep caller's value */

	movq %rdi, CTX;
	movq %rsi, %r10;		/* F() clobbers %rsi (RT1): stash dst */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;			/* RIO = dst */
	write_block();

	movq %r11, %r12;		/* restore caller's %r12 */

	RET;
SYM_FUNC_END(blowfish_dec_blk)
164
165
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/
168
169
/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: the byte views are formed by pasting "bh" /
 * "bl" onto the macro name, which then expands to the byte register.
 *
 * With b3..b0 being the bytes (most to least significant) of the low
 * 32 bits of x on entry, this computes
 *
 *	RT0d = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]
 *
 * The two 16-bit rotates swap the 32-bit halves of x, and RT0 (upper half
 * zero after the 32-bit operations) is XORed into the result.
 *
 * Clobbers RT0, RT1, RT2 and RT3.  RT1 is %rsi, i.e. RIO is destroyed.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
184
185
/* XOR the 64-bit value currently held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;
190
191
/* Load the 8 bytes at byte offset p + 4*(n) of the context into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;
193
194
/*
 * Apply the key already sitting in RKEY to all four blocks, then preload
 * the key for entry (n) + 2 so it is ready for the next key addition.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);
197
198
/*
 * Key addition on all four blocks (preloading the next key), followed by
 * two F4() steps per block.  The F4() steps are issued block by block
 * (RX0, RX1, RX2, RX3) and then once more in the same order.
 *
 * Clobbers RT0..RT3 (RT1 is RIO) and leaves the next key in RKEY.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
210
211
/*
 * Load the 8 bytes at byte offset p + 4*((n)-1) of the context into RKEY
 * and swap their 32-bit halves.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;
214
215
/*
 * Apply the key already sitting in RKEY to all four blocks, then preload
 * the (half-swapped) key for (n) - 2 so it is ready for the next key
 * addition.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);
218
219
/*
 * Key addition on all four blocks (preloading the next key), followed by
 * two F4() steps per block.  The F4() steps are issued block by block
 * (RX0, RX1, RX2, RX3) and then once more in the same order.
 *
 * Clobbers RT0..RT3 (RT1 is RIO) and leaves the next key in RKEY.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
231
232
/*
 * Load four consecutive 8-byte blocks from (RIO), 8(RIO), 16(RIO) and
 * 24(RIO) into RX0..RX3.  For each block the rotate by 32 followed by the
 * 64-bit byte swap byte-reverses each 32-bit half while leaving both
 * halves in their original positions within the register.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;
248
249
/*
 * Byte-reverse all 8 bytes of each of RX0..RX3 and store them to (RIO),
 * 8(RIO), 16(RIO) and 24(RIO).  RIO must have been reloaded beforehand if
 * F4() ran since it was set.
 */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);
261
262
/*
 * XOR the byte-reversed 8-byte values at (RIO), 8(RIO) and 16(RIO) into
 * RX1, RX2 and RX3 respectively.  RX0 is left untouched.
 *
 * Uses RT0, RT2 and RT3 as temporaries; RT1 is skipped because it is the
 * same register as RIO.
 */
#define xor_block4() \
	movq (RIO),		RT0; \
	bswapq			RT0; \
	xorq RT0,		RX1; \
	\
	movq 8(RIO),		RT2; \
	bswapq			RT2; \
	xorq RT2,		RX2; \
	\
	movq 16(RIO),		RT3; \
	bswapq			RT3; \
	xorq RT3,		RX3;
274
275
SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Reads four consecutive 8-byte blocks (32 bytes) from src, applies
	 * round_enc4(0..14) plus the final key addition to all of them, and
	 * writes 32 bytes to dst.
	 */
	pushq %r12;			/* used as CTX */
	pushq %rbx;			/* used as RX1 */

	movq %rdi, CTX;
	movq %rsi, %r11;		/* F4() clobbers %rsi (RT1); %r10 is RKEY */
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();	/* key preloaded by round_enc4(14) */

	movq %r11, RIO;			/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)
309
310
SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 *
	 * Reads four consecutive 8-byte blocks (32 bytes) from src, applies
	 * round_dec4(17..3) plus the final key addition to all of them, and
	 * writes 32 bytes to dst.  When cbc is set, the results for blocks
	 * 1..3 are additionally XORed with src blocks 0..2 before the store;
	 * block 0 is written as is.
	 */
	pushq %r12;			/* used as CTX */
	pushq %rbx;			/* used as RX1 */
	pushq %rcx;			/* cbc flag: %rcx is RX2 */
	pushq %rdx;			/* src: %rdx is RX3, %rsi is clobbered */

	movq %rdi, CTX;
	movq %rsi, %r11;		/* F4() clobbers %rsi (RT1); %r10 is RKEY */
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();	/* key preloaded by round_dec4(3) */

	popq RIO;			/* RIO = src (saved %rdx) */
	popq %r12;			/* cbc flag (saved %rcx); CTX is dead now */
	/*
	 * cbc is a C bool: only the low 8 bits of the argument register are
	 * defined by the calling convention, so test just the low byte.
	 */
	testb %r12b, %r12b;
	jz .L_no_cbc_xor;

	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;			/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)
355
356