GitHub Repository: freebsd/freebsd-src
Path: blob/main/crypto/openssl/engines/asm/e_padlock-x86_64.pl
#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See e_padlock-x86.pl for
# details.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

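# Everything below is accumulated into $code and, at the very end,
# printed to OUT, i.e. piped through x86_64-xlate.pl, which turns this
# AT&T-style perlasm into whatever dialect $flavour selects (elf,
# macosx, mingw64, masm/nasm, ...).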
$code=".text\n";
39
40
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
41
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
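# Affected PadLock cores prefetch up to the above number of bytes
# beyond the source pointer for the given mode, so the generated code
# never lets an xcrypt run end within prefetch distance of a possibly
# unmapped page; see the page-boundary checks emitted in generate_mode
# below.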

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

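# Routines declared \@abi-omnipotent below are written to work under
# both calling conventions as-is (note how $arg1..$arg4 are fixed at
# generation time), so the translator emits no Win64-to-SysV argument
# shuffling for them; \@function,N entries do get that treatment.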
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
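	# The immediates below are CPUID vendor strings packed four
	# characters at a time into little-endian 32-bit words: the
	# first triple matches %ebx:%edx:%ecx == "CentaurHauls" (VIA),
	# the .Lzhaoxin triple matches "  Shanghai  " (Zhaoxin cores
	# that also implement PadLock).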
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability
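
# padlock_capability returns the CPUID 0xC0000001 %edx feature mask
# with bit 4 forced on, or 0 on non-VIA/Zhaoxin parts. A caller-side
# sketch, modelled on e_padlock.c (names illustrative, not part of
# this file):
#
#	unsigned int mask = padlock_capability();
#	int have_ace = (mask & (0x3 << 6)) == (0x3 << 6);   /* AES */
#	int have_phe = (mask & (0x3 << 10)) == (0x3 << 10); /* SHA */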
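
# padlock_key_bswap converts an expanded AES key schedule between the
# little-endian order the engine expects and the big-endian order
# OpenSSL stores: it byte-swaps (rounds+1)*4 32-bit words, reading the
# round count from offset 240, the rounds field of OpenSSL's AES_KEY.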
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap
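
# xcrypt reuses cached key material between invocations; writing to
# EFLAGS (a pushf/popf pair) is how software tells the core that the
# context changed and the key schedule must be reloaded.
# padlock_verify_context keeps a pointer to the current context at
# .Lpadlock_saved_context, and _padlock_verify_ctx issues the reload
# only when the context differs from the last one seen and the
# caller's saved EFLAGS image has bit 30 set.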
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key
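
# Single-block AES via "rep xcryptecb". The context layout matches
# struct padlock_cipher_data in e_padlock.c: 16-byte IV at offset 0,
# control word at offset 16, expanded key at offset 32. xcrypt takes
# the key pointer in %rbx, the control-word pointer in %rdx,
# source/destination in %rsi/%rdi and a block count in %rcx.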
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block
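
# xstore is the PadLock RNG: it stores random bytes at %rdi with the
# control word in %edx (here taken from the second C argument, most
# likely the quality factor per the PadLock documentation).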
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore
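
# The four SHA helpers below share one pattern: the hash state is
# bounced through a 16-byte-aligned on-stack copy (movaps requires
# alignment), %rdi points at that state, %rsi at the input and the
# third argument lands in %rcx as the count for "rep xsha1" or
# "rep xsha256". %rax = 0 asks the hardware to pad and finalize the
# message (the oneshot variants); %rax = -1 processes whole 64-byte
# blocks without final padding (the blocks variants).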
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

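# generate_mode emits one padlock_MODE_encrypt entry point per xcrypt
# variant: the context must be 16-byte aligned and the length a
# multiple of 16, or the function bails out with 0. Data is processed
# in chunks of at most PADLOCK_CHUNK bytes; misaligned input or output
# is bounced through a scratch buffer carved off the stack, which is
# zeroed (the bzero loop) before the function returns 1.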
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_${mode}_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
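# ctr32 quirk: the engine advances only the low, big-endian 32-bit
# word of the IV, so the chunk size is clipped above to keep a single
# xcrypt run from crossing a counter boundary the hardware would get
# wrong; the no_carry fixup emitted below then propagates the carry
# into the counter's upper bits by hand.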
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
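# This guard is the prefetch-errata workaround configured by
# %PADLOCK_PREFETCH: when the final run would end within prefetch
# distance of a page boundary, the chunk is rounded down (the "and"
# above) and whatever remains is routed through the stack buffer at
# the unaligned_tail label, keeping the engine's over-read inside
# mapped memory.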
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8,$out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
.quad	0
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;