GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/crypto/aesni-intel_asm.S
1
/*
2
* Implement AES algorithm in Intel AES-NI instructions.
3
*
4
* The white paper of AES-NI instructions can be downloaded from:
5
* http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6
*
7
* Copyright (C) 2008, Intel Corp.
8
* Author: Huang Ying <[email protected]>
9
* Vinodh Gopal <[email protected]>
10
* Kahraman Akdemir
11
*
12
* Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13
* interface for 64-bit kernels.
14
* Authors: Erdinc Ozturk ([email protected])
15
* Aidan O'Mahony ([email protected])
16
* Adrian Hoban <[email protected]>
17
* James Guilford ([email protected])
18
* Gabriele Paoloni <[email protected]>
19
* Tadeusz Struk ([email protected])
20
* Wajdi Feghali ([email protected])
21
* Copyright (c) 2010, Intel Corporation.
22
*
23
* Ported x86_64 version to x86:
24
* Author: Mathias Krause <[email protected]>
25
*
26
* This program is free software; you can redistribute it and/or modify
27
* it under the terms of the GNU General Public License as published by
28
* the Free Software Foundation; either version 2 of the License, or
29
* (at your option) any later version.
30
*/
31
32
#include <linux/linkage.h>
33
#include <asm/inst.h>
34
35
#ifdef __x86_64__
36
.data
37
POLY: .octa 0xC2000000000000000000000000000001
38
TWOONE: .octa 0x00000001000000000000000000000001
39
40
# order of these constants should not change.
41
# more specifically, ALL_F should follow SHIFT_MASK,
42
# and ZERO should follow ALL_F
43
44
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45
MASK1: .octa 0x0000000000000000ffffffffffffffff
46
MASK2: .octa 0xffffffffffffffff0000000000000000
47
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49
ZERO: .octa 0x00000000000000000000000000000000
50
ONE: .octa 0x00000000000000000000000000000001
51
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52
dec: .octa 0x1
53
enc: .octa 0x2
54
55
56
.text
57
58
59
#define STACK_OFFSET 8*3
60
#define HashKey 16*0 // store HashKey <<1 mod poly here
61
#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62
#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63
#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64
#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65
// bits of HashKey <<1 mod poly here
66
//(for Karatsuba purposes)
67
#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68
// bits of HashKey^2 <<1 mod poly here
69
// (for Karatsuba purposes)
70
#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71
// bits of HashKey^3 <<1 mod poly here
72
// (for Karatsuba purposes)
73
#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74
// bits of HashKey^4 <<1 mod poly here
75
// (for Karatsuba purposes)
76
#define VARIABLE_OFFSET 16*8
77
78
#define arg1 rdi
79
#define arg2 rsi
80
#define arg3 rdx
81
#define arg4 rcx
82
#define arg5 r8
83
#define arg6 r9
84
#define arg7 STACK_OFFSET+8(%r14)
85
#define arg8 STACK_OFFSET+16(%r14)
86
#define arg9 STACK_OFFSET+24(%r14)
87
#define arg10 STACK_OFFSET+32(%r14)
88
#endif
89
90
91
#define STATE1 %xmm0
92
#define STATE2 %xmm4
93
#define STATE3 %xmm5
94
#define STATE4 %xmm6
95
#define STATE STATE1
96
#define IN1 %xmm1
97
#define IN2 %xmm7
98
#define IN3 %xmm8
99
#define IN4 %xmm9
100
#define IN IN1
101
#define KEY %xmm2
102
#define IV %xmm3
103
104
#define BSWAP_MASK %xmm10
105
#define CTR %xmm11
106
#define INC %xmm12
107
108
#ifdef __x86_64__
109
#define AREG %rax
110
#define KEYP %rdi
111
#define OUTP %rsi
112
#define UKEYP OUTP
113
#define INP %rdx
114
#define LEN %rcx
115
#define IVP %r8
116
#define KLEN %r9d
117
#define T1 %r10
118
#define TKEYP T1
119
#define T2 %r11
120
#define TCTR_LOW T2
121
#else
122
#define AREG %eax
123
#define KEYP %edi
124
#define OUTP AREG
125
#define UKEYP OUTP
126
#define INP %edx
127
#define LEN %esi
128
#define IVP %ebp
129
#define KLEN %ebx
130
#define T1 %ecx
131
#define TKEYP T1
132
#endif
133
134
135
#ifdef __x86_64__
136
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137
*
138
*
139
* Input: A and B (128-bits each, bit-reflected)
140
* Output: C = A*B*x mod poly, (i.e. >>1 )
141
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143
*
144
*/
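/*
 * Illustrative note (not part of the original source): the macro below is the
 * standard one-level Karatsuba decomposition of a 128x128 carry-less multiply,
 * using three PCLMULQDQ operations instead of four. A minimal, hedged C sketch
 * of the same decomposition follows; clmul64() is a hypothetical software
 * stand-in for PCLMULQDQ, and the GHASH polynomial reduction that the macro
 * performs afterwards is omitted here.
 *
 *	#include <stdint.h>
 *
 *	struct u128 { uint64_t lo, hi; };
 *
 *	// Portable 64x64 -> 128 bit carry-less multiply.
 *	static struct u128 clmul64(uint64_t a, uint64_t b)
 *	{
 *		struct u128 r = { 0, 0 };
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if (b & (1ULL << i)) {
 *				r.lo ^= a << i;
 *				if (i)
 *					r.hi ^= a >> (64 - i);
 *			}
 *		}
 *		return r;
 *	}
 *
 *	// 128x128 carry-less multiply, one Karatsuba step: hi:lo is the
 *	// 256-bit product before reduction, mirroring TMP1:GH in the macro.
 *	static void clmul128_karatsuba(struct u128 a, struct u128 b,
 *				       struct u128 *hi, struct u128 *lo)
 *	{
 *		struct u128 h = clmul64(a.hi, b.hi);		// a1*b1
 *		struct u128 l = clmul64(a.lo, b.lo);		// a0*b0
 *		struct u128 m = clmul64(a.hi ^ a.lo,		// (a1+a0)*(b1+b0)
 *					b.hi ^ b.lo);
 *
 *		m.lo ^= h.lo ^ l.lo;	// middle term = m ^ h ^ l
 *		m.hi ^= h.hi ^ l.hi;
 *		l.hi ^= m.lo;		// fold the middle 128 bits into the
 *		h.lo ^= m.hi;		// result (pslldq/psrldq by 8 below)
 *		*lo = l;
 *		*hi = h;
 *	}
 */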
145
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146
movdqa \GH, \TMP1
147
pshufd $78, \GH, \TMP2
148
pshufd $78, \HK, \TMP3
149
pxor \GH, \TMP2 # TMP2 = a1+a0
150
pxor \HK, \TMP3 # TMP3 = b1+b0
151
PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152
PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153
PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154
pxor \GH, \TMP2
155
pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
156
movdqa \TMP2, \TMP3
157
pslldq $8, \TMP3 # left shift TMP3 2 DWs
158
psrldq $8, \TMP2 # right shift TMP2 2 DWs
159
pxor \TMP3, \GH
160
pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
161
162
# first phase of the reduction
163
164
movdqa \GH, \TMP2
165
movdqa \GH, \TMP3
166
movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167
# in order to perform
168
# independent shifts
169
pslld $31, \TMP2 # packed left shift <<31
170
pslld $30, \TMP3 # packed left shift <<30
171
pslld $25, \TMP4 # packed left shift <<25
172
pxor \TMP3, \TMP2 # xor the shifted versions
173
pxor \TMP4, \TMP2
174
movdqa \TMP2, \TMP5
175
psrldq $4, \TMP5 # right shift TMP5 1 DW
176
pslldq $12, \TMP2 # left shift TMP2 3 DWs
177
pxor \TMP2, \GH
178
179
# second phase of the reduction
180
181
movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182
# in order to perform
183
# independent shifts
184
movdqa \GH,\TMP3
185
movdqa \GH,\TMP4
186
psrld $1,\TMP2 # packed right shift >>1
186
psrld $2,\TMP3 # packed right shift >>2
187
psrld $7,\TMP4 # packed right shift >>7
189
pxor \TMP3,\TMP2 # xor the shifted versions
190
pxor \TMP4,\TMP2
191
pxor \TMP5, \TMP2
192
pxor \TMP2, \GH
193
pxor \TMP1, \GH # result is in GH
194
.endm
195
196
/*
197
* if a = number of total plaintext bytes
198
* b = floor(a/16)
199
* num_initial_blocks = b mod 4
200
* encrypt the initial num_initial_blocks blocks and apply ghash on
201
* the ciphertext
202
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203
* are clobbered
204
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205
*/
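/*
 * Illustrative note (not part of the original source): the block bookkeeping
 * described above, restated as a small C helper. The function name is
 * hypothetical; the entry code keeps the same quantity scaled by 16 in %r12
 * via "and $(3<<4), %r12".
 *
 *	#include <stdint.h>
 *
 *	static unsigned int num_initial_blocks(uint64_t plaintext_len)
 *	{
 *		uint64_t full_blocks = plaintext_len / 16;	// b = floor(a/16)
 *
 *		return full_blocks % 4;				// b mod 4
 *	}
 */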
206
207
208
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210
mov arg7, %r10 # %r10 = AAD
211
mov arg8, %r12 # %r12 = aadLen
212
mov %r12, %r11
213
pxor %xmm\i, %xmm\i
214
_get_AAD_loop\num_initial_blocks\operation:
215
movd (%r10), \TMP1
216
pslldq $12, \TMP1
217
psrldq $4, %xmm\i
218
pxor \TMP1, %xmm\i
219
add $4, %r10
220
sub $4, %r12
221
jne _get_AAD_loop\num_initial_blocks\operation
222
cmp $16, %r11
223
je _get_AAD_loop2_done\num_initial_blocks\operation
224
mov $16, %r12
225
_get_AAD_loop2\num_initial_blocks\operation:
226
psrldq $4, %xmm\i
227
sub $4, %r12
228
cmp %r11, %r12
229
jne _get_AAD_loop2\num_initial_blocks\operation
230
_get_AAD_loop2_done\num_initial_blocks\operation:
231
movdqa SHUF_MASK(%rip), %xmm14
232
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234
xor %r11, %r11 # initialise the data pointer offset as zero
235
236
# start AES for num_initial_blocks blocks
237
238
mov %arg5, %rax # %rax = *Y0
239
movdqu (%rax), \XMM0 # XMM0 = Y0
240
movdqa SHUF_MASK(%rip), %xmm14
241
PSHUFB_XMM %xmm14, \XMM0
242
243
.if (\i == 5) || (\i == 6) || (\i == 7)
244
.irpc index, \i_seq
245
paddd ONE(%rip), \XMM0 # INCR Y0
246
movdqa \XMM0, %xmm\index
247
movdqa SHUF_MASK(%rip), %xmm14
248
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250
.endr
251
.irpc index, \i_seq
252
pxor 16*0(%arg1), %xmm\index
253
.endr
254
.irpc index, \i_seq
255
movaps 0x10(%arg1), \TMP1
256
AESENC \TMP1, %xmm\index # Round 1
257
.endr
258
.irpc index, \i_seq
259
movaps 0x20(%arg1), \TMP1
260
AESENC \TMP1, %xmm\index # Round 2
261
.endr
262
.irpc index, \i_seq
263
movaps 0x30(%arg1), \TMP1
264
AESENC \TMP1, %xmm\index # Round 3
265
.endr
266
.irpc index, \i_seq
267
movaps 0x40(%arg1), \TMP1
268
AESENC \TMP1, %xmm\index # Round 4
269
.endr
270
.irpc index, \i_seq
271
movaps 0x50(%arg1), \TMP1
272
AESENC \TMP1, %xmm\index # Round 5
273
.endr
274
.irpc index, \i_seq
275
movaps 0x60(%arg1), \TMP1
276
AESENC \TMP1, %xmm\index # Round 6
277
.endr
278
.irpc index, \i_seq
279
movaps 0x70(%arg1), \TMP1
280
AESENC \TMP1, %xmm\index # Round 7
281
.endr
282
.irpc index, \i_seq
283
movaps 0x80(%arg1), \TMP1
284
AESENC \TMP1, %xmm\index # Round 8
285
.endr
286
.irpc index, \i_seq
287
movaps 0x90(%arg1), \TMP1
288
AESENC \TMP1, %xmm\index # Round 9
289
.endr
290
.irpc index, \i_seq
291
movaps 0xa0(%arg1), \TMP1
292
AESENCLAST \TMP1, %xmm\index # Round 10
293
.endr
294
.irpc index, \i_seq
295
movdqu (%arg3 , %r11, 1), \TMP1
296
pxor \TMP1, %xmm\index
297
movdqu %xmm\index, (%arg2 , %r11, 1)
298
# write back plaintext/ciphertext for num_initial_blocks
299
add $16, %r11
300
301
movdqa \TMP1, %xmm\index
302
movdqa SHUF_MASK(%rip), %xmm14
303
PSHUFB_XMM %xmm14, %xmm\index
304
305
# prepare plaintext/ciphertext for GHASH computation
306
.endr
307
.endif
308
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309
# apply GHASH on num_initial_blocks blocks
310
311
.if \i == 5
312
pxor %xmm5, %xmm6
313
GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314
pxor %xmm6, %xmm7
315
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316
pxor %xmm7, %xmm8
317
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318
.elseif \i == 6
319
pxor %xmm6, %xmm7
320
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321
pxor %xmm7, %xmm8
322
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323
.elseif \i == 7
324
pxor %xmm7, %xmm8
325
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326
.endif
327
cmp $64, %r13
328
jl _initial_blocks_done\num_initial_blocks\operation
329
# no need for precomputed values
330
/*
331
*
332
* Precomputations for HashKey parallel with encryption of first 4 blocks.
333
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
334
*/
335
paddd ONE(%rip), \XMM0 # INCR Y0
336
movdqa \XMM0, \XMM1
337
movdqa SHUF_MASK(%rip), %xmm14
338
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340
paddd ONE(%rip), \XMM0 # INCR Y0
341
movdqa \XMM0, \XMM2
342
movdqa SHUF_MASK(%rip), %xmm14
343
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345
paddd ONE(%rip), \XMM0 # INCR Y0
346
movdqa \XMM0, \XMM3
347
movdqa SHUF_MASK(%rip), %xmm14
348
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350
paddd ONE(%rip), \XMM0 # INCR Y0
351
movdqa \XMM0, \XMM4
352
movdqa SHUF_MASK(%rip), %xmm14
353
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355
pxor 16*0(%arg1), \XMM1
356
pxor 16*0(%arg1), \XMM2
357
pxor 16*0(%arg1), \XMM3
358
pxor 16*0(%arg1), \XMM4
359
movdqa \TMP3, \TMP5
360
pshufd $78, \TMP3, \TMP1
361
pxor \TMP3, \TMP1
362
movdqa \TMP1, HashKey_k(%rsp)
363
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364
# TMP5 = HashKey^2<<1 (mod poly)
365
movdqa \TMP5, HashKey_2(%rsp)
366
# HashKey_2 = HashKey^2<<1 (mod poly)
367
pshufd $78, \TMP5, \TMP1
368
pxor \TMP5, \TMP1
369
movdqa \TMP1, HashKey_2_k(%rsp)
370
.irpc index, 1234 # do 4 rounds
371
movaps 0x10*\index(%arg1), \TMP1
372
AESENC \TMP1, \XMM1
373
AESENC \TMP1, \XMM2
374
AESENC \TMP1, \XMM3
375
AESENC \TMP1, \XMM4
376
.endr
377
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378
# TMP5 = HashKey^3<<1 (mod poly)
379
movdqa \TMP5, HashKey_3(%rsp)
380
pshufd $78, \TMP5, \TMP1
381
pxor \TMP5, \TMP1
382
movdqa \TMP1, HashKey_3_k(%rsp)
383
.irpc index, 56789 # do next 5 rounds
384
movaps 0x10*\index(%arg1), \TMP1
385
AESENC \TMP1, \XMM1
386
AESENC \TMP1, \XMM2
387
AESENC \TMP1, \XMM3
388
AESENC \TMP1, \XMM4
389
.endr
390
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391
# TMP5 = HashKey^4<<1 (mod poly)
392
movdqa \TMP5, HashKey_4(%rsp)
393
pshufd $78, \TMP5, \TMP1
394
pxor \TMP5, \TMP1
395
movdqa \TMP1, HashKey_4_k(%rsp)
396
movaps 0xa0(%arg1), \TMP2
397
AESENCLAST \TMP2, \XMM1
398
AESENCLAST \TMP2, \XMM2
399
AESENCLAST \TMP2, \XMM3
400
AESENCLAST \TMP2, \XMM4
401
movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402
pxor \TMP1, \XMM1
403
movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404
movdqa \TMP1, \XMM1
405
movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406
pxor \TMP1, \XMM2
407
movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408
movdqa \TMP1, \XMM2
409
movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410
pxor \TMP1, \XMM3
411
movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412
movdqa \TMP1, \XMM3
413
movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414
pxor \TMP1, \XMM4
415
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416
movdqa \TMP1, \XMM4
417
add $64, %r11
418
movdqa SHUF_MASK(%rip), %xmm14
419
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420
pxor \XMMDst, \XMM1
421
# combine GHASHed value with the corresponding ciphertext
422
movdqa SHUF_MASK(%rip), %xmm14
423
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424
movdqa SHUF_MASK(%rip), %xmm14
425
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426
movdqa SHUF_MASK(%rip), %xmm14
427
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429
_initial_blocks_done\num_initial_blocks\operation:
430
431
.endm
432
433
434
/*
435
* if a = number of total plaintext bytes
436
* b = floor(a/16)
437
* num_initial_blocks = b mod 4
438
* encrypt the initial num_initial_blocks blocks and apply ghash on
439
* the ciphertext
440
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441
* are clobbered
442
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
443
*/
444
445
446
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448
mov arg7, %r10 # %r10 = AAD
449
mov arg8, %r12 # %r12 = aadLen
450
mov %r12, %r11
451
pxor %xmm\i, %xmm\i
452
_get_AAD_loop\num_initial_blocks\operation:
453
movd (%r10), \TMP1
454
pslldq $12, \TMP1
455
psrldq $4, %xmm\i
456
pxor \TMP1, %xmm\i
457
add $4, %r10
458
sub $4, %r12
459
jne _get_AAD_loop\num_initial_blocks\operation
460
cmp $16, %r11
461
je _get_AAD_loop2_done\num_initial_blocks\operation
462
mov $16, %r12
463
_get_AAD_loop2\num_initial_blocks\operation:
464
psrldq $4, %xmm\i
465
sub $4, %r12
466
cmp %r11, %r12
467
jne _get_AAD_loop2\num_initial_blocks\operation
468
_get_AAD_loop2_done\num_initial_blocks\operation:
469
movdqa SHUF_MASK(%rip), %xmm14
470
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472
xor %r11, %r11 # initialise the data pointer offset as zero
473
474
# start AES for num_initial_blocks blocks
475
476
mov %arg5, %rax # %rax = *Y0
477
movdqu (%rax), \XMM0 # XMM0 = Y0
478
movdqa SHUF_MASK(%rip), %xmm14
479
PSHUFB_XMM %xmm14, \XMM0
480
481
.if (\i == 5) || (\i == 6) || (\i == 7)
482
.irpc index, \i_seq
483
paddd ONE(%rip), \XMM0 # INCR Y0
484
movdqa \XMM0, %xmm\index
485
movdqa SHUF_MASK(%rip), %xmm14
486
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488
.endr
489
.irpc index, \i_seq
490
pxor 16*0(%arg1), %xmm\index
491
.endr
492
.irpc index, \i_seq
493
movaps 0x10(%arg1), \TMP1
494
AESENC \TMP1, %xmm\index # Round 1
495
.endr
496
.irpc index, \i_seq
497
movaps 0x20(%arg1), \TMP1
498
AESENC \TMP1, %xmm\index # Round 2
499
.endr
500
.irpc index, \i_seq
501
movaps 0x30(%arg1), \TMP1
502
AESENC \TMP1, %xmm\index # Round 3
503
.endr
504
.irpc index, \i_seq
505
movaps 0x40(%arg1), \TMP1
506
AESENC \TMP1, %xmm\index # Round 4
507
.endr
508
.irpc index, \i_seq
509
movaps 0x50(%arg1), \TMP1
510
AESENC \TMP1, %xmm\index # Round 5
511
.endr
512
.irpc index, \i_seq
513
movaps 0x60(%arg1), \TMP1
514
AESENC \TMP1, %xmm\index # Round 6
515
.endr
516
.irpc index, \i_seq
517
movaps 0x70(%arg1), \TMP1
518
AESENC \TMP1, %xmm\index # Round 7
519
.endr
520
.irpc index, \i_seq
521
movaps 0x80(%arg1), \TMP1
522
AESENC \TMP1, %xmm\index # Round 8
523
.endr
524
.irpc index, \i_seq
525
movaps 0x90(%arg1), \TMP1
526
AESENC \TMP1, %xmm\index # Round 9
527
.endr
528
.irpc index, \i_seq
529
movaps 0xa0(%arg1), \TMP1
530
AESENCLAST \TMP1, %xmm\index # Round 10
531
.endr
532
.irpc index, \i_seq
533
movdqu (%arg3 , %r11, 1), \TMP1
534
pxor \TMP1, %xmm\index
535
movdqu %xmm\index, (%arg2 , %r11, 1)
536
# write back plaintext/ciphertext for num_initial_blocks
537
add $16, %r11
538
539
movdqa SHUF_MASK(%rip), %xmm14
540
PSHUFB_XMM %xmm14, %xmm\index
541
542
# prepare plaintext/ciphertext for GHASH computation
543
.endr
544
.endif
545
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546
# apply GHASH on num_initial_blocks blocks
547
548
.if \i == 5
549
pxor %xmm5, %xmm6
550
GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551
pxor %xmm6, %xmm7
552
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553
pxor %xmm7, %xmm8
554
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555
.elseif \i == 6
556
pxor %xmm6, %xmm7
557
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558
pxor %xmm7, %xmm8
559
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560
.elseif \i == 7
561
pxor %xmm7, %xmm8
562
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563
.endif
564
cmp $64, %r13
565
jl _initial_blocks_done\num_initial_blocks\operation
566
# no need for precomputed values
567
/*
568
*
569
* Precomputations for HashKey parallel with encryption of first 4 blocks.
570
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
571
*/
572
paddd ONE(%rip), \XMM0 # INCR Y0
573
movdqa \XMM0, \XMM1
574
movdqa SHUF_MASK(%rip), %xmm14
575
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577
paddd ONE(%rip), \XMM0 # INCR Y0
578
movdqa \XMM0, \XMM2
579
movdqa SHUF_MASK(%rip), %xmm14
580
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582
paddd ONE(%rip), \XMM0 # INCR Y0
583
movdqa \XMM0, \XMM3
584
movdqa SHUF_MASK(%rip), %xmm14
585
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587
paddd ONE(%rip), \XMM0 # INCR Y0
588
movdqa \XMM0, \XMM4
589
movdqa SHUF_MASK(%rip), %xmm14
590
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592
pxor 16*0(%arg1), \XMM1
593
pxor 16*0(%arg1), \XMM2
594
pxor 16*0(%arg1), \XMM3
595
pxor 16*0(%arg1), \XMM4
596
movdqa \TMP3, \TMP5
597
pshufd $78, \TMP3, \TMP1
598
pxor \TMP3, \TMP1
599
movdqa \TMP1, HashKey_k(%rsp)
600
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601
# TMP5 = HashKey^2<<1 (mod poly)
602
movdqa \TMP5, HashKey_2(%rsp)
603
# HashKey_2 = HashKey^2<<1 (mod poly)
604
pshufd $78, \TMP5, \TMP1
605
pxor \TMP5, \TMP1
606
movdqa \TMP1, HashKey_2_k(%rsp)
607
.irpc index, 1234 # do 4 rounds
608
movaps 0x10*\index(%arg1), \TMP1
609
AESENC \TMP1, \XMM1
610
AESENC \TMP1, \XMM2
611
AESENC \TMP1, \XMM3
612
AESENC \TMP1, \XMM4
613
.endr
614
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615
# TMP5 = HashKey^3<<1 (mod poly)
616
movdqa \TMP5, HashKey_3(%rsp)
617
pshufd $78, \TMP5, \TMP1
618
pxor \TMP5, \TMP1
619
movdqa \TMP1, HashKey_3_k(%rsp)
620
.irpc index, 56789 # do next 5 rounds
621
movaps 0x10*\index(%arg1), \TMP1
622
AESENC \TMP1, \XMM1
623
AESENC \TMP1, \XMM2
624
AESENC \TMP1, \XMM3
625
AESENC \TMP1, \XMM4
626
.endr
627
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628
# TMP5 = HashKey^4<<1 (mod poly)
629
movdqa \TMP5, HashKey_4(%rsp)
630
pshufd $78, \TMP5, \TMP1
631
pxor \TMP5, \TMP1
632
movdqa \TMP1, HashKey_4_k(%rsp)
633
movaps 0xa0(%arg1), \TMP2
634
AESENCLAST \TMP2, \XMM1
635
AESENCLAST \TMP2, \XMM2
636
AESENCLAST \TMP2, \XMM3
637
AESENCLAST \TMP2, \XMM4
638
movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639
pxor \TMP1, \XMM1
640
movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641
pxor \TMP1, \XMM2
642
movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643
pxor \TMP1, \XMM3
644
movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645
pxor \TMP1, \XMM4
646
movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647
movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648
movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651
add $64, %r11
652
movdqa SHUF_MASK(%rip), %xmm14
653
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654
pxor \XMMDst, \XMM1
655
# combine GHASHed value with the corresponding ciphertext
656
movdqa SHUF_MASK(%rip), %xmm14
657
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658
movdqa SHUF_MASK(%rip), %xmm14
659
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660
movdqa SHUF_MASK(%rip), %xmm14
661
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663
_initial_blocks_done\num_initial_blocks\operation:
664
665
.endm
666
667
/*
668
* encrypt 4 blocks at a time
669
* ghash the 4 previously encrypted ciphertext blocks
670
* arg1, %arg2, %arg3 are used as pointers only, not modified
671
* %r11 is the data offset value
672
*/
673
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676
movdqa \XMM1, \XMM5
677
movdqa \XMM2, \XMM6
678
movdqa \XMM3, \XMM7
679
movdqa \XMM4, \XMM8
680
681
movdqa SHUF_MASK(%rip), %xmm15
682
# multiply TMP5 * HashKey using karatsuba
683
684
movdqa \XMM5, \TMP4
685
pshufd $78, \XMM5, \TMP6
686
pxor \XMM5, \TMP6
687
paddd ONE(%rip), \XMM0 # INCR CNT
688
movdqa HashKey_4(%rsp), \TMP5
689
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690
movdqa \XMM0, \XMM1
691
paddd ONE(%rip), \XMM0 # INCR CNT
692
movdqa \XMM0, \XMM2
693
paddd ONE(%rip), \XMM0 # INCR CNT
694
movdqa \XMM0, \XMM3
695
paddd ONE(%rip), \XMM0 # INCR CNT
696
movdqa \XMM0, \XMM4
697
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698
PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703
pxor (%arg1), \XMM1
704
pxor (%arg1), \XMM2
705
pxor (%arg1), \XMM3
706
pxor (%arg1), \XMM4
707
movdqa HashKey_4_k(%rsp), \TMP5
708
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709
movaps 0x10(%arg1), \TMP1
710
AESENC \TMP1, \XMM1 # Round 1
711
AESENC \TMP1, \XMM2
712
AESENC \TMP1, \XMM3
713
AESENC \TMP1, \XMM4
714
movaps 0x20(%arg1), \TMP1
715
AESENC \TMP1, \XMM1 # Round 2
716
AESENC \TMP1, \XMM2
717
AESENC \TMP1, \XMM3
718
AESENC \TMP1, \XMM4
719
movdqa \XMM6, \TMP1
720
pshufd $78, \XMM6, \TMP2
721
pxor \XMM6, \TMP2
722
movdqa HashKey_3(%rsp), \TMP5
723
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724
movaps 0x30(%arg1), \TMP3
725
AESENC \TMP3, \XMM1 # Round 3
726
AESENC \TMP3, \XMM2
727
AESENC \TMP3, \XMM3
728
AESENC \TMP3, \XMM4
729
PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730
movaps 0x40(%arg1), \TMP3
731
AESENC \TMP3, \XMM1 # Round 4
732
AESENC \TMP3, \XMM2
733
AESENC \TMP3, \XMM3
734
AESENC \TMP3, \XMM4
735
movdqa HashKey_3_k(%rsp), \TMP5
736
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737
movaps 0x50(%arg1), \TMP3
738
AESENC \TMP3, \XMM1 # Round 5
739
AESENC \TMP3, \XMM2
740
AESENC \TMP3, \XMM3
741
AESENC \TMP3, \XMM4
742
pxor \TMP1, \TMP4
743
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744
pxor \XMM6, \XMM5
745
pxor \TMP2, \TMP6
746
movdqa \XMM7, \TMP1
747
pshufd $78, \XMM7, \TMP2
748
pxor \XMM7, \TMP2
749
movdqa HashKey_2(%rsp ), \TMP5
750
751
# Multiply TMP5 * HashKey using karatsuba
752
753
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754
movaps 0x60(%arg1), \TMP3
755
AESENC \TMP3, \XMM1 # Round 6
756
AESENC \TMP3, \XMM2
757
AESENC \TMP3, \XMM3
758
AESENC \TMP3, \XMM4
759
PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760
movaps 0x70(%arg1), \TMP3
761
AESENC \TMP3, \XMM1 # Round 7
762
AESENC \TMP3, \XMM2
763
AESENC \TMP3, \XMM3
764
AESENC \TMP3, \XMM4
765
movdqa HashKey_2_k(%rsp), \TMP5
766
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767
movaps 0x80(%arg1), \TMP3
768
AESENC \TMP3, \XMM1 # Round 8
769
AESENC \TMP3, \XMM2
770
AESENC \TMP3, \XMM3
771
AESENC \TMP3, \XMM4
772
pxor \TMP1, \TMP4
773
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774
pxor \XMM7, \XMM5
775
pxor \TMP2, \TMP6
776
777
# Multiply XMM8 * HashKey
778
# XMM8 and TMP5 hold the values for the two operands
779
780
movdqa \XMM8, \TMP1
781
pshufd $78, \XMM8, \TMP2
782
pxor \XMM8, \TMP2
783
movdqa HashKey(%rsp), \TMP5
784
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785
movaps 0x90(%arg1), \TMP3
786
AESENC \TMP3, \XMM1 # Round 9
787
AESENC \TMP3, \XMM2
788
AESENC \TMP3, \XMM3
789
AESENC \TMP3, \XMM4
790
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791
movaps 0xa0(%arg1), \TMP3
792
AESENCLAST \TMP3, \XMM1 # Round 10
793
AESENCLAST \TMP3, \XMM2
794
AESENCLAST \TMP3, \XMM3
795
AESENCLAST \TMP3, \XMM4
796
movdqa HashKey_k(%rsp), \TMP5
797
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798
movdqu (%arg3,%r11,1), \TMP3
799
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800
movdqu 16(%arg3,%r11,1), \TMP3
801
pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802
movdqu 32(%arg3,%r11,1), \TMP3
803
pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804
movdqu 48(%arg3,%r11,1), \TMP3
805
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806
movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807
movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808
movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809
movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815
pxor \TMP4, \TMP1
816
pxor \XMM8, \XMM5
817
pxor \TMP6, \TMP2
818
pxor \TMP1, \TMP2
819
pxor \XMM5, \TMP2
820
movdqa \TMP2, \TMP3
821
pslldq $8, \TMP3 # left shift TMP3 2 DWs
822
psrldq $8, \TMP2 # right shift TMP2 2 DWs
823
pxor \TMP3, \XMM5
824
pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826
# first phase of reduction
827
828
movdqa \XMM5, \TMP2
829
movdqa \XMM5, \TMP3
830
movdqa \XMM5, \TMP4
831
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832
pslld $31, \TMP2 # packed left shift << 31
833
pslld $30, \TMP3 # packed left shift << 30
834
pslld $25, \TMP4 # packed left shift << 25
835
pxor \TMP3, \TMP2 # xor the shifted versions
836
pxor \TMP4, \TMP2
837
movdqa \TMP2, \TMP5
838
psrldq $4, \TMP5 # right shift T5 1 DW
839
pslldq $12, \TMP2 # left shift T2 3 DWs
840
pxor \TMP2, \XMM5
841
842
# second phase of reduction
843
844
movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845
movdqa \XMM5,\TMP3
846
movdqa \XMM5,\TMP4
847
psrld $1, \TMP2 # packed right shift >>1
847
psrld $2, \TMP3 # packed right shift >>2
848
psrld $7, \TMP4 # packed right shift >>7
850
pxor \TMP3,\TMP2 # xor the shifted versions
851
pxor \TMP4,\TMP2
852
pxor \TMP5, \TMP2
853
pxor \TMP2, \XMM5
854
pxor \TMP1, \XMM5 # result is in XMM5
855
856
pxor \XMM5, \XMM1
857
.endm
858
859
/*
860
* decrypt 4 blocks at a time
861
* ghash the 4 previously decrypted ciphertext blocks
862
* arg1, %arg2, %arg3 are used as pointers only, not modified
863
* %r11 is the data offset value
864
*/
865
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868
movdqa \XMM1, \XMM5
869
movdqa \XMM2, \XMM6
870
movdqa \XMM3, \XMM7
871
movdqa \XMM4, \XMM8
872
873
movdqa SHUF_MASK(%rip), %xmm15
874
# multiply TMP5 * HashKey using karatsuba
875
876
movdqa \XMM5, \TMP4
877
pshufd $78, \XMM5, \TMP6
878
pxor \XMM5, \TMP6
879
paddd ONE(%rip), \XMM0 # INCR CNT
880
movdqa HashKey_4(%rsp), \TMP5
881
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882
movdqa \XMM0, \XMM1
883
paddd ONE(%rip), \XMM0 # INCR CNT
884
movdqa \XMM0, \XMM2
885
paddd ONE(%rip), \XMM0 # INCR CNT
886
movdqa \XMM0, \XMM3
887
paddd ONE(%rip), \XMM0 # INCR CNT
888
movdqa \XMM0, \XMM4
889
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890
PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895
pxor (%arg1), \XMM1
896
pxor (%arg1), \XMM2
897
pxor (%arg1), \XMM3
898
pxor (%arg1), \XMM4
899
movdqa HashKey_4_k(%rsp), \TMP5
900
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901
movaps 0x10(%arg1), \TMP1
902
AESENC \TMP1, \XMM1 # Round 1
903
AESENC \TMP1, \XMM2
904
AESENC \TMP1, \XMM3
905
AESENC \TMP1, \XMM4
906
movaps 0x20(%arg1), \TMP1
907
AESENC \TMP1, \XMM1 # Round 2
908
AESENC \TMP1, \XMM2
909
AESENC \TMP1, \XMM3
910
AESENC \TMP1, \XMM4
911
movdqa \XMM6, \TMP1
912
pshufd $78, \XMM6, \TMP2
913
pxor \XMM6, \TMP2
914
movdqa HashKey_3(%rsp), \TMP5
915
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916
movaps 0x30(%arg1), \TMP3
917
AESENC \TMP3, \XMM1 # Round 3
918
AESENC \TMP3, \XMM2
919
AESENC \TMP3, \XMM3
920
AESENC \TMP3, \XMM4
921
PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922
movaps 0x40(%arg1), \TMP3
923
AESENC \TMP3, \XMM1 # Round 4
924
AESENC \TMP3, \XMM2
925
AESENC \TMP3, \XMM3
926
AESENC \TMP3, \XMM4
927
movdqa HashKey_3_k(%rsp), \TMP5
928
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929
movaps 0x50(%arg1), \TMP3
930
AESENC \TMP3, \XMM1 # Round 5
931
AESENC \TMP3, \XMM2
932
AESENC \TMP3, \XMM3
933
AESENC \TMP3, \XMM4
934
pxor \TMP1, \TMP4
935
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936
pxor \XMM6, \XMM5
937
pxor \TMP2, \TMP6
938
movdqa \XMM7, \TMP1
939
pshufd $78, \XMM7, \TMP2
940
pxor \XMM7, \TMP2
941
movdqa HashKey_2(%rsp ), \TMP5
942
943
# Multiply TMP5 * HashKey using karatsuba
944
945
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946
movaps 0x60(%arg1), \TMP3
947
AESENC \TMP3, \XMM1 # Round 6
948
AESENC \TMP3, \XMM2
949
AESENC \TMP3, \XMM3
950
AESENC \TMP3, \XMM4
951
PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952
movaps 0x70(%arg1), \TMP3
953
AESENC \TMP3, \XMM1 # Round 7
954
AESENC \TMP3, \XMM2
955
AESENC \TMP3, \XMM3
956
AESENC \TMP3, \XMM4
957
movdqa HashKey_2_k(%rsp), \TMP5
958
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959
movaps 0x80(%arg1), \TMP3
960
AESENC \TMP3, \XMM1 # Round 8
961
AESENC \TMP3, \XMM2
962
AESENC \TMP3, \XMM3
963
AESENC \TMP3, \XMM4
964
pxor \TMP1, \TMP4
965
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966
pxor \XMM7, \XMM5
967
pxor \TMP2, \TMP6
968
969
# Multiply XMM8 * HashKey
970
# XMM8 and TMP5 hold the values for the two operands
971
972
movdqa \XMM8, \TMP1
973
pshufd $78, \XMM8, \TMP2
974
pxor \XMM8, \TMP2
975
movdqa HashKey(%rsp), \TMP5
976
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977
movaps 0x90(%arg1), \TMP3
978
AESENC \TMP3, \XMM1 # Round 9
979
AESENC \TMP3, \XMM2
980
AESENC \TMP3, \XMM3
981
AESENC \TMP3, \XMM4
982
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983
movaps 0xa0(%arg1), \TMP3
984
AESENCLAST \TMP3, \XMM1 # Round 10
985
AESENCLAST \TMP3, \XMM2
986
AESENCLAST \TMP3, \XMM3
987
AESENCLAST \TMP3, \XMM4
988
movdqa HashKey_k(%rsp), \TMP5
989
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990
movdqu (%arg3,%r11,1), \TMP3
991
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992
movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993
movdqa \TMP3, \XMM1
994
movdqu 16(%arg3,%r11,1), \TMP3
995
pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996
movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997
movdqa \TMP3, \XMM2
998
movdqu 32(%arg3,%r11,1), \TMP3
999
pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000
movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001
movdqa \TMP3, \XMM3
1002
movdqu 48(%arg3,%r11,1), \TMP3
1003
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004
movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005
movdqa \TMP3, \XMM4
1006
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011
pxor \TMP4, \TMP1
1012
pxor \XMM8, \XMM5
1013
pxor \TMP6, \TMP2
1014
pxor \TMP1, \TMP2
1015
pxor \XMM5, \TMP2
1016
movdqa \TMP2, \TMP3
1017
pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018
psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019
pxor \TMP3, \XMM5
1020
pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022
# first phase of reduction
1023
1024
movdqa \XMM5, \TMP2
1025
movdqa \XMM5, \TMP3
1026
movdqa \XMM5, \TMP4
1027
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028
pslld $31, \TMP2 # packed left shift << 31
1029
pslld $30, \TMP3 # packed left shift << 30
1030
pslld $25, \TMP4 # packed left shift << 25
1031
pxor \TMP3, \TMP2 # xor the shifted versions
1032
pxor \TMP4, \TMP2
1033
movdqa \TMP2, \TMP5
1034
psrldq $4, \TMP5 # right shift T5 1 DW
1035
pslldq $12, \TMP2 # left shift T2 3 DWs
1036
pxor \TMP2, \XMM5
1037
1038
# second phase of reduction
1039
1040
movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041
movdqa \XMM5,\TMP3
1042
movdqa \XMM5,\TMP4
1043
psrld $1, \TMP2 # packed right shift >>1
1044
psrld $2, \TMP3 # packed right shift >>2
1045
psrld $7, \TMP4 # packed right shift >>7
1046
pxor \TMP3,\TMP2 # xor the shifted versions
1047
pxor \TMP4,\TMP2
1048
pxor \TMP5, \TMP2
1049
pxor \TMP2, \XMM5
1050
pxor \TMP1, \XMM5 # result is in XMM5
1051
1052
pxor \XMM5, \XMM1
1053
.endm
1054
1055
/* GHASH the last 4 ciphertext blocks. */
1056
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059
# Multiply TMP6 * HashKey (using Karatsuba)
1060
1061
movdqa \XMM1, \TMP6
1062
pshufd $78, \XMM1, \TMP2
1063
pxor \XMM1, \TMP2
1064
movdqa HashKey_4(%rsp), \TMP5
1065
PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066
PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067
movdqa HashKey_4_k(%rsp), \TMP4
1068
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069
movdqa \XMM1, \XMMDst
1070
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072
# Multiply TMP1 * HashKey (using Karatsuba)
1073
1074
movdqa \XMM2, \TMP1
1075
pshufd $78, \XMM2, \TMP2
1076
pxor \XMM2, \TMP2
1077
movdqa HashKey_3(%rsp), \TMP5
1078
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079
PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080
movdqa HashKey_3_k(%rsp), \TMP4
1081
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082
pxor \TMP1, \TMP6
1083
pxor \XMM2, \XMMDst
1084
pxor \TMP2, \XMM1
1085
# results accumulated in TMP6, XMMDst, XMM1
1086
1087
# Multiply TMP1 * HashKey (using Karatsuba)
1088
1089
movdqa \XMM3, \TMP1
1090
pshufd $78, \XMM3, \TMP2
1091
pxor \XMM3, \TMP2
1092
movdqa HashKey_2(%rsp), \TMP5
1093
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094
PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095
movdqa HashKey_2_k(%rsp), \TMP4
1096
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097
pxor \TMP1, \TMP6
1098
pxor \XMM3, \XMMDst
1099
pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101
# Multiply TMP1 * HashKey (using Karatsuba)
1102
movdqa \XMM4, \TMP1
1103
pshufd $78, \XMM4, \TMP2
1104
pxor \XMM4, \TMP2
1105
movdqa HashKey(%rsp), \TMP5
1106
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107
PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108
movdqa HashKey_k(%rsp), \TMP4
1109
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110
pxor \TMP1, \TMP6
1111
pxor \XMM4, \XMMDst
1112
pxor \XMM1, \TMP2
1113
pxor \TMP6, \TMP2
1114
pxor \XMMDst, \TMP2
1115
# middle section of the temp results combined as in karatsuba algorithm
1116
movdqa \TMP2, \TMP4
1117
pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118
psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119
pxor \TMP4, \XMMDst
1120
pxor \TMP2, \TMP6
1121
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122
# first phase of the reduction
1123
movdqa \XMMDst, \TMP2
1124
movdqa \XMMDst, \TMP3
1125
movdqa \XMMDst, \TMP4
1126
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127
pslld $31, \TMP2 # packed left shifting << 31
1128
pslld $30, \TMP3 # packed left shifting << 30
1129
pslld $25, \TMP4 # packed left shifting << 25
1130
pxor \TMP3, \TMP2 # xor the shifted versions
1131
pxor \TMP4, \TMP2
1132
movdqa \TMP2, \TMP7
1133
psrldq $4, \TMP7 # right shift TMP7 1 DW
1134
pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135
pxor \TMP2, \XMMDst
1136
1137
# second phase of the reduction
1138
movdqa \XMMDst, \TMP2
1139
# make 3 copies of XMMDst for doing 3 shift operations
1140
movdqa \XMMDst, \TMP3
1141
movdqa \XMMDst, \TMP4
1142
psrld $1, \TMP2 # packed right shift >> 1
1143
psrld $2, \TMP3 # packed right shift >> 2
1144
psrld $7, \TMP4 # packed right shift >> 7
1145
pxor \TMP3, \TMP2 # xor the shifted versions
1146
pxor \TMP4, \TMP2
1147
pxor \TMP7, \TMP2
1148
pxor \TMP2, \XMMDst
1149
pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150
.endm
1151
1152
/* Encrypt a single block. */
1153
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155
pxor (%arg1), \XMM0
1156
movaps 16(%arg1), \TMP1
1157
AESENC \TMP1, \XMM0
1158
movaps 32(%arg1), \TMP1
1159
AESENC \TMP1, \XMM0
1160
movaps 48(%arg1), \TMP1
1161
AESENC \TMP1, \XMM0
1162
movaps 64(%arg1), \TMP1
1163
AESENC \TMP1, \XMM0
1164
movaps 80(%arg1), \TMP1
1165
AESENC \TMP1, \XMM0
1166
movaps 96(%arg1), \TMP1
1167
AESENC \TMP1, \XMM0
1168
movaps 112(%arg1), \TMP1
1169
AESENC \TMP1, \XMM0
1170
movaps 128(%arg1), \TMP1
1171
AESENC \TMP1, \XMM0
1172
movaps 144(%arg1), \TMP1
1173
AESENC \TMP1, \XMM0
1174
movaps 160(%arg1), \TMP1
1175
AESENCLAST \TMP1, \XMM0
1176
.endm
1177
1178
1179
/*****************************************************************************
1180
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181
* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182
* const u8 *in, // Ciphertext input
1183
* u64 plaintext_len, // Length of data in bytes for decryption.
1184
* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185
* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186
* // concatenated with 0x00000001. 16-byte aligned pointer.
1187
* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188
* const u8 *aad, // Additional Authentication Data (AAD)
1189
* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190
* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191
* // given authentication tag and only return the plaintext if they match.
1192
* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193
* // (most likely), 12 or 8.
1194
*
1195
* Assumptions:
1196
*
1197
* keys:
1198
* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199
* set of 11 keys in the data structure void *aes_ctx
1200
*
1201
* iv:
1202
* 0 1 2 3
1203
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205
* | Salt (From the SA) |
1206
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207
* | Initialization Vector |
1208
* | (This is the sequence number from IPSec header) |
1209
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210
* | 0x1 |
1211
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212
*
1213
*
1214
*
1215
* AAD:
1216
* AAD padded to 128 bits with 0
1217
* for example, assume AAD is a u32 vector
1218
*
1219
* if AAD is 8 bytes:
1220
* AAD[3] = {A0, A1};
1221
* padded AAD in xmm register = {A1 A0 0 0}
1222
*
1223
* 0 1 2 3
1224
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226
* | SPI (A1) |
1227
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228
* | 32-bit Sequence Number (A0) |
1229
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230
* | 0x0 |
1231
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232
*
1233
* AAD Format with 32-bit Sequence Number
1234
*
1235
* if AAD is 12 bytes:
1236
* AAD[3] = {A0, A1, A2};
1237
* padded AAD in xmm register = {A2 A1 A0 0}
1238
*
1239
* 0 1 2 3
1240
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1242
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244
* | SPI (A2) |
1245
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246
* | 64-bit Extended Sequence Number {A1,A0} |
1247
* | |
1248
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249
* | 0x0 |
1250
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251
*
1252
* AAD Format with 64-bit Extended Sequence Number
1253
*
1254
* aadLen:
1255
* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256
* The code supports 16 too but for other sizes, the code will fail.
1257
*
1258
* TLen:
1259
* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260
* For other sizes, the code will fail.
1261
*
1262
* poly = x^128 + x^127 + x^126 + x^121 + 1
1263
*
1264
*****************************************************************************/
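/*
 * Illustrative call sketch (not part of the original source). It only shows
 * how the pre-counter block described above is assembled and how the
 * parameters line up with the prototype; the buffer names (aes_ctx, salt,
 * esp_iv, hash_subkey, aad, ciphertext, plaintext, ...) are hypothetical and
 * FPU context handling by the caller is omitted.
 *
 *	u8 j0[16] __attribute__((aligned(16)));
 *	u8 tag[16];
 *
 *	memcpy(j0, salt, 4);			// 4-byte salt from the SA
 *	memcpy(j0 + 4, esp_iv, 8);		// 8-byte IV from the ESP payload
 *	*(__be32 *)(j0 + 12) = cpu_to_be32(1);	// trailing 0x00000001
 *
 *	aesni_gcm_dec(aes_ctx,			// pre-expanded key schedule
 *		      plaintext, ciphertext, ciphertext_len,
 *		      j0, hash_subkey,
 *		      aad, aad_len,		// 8 or 12 bytes for RFC4106
 *		      tag, 16);			// caller compares tag afterwards
 */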
1265
1266
ENTRY(aesni_gcm_dec)
1267
push %r12
1268
push %r13
1269
push %r14
1270
mov %rsp, %r14
1271
/*
1272
* states of %xmm registers %xmm6:%xmm15 not saved
1273
* all %xmm registers are clobbered
1274
*/
1275
sub $VARIABLE_OFFSET, %rsp
1276
and $~63, %rsp # align rsp to 64 bytes
1277
mov %arg6, %r12
1278
movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279
movdqa SHUF_MASK(%rip), %xmm2
1280
PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283
# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285
movdqa %xmm13, %xmm2
1286
psllq $1, %xmm13
1287
psrlq $63, %xmm2
1288
movdqa %xmm2, %xmm1
1289
pslldq $8, %xmm2
1290
psrldq $8, %xmm1
1291
por %xmm2, %xmm13
1292
1293
# Reduction
1294
1295
pshufd $0x24, %xmm1, %xmm2
1296
pcmpeqd TWOONE(%rip), %xmm2
1297
pand POLY(%rip), %xmm2
1298
pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
1301
# Decrypt first few blocks
1302
1303
movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305
and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306
mov %r13, %r12
1307
and $(3<<4), %r12
1308
jz _initial_num_blocks_is_0_decrypt
1309
cmp $(2<<4), %r12
1310
jb _initial_num_blocks_is_1_decrypt
1311
je _initial_num_blocks_is_2_decrypt
1312
_initial_num_blocks_is_3_decrypt:
1313
INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315
sub $48, %r13
1316
jmp _initial_blocks_decrypted
1317
_initial_num_blocks_is_2_decrypt:
1318
INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320
sub $32, %r13
1321
jmp _initial_blocks_decrypted
1322
_initial_num_blocks_is_1_decrypt:
1323
INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325
sub $16, %r13
1326
jmp _initial_blocks_decrypted
1327
_initial_num_blocks_is_0_decrypt:
1328
INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330
_initial_blocks_decrypted:
1331
cmp $0, %r13
1332
je _zero_cipher_left_decrypt
1333
sub $64, %r13
1334
je _four_cipher_left_decrypt
1335
_decrypt_by_4:
1336
GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338
add $64, %r11
1339
sub $64, %r13
1340
jne _decrypt_by_4
1341
_four_cipher_left_decrypt:
1342
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344
_zero_cipher_left_decrypt:
1345
mov %arg4, %r13
1346
and $15, %r13 # %r13 = arg4 (mod 16)
1347
je _multiple_of_16_bytes_decrypt
1348
1349
# Handle the last <16 byte block separately
1350
1351
paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352
movdqa SHUF_MASK(%rip), %xmm10
1353
PSHUFB_XMM %xmm10, %xmm0
1354
1355
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356
sub $16, %r11
1357
add %r13, %r11
1358
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359
lea SHIFT_MASK+16(%rip), %r12
1360
sub %r13, %r12
1361
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362
# (%r13 is the number of bytes in plaintext mod 16)
1363
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1365
1366
movdqa %xmm1, %xmm2
1367
pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371
pand %xmm1, %xmm2
1372
movdqa SHUF_MASK(%rip), %xmm10
1373
PSHUFB_XMM %xmm10 ,%xmm2
1374
1375
pxor %xmm2, %xmm8
1376
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377
# GHASH computation for the last <16 byte block
1378
sub %r13, %r11
1379
add $16, %r11
1380
1381
# output %r13 bytes
1382
MOVQ_R64_XMM %xmm0, %rax
1383
cmp $8, %r13
1384
jle _less_than_8_bytes_left_decrypt
1385
mov %rax, (%arg2 , %r11, 1)
1386
add $8, %r11
1387
psrldq $8, %xmm0
1388
MOVQ_R64_XMM %xmm0, %rax
1389
sub $8, %r13
1390
_less_than_8_bytes_left_decrypt:
1391
mov %al, (%arg2, %r11, 1)
1392
add $1, %r11
1393
shr $8, %rax
1394
sub $1, %r13
1395
jne _less_than_8_bytes_left_decrypt
1396
_multiple_of_16_bytes_decrypt:
1397
mov arg8, %r12 # %r12 = aadLen (number of bytes)
1398
shl $3, %r12 # convert into number of bits
1399
movd %r12d, %xmm15 # len(A) in %xmm15
1400
shl $3, %arg4 # len(C) in bits (*8)
1401
MOVQ_R64_XMM %arg4, %xmm1
1402
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404
pxor %xmm15, %xmm8
1405
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406
# final GHASH computation
1407
movdqa SHUF_MASK(%rip), %xmm10
1408
PSHUFB_XMM %xmm10, %xmm8
1409
1410
mov %arg5, %rax # %rax = *Y0
1411
movdqu (%rax), %xmm0 # %xmm0 = Y0
1412
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413
pxor %xmm8, %xmm0
1414
_return_T_decrypt:
1415
mov arg9, %r10 # %r10 = authTag
1416
mov arg10, %r11 # %r11 = auth_tag_len
1417
cmp $16, %r11
1418
je _T_16_decrypt
1419
cmp $12, %r11
1420
je _T_12_decrypt
1421
_T_8_decrypt:
1422
MOVQ_R64_XMM %xmm0, %rax
1423
mov %rax, (%r10)
1424
jmp _return_T_done_decrypt
1425
_T_12_decrypt:
1426
MOVQ_R64_XMM %xmm0, %rax
1427
mov %rax, (%r10)
1428
psrldq $8, %xmm0
1429
movd %xmm0, %eax
1430
mov %eax, 8(%r10)
1431
jmp _return_T_done_decrypt
1432
_T_16_decrypt:
1433
movdqu %xmm0, (%r10)
1434
_return_T_done_decrypt:
1435
mov %r14, %rsp
1436
pop %r14
1437
pop %r13
1438
pop %r12
1439
ret
1440
1441
1442
/*****************************************************************************
1443
* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444
* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445
* const u8 *in, // Plaintext input
1446
* u64 plaintext_len, // Length of data in bytes for encryption.
1447
* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448
* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449
* // concatenated with 0x00000001. 16-byte aligned pointer.
1450
* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451
* const u8 *aad, // Additional Authentication Data (AAD)
1452
* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453
* u8 *auth_tag, // Authenticated Tag output.
1454
* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455
* // 12 or 8.
1456
*
1457
* Assumptions:
1458
*
1459
* keys:
1460
* keys are pre-expanded and aligned to 16 bytes. we are using the
1461
* first set of 11 keys in the data structure void *aes_ctx
1462
*
1463
*
1464
* iv:
1465
* 0 1 2 3
1466
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468
* | Salt (From the SA) |
1469
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470
* | Initialization Vector |
1471
* | (This is the sequence number from IPSec header) |
1472
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473
* | 0x1 |
1474
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475
*
1476
*
1477
*
1478
* AAD:
1479
* AAD padded to 128 bits with 0
1480
* for example, assume AAD is a u32 vector
1481
*
1482
* if AAD is 8 bytes:
1483
* AAD[3] = {A0, A1};
1484
* padded AAD in xmm register = {A1 A0 0 0}
1485
*
1486
* 0 1 2 3
1487
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489
* | SPI (A1) |
1490
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491
* | 32-bit Sequence Number (A0) |
1492
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493
* | 0x0 |
1494
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495
*
1496
* AAD Format with 32-bit Sequence Number
1497
*
1498
* if AAD is 12 bytes:
1499
* AAD[3] = {A0, A1, A2};
1500
* padded AAD in xmm register = {A2 A1 A0 0}
1501
*
1502
* 0 1 2 3
1503
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505
* | SPI (A2) |
1506
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507
* | 64-bit Extended Sequence Number {A1,A0} |
1508
* | |
1509
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510
* | 0x0 |
1511
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512
*
1513
* AAD Format with 64-bit Extended Sequence Number
1514
*
1515
* aadLen:
1516
* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517
* The code supports 16 too but for other sizes, the code will fail.
1518
*
1519
* TLen:
1520
* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521
* For other sizes, the code will fail.
1522
*
1523
* poly = x^128 + x^127 + x^126 + x^121 + 1
1524
***************************************************************************/
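/*
 * Illustrative call sketch (not part of the original source), mirroring the
 * decrypt example above. The tag is written directly after the ciphertext, as
 * an RFC4106 ESP packet would carry it; all buffer names are hypothetical.
 *
 *	aesni_gcm_enc(aes_ctx, ciphertext, plaintext, plaintext_len,
 *		      j0, hash_subkey, aad, aad_len,
 *		      ciphertext + plaintext_len, 16);	// 16-byte auth tag
 */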
1525
ENTRY(aesni_gcm_enc)
1526
push %r12
1527
push %r13
1528
push %r14
1529
mov %rsp, %r14
1530
#
1531
# states of %xmm registers %xmm6:%xmm15 not saved
1532
# all %xmm registers are clobbered
1533
#
1534
sub $VARIABLE_OFFSET, %rsp
1535
and $~63, %rsp
1536
mov %arg6, %r12
1537
movdqu (%r12), %xmm13
1538
movdqa SHUF_MASK(%rip), %xmm2
1539
PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544
movdqa %xmm13, %xmm2
1545
psllq $1, %xmm13
1546
psrlq $63, %xmm2
1547
movdqa %xmm2, %xmm1
1548
pslldq $8, %xmm2
1549
psrldq $8, %xmm1
1550
por %xmm2, %xmm13
1551
1552
# reduce HashKey<<1
1553
1554
pshufd $0x24, %xmm1, %xmm2
1555
pcmpeqd TWOONE(%rip), %xmm2
1556
pand POLY(%rip), %xmm2
1557
pxor %xmm2, %xmm13
1558
movdqa %xmm13, HashKey(%rsp)
1559
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1560
and $-16, %r13
1561
mov %r13, %r12
1562
1563
# Encrypt first few blocks
1564
1565
and $(3<<4), %r12
1566
jz _initial_num_blocks_is_0_encrypt
1567
cmp $(2<<4), %r12
1568
jb _initial_num_blocks_is_1_encrypt
1569
je _initial_num_blocks_is_2_encrypt
1570
_initial_num_blocks_is_3_encrypt:
1571
INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573
sub $48, %r13
1574
jmp _initial_blocks_encrypted
1575
_initial_num_blocks_is_2_encrypt:
1576
INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578
sub $32, %r13
1579
jmp _initial_blocks_encrypted
1580
_initial_num_blocks_is_1_encrypt:
1581
INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583
sub $16, %r13
1584
jmp _initial_blocks_encrypted
1585
_initial_num_blocks_is_0_encrypt:
1586
INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588
_initial_blocks_encrypted:
1589
1590
# Main loop - Encrypt remaining blocks
1591
1592
cmp $0, %r13
1593
je _zero_cipher_left_encrypt
1594
sub $64, %r13
1595
je _four_cipher_left_encrypt
1596
_encrypt_by_4_encrypt:
1597
GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599
add $64, %r11
1600
sub $64, %r13
1601
jne _encrypt_by_4_encrypt
1602
_four_cipher_left_encrypt:
1603
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605
_zero_cipher_left_encrypt:
1606
mov %arg4, %r13
1607
and $15, %r13 # %r13 = arg4 (mod 16)
1608
je _multiple_of_16_bytes_encrypt
1609
1610
# Handle the last <16 Byte block separately
1611
paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612
movdqa SHUF_MASK(%rip), %xmm10
1613
PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617
sub $16, %r11
1618
add %r13, %r11
1619
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620
lea SHIFT_MASK+16(%rip), %r12
1621
sub %r13, %r12
1622
# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623
# (%r13 is the number of bytes in plaintext mod 16)
1624
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625
PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1626
pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628
# get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629
pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630
movdqa SHUF_MASK(%rip), %xmm10
1631
PSHUFB_XMM %xmm10,%xmm0
1632
1633
pxor %xmm0, %xmm8
1634
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635
# GHASH computation for the last <16 byte block
1636
sub %r13, %r11
1637
add $16, %r11
1638
1639
movdqa SHUF_MASK(%rip), %xmm10
1640
PSHUFB_XMM %xmm10, %xmm0
1641
1642
# shuffle xmm0 back to output as ciphertext
1643
1644
# Output %r13 bytes
1645
MOVQ_R64_XMM %xmm0, %rax
1646
cmp $8, %r13
1647
jle _less_than_8_bytes_left_encrypt
1648
mov %rax, (%arg2 , %r11, 1)
1649
add $8, %r11
1650
psrldq $8, %xmm0
1651
MOVQ_R64_XMM %xmm0, %rax
1652
sub $8, %r13
1653
_less_than_8_bytes_left_encrypt:
1654
mov %al, (%arg2, %r11, 1)
1655
add $1, %r11
1656
shr $8, %rax
1657
sub $1, %r13
1658
jne _less_than_8_bytes_left_encrypt
1659
_multiple_of_16_bytes_encrypt:
1660
mov arg8, %r12 # %r12 = aadLen (number of bytes)
1661
shl $3, %r12
1662
movd %r12d, %xmm15 # len(A) in %xmm15
1663
shl $3, %arg4 # len(C) in bits (*8)
1664
MOVQ_R64_XMM %arg4, %xmm1
1665
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667
pxor %xmm15, %xmm8
1668
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669
# final GHASH computation
1670
movdqa SHUF_MASK(%rip), %xmm10
1671
PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673
mov %arg5, %rax # %rax = *Y0
1674
movdqu (%rax), %xmm0 # %xmm0 = Y0
1675
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676
pxor %xmm8, %xmm0
1677
_return_T_encrypt:
1678
mov arg9, %r10 # %r10 = authTag
1679
mov arg10, %r11 # %r11 = auth_tag_len
1680
cmp $16, %r11
1681
je _T_16_encrypt
1682
cmp $12, %r11
1683
je _T_12_encrypt
1684
_T_8_encrypt:
1685
MOVQ_R64_XMM %xmm0, %rax
1686
mov %rax, (%r10)
1687
jmp _return_T_done_encrypt
1688
_T_12_encrypt:
1689
MOVQ_R64_XMM %xmm0, %rax
1690
mov %rax, (%r10)
1691
psrldq $8, %xmm0
1692
movd %xmm0, %eax
1693
mov %eax, 8(%r10)
1694
jmp _return_T_done_encrypt
1695
_T_16_encrypt:
1696
movdqu %xmm0, (%r10)
1697
_return_T_done_encrypt:
1698
mov %r14, %rsp
1699
pop %r14
1700
pop %r13
1701
pop %r12
1702
ret
1703
1704
#endif


_key_expansion_128:
_key_expansion_256a:
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret

.align 4
_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0

movaps %xmm2, %xmm5
movaps %xmm2, %xmm6
pslldq $4, %xmm5
pshufd $0b11111111, %xmm0, %xmm3
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2

movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
ret

.align 4
_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
pxor %xmm4, %xmm0
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0

movaps %xmm2, %xmm5
pslldq $4, %xmm5
pshufd $0b11111111, %xmm0, %xmm3
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2

movaps %xmm0, (TKEYP)
add $0x10, TKEYP
ret

.align 4
_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
pxor %xmm4, %xmm2
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
ret

/*
* int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
#ifndef __x86_64__
pushl KEYP
movl 8(%esp), KEYP # ctx
movl 12(%esp), UKEYP # in_key
movl 16(%esp), %edx # key_len
#endif
movups (UKEYP), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (KEYP)
lea 0x10(KEYP), TKEYP # key addr
movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
movups 0x10(UKEYP), %xmm2 # other user key
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_256a
AESKEYGENASSIST 0x2 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_256a
AESKEYGENASSIST 0x4 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_256a
AESKEYGENASSIST 0x8 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_256a
AESKEYGENASSIST 0x10 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_256a
AESKEYGENASSIST 0x20 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_192b
AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_192a
AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_192b
AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_192a
AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_192b
AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_192a
AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
call _key_expansion_192b
jmp .Ldec_key
.Lenc_key128:
AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
call _key_expansion_128
AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
call _key_expansion_128
AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
call _key_expansion_128
AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
call _key_expansion_128
AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
call _key_expansion_128
AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
call _key_expansion_128
AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
call _key_expansion_128
AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
call _key_expansion_128
AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
call _key_expansion_128
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
sub $0x10, TKEYP
movaps (KEYP), %xmm0
movaps (TKEYP), %xmm1
movaps %xmm0, 240(TKEYP)
movaps %xmm1, 240(KEYP)
add $0x10, KEYP
lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
movaps (KEYP), %xmm0
AESIMC %xmm0 %xmm1
movaps %xmm1, (UKEYP)
add $0x10, KEYP
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
ret
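
/*
 * The .Ldec_key loop above prepares the schedule for the Equivalent
 * Inverse Cipher: the encryption round keys are used in reverse order
 * and the inner ones are run through AESIMC (InvMixColumns). A loose
 * C-intrinsics sketch of the same construction (hypothetical helper,
 * not an interface of this file):
 *
 *    // enc[0..nr] are the encryption round keys; nr = 10, 12 or 14
 *    static void aes_make_dec_schedule(const __m128i *enc,
 *                                      __m128i *dec, int nr)
 *    {
 *        int i;
 *
 *        dec[0] = enc[nr];              // last enc key comes first
 *        for (i = 1; i < nr; i++)       // middle keys: InvMixColumns
 *            dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *        dec[nr] = enc[0];              // original key closes the schedule
 *    }
 */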

/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
#ifndef __x86_64__
pushl KEYP
pushl KLEN
movl 12(%esp), KEYP
movl 16(%esp), OUTP
movl 20(%esp), INP
#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
call _aesni_enc1
movups STATE, (OUTP) # output
#ifndef __x86_64__
popl KLEN
popl KEYP
#endif
ret

/*
* _aesni_enc1: internal ABI
* input:
* KEYP: key struct pointer
* KLEN: key length
* STATE: initial state (input)
* output:
* STATE: final state (output)
* changed:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_enc1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE # round 0
add $0x30, TKEYP
cmp $24, KLEN
jb .Lenc128
lea 0x20(TKEYP), TKEYP
je .Lenc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
AESENC KEY STATE
movaps -0x50(TKEYP), KEY
AESENC KEY STATE
.align 4
.Lenc192:
movaps -0x40(TKEYP), KEY
AESENC KEY STATE
movaps -0x30(TKEYP), KEY
AESENC KEY STATE
.align 4
.Lenc128:
movaps -0x20(TKEYP), KEY
AESENC KEY STATE
movaps -0x10(TKEYP), KEY
AESENC KEY STATE
movaps (TKEYP), KEY
AESENC KEY STATE
movaps 0x10(TKEYP), KEY
AESENC KEY STATE
movaps 0x20(TKEYP), KEY
AESENC KEY STATE
movaps 0x30(TKEYP), KEY
AESENC KEY STATE
movaps 0x40(TKEYP), KEY
AESENC KEY STATE
movaps 0x50(TKEYP), KEY
AESENC KEY STATE
movaps 0x60(TKEYP), KEY
AESENC KEY STATE
movaps 0x70(TKEYP), KEY
AESENCLAST KEY STATE
ret
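
/*
 * _aesni_enc1 whitens with round key 0, runs nr-1 AESENC rounds and one
 * AESENCLAST, picking nr = 10/12/14 from the 16/24/32-byte key length
 * held in KLEN. A compact intrinsics sketch of the same flow
 * (illustrative only, hypothetical helper):
 *
 *    static __m128i aes_enc_block(const __m128i *rk, int key_len, __m128i b)
 *    {
 *        int nr = key_len / 4 + 6;      // 10, 12 or 14 rounds
 *        int i;
 *
 *        b = _mm_xor_si128(b, rk[0]);   // round 0: AddRoundKey
 *        for (i = 1; i < nr; i++)
 *            b = _mm_aesenc_si128(b, rk[i]);
 *        return _mm_aesenclast_si128(b, rk[nr]);
 *    }
 */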

/*
* _aesni_enc4: internal ABI
* input:
* KEYP: key struct pointer
* KLEN: key length
* STATE1: initial state (input)
* STATE2
* STATE3
* STATE4
* output:
* STATE1: final state (output)
* STATE2
* STATE3
* STATE4
* changed:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_enc4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE1 # round 0
pxor KEY, STATE2
pxor KEY, STATE3
pxor KEY, STATE4
add $0x30, TKEYP
cmp $24, KLEN
jb .L4enc128
lea 0x20(TKEYP), TKEYP
je .L4enc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps -0x50(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
#.align 4
.L4enc192:
movaps -0x40(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps -0x30(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
#.align 4
.L4enc128:
movaps -0x20(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps -0x10(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps (TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x10(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x20(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x30(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x40(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x50(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x60(TKEYP), KEY
AESENC KEY STATE1
AESENC KEY STATE2
AESENC KEY STATE3
AESENC KEY STATE4
movaps 0x70(TKEYP), KEY
AESENCLAST KEY STATE1 # last round
AESENCLAST KEY STATE2
AESENCLAST KEY STATE3
AESENCLAST KEY STATE4
ret
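
/*
 * _aesni_enc4 is the same round sequence applied to four independent
 * blocks; interleaving the AESENC instructions keeps the pipelined AES
 * unit busy and hides its latency. The per-round step, roughly
 * (illustrative sketch only):
 *
 *    static void aes_enc_round_x4(__m128i s[4], __m128i rk)
 *    {
 *        s[0] = _mm_aesenc_si128(s[0], rk);
 *        s[1] = _mm_aesenc_si128(s[1], rk);
 *        s[2] = _mm_aesenc_si128(s[2], rk);
 *        s[3] = _mm_aesenc_si128(s[3], rk);
 *    }
 */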

/*
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
#ifndef __x86_64__
pushl KEYP
pushl KLEN
movl 12(%esp), KEYP
movl 16(%esp), OUTP
movl 20(%esp), INP
#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
movups (INP), STATE # input
call _aesni_dec1
movups STATE, (OUTP) # output
#ifndef __x86_64__
popl KLEN
popl KEYP
#endif
ret

/*
* _aesni_dec1: internal ABI
* input:
* KEYP: key struct pointer
* KLEN: key length
* STATE: initial state (input)
* output:
* STATE: final state (output)
* changed:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_dec1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE # round 0
add $0x30, TKEYP
cmp $24, KLEN
jb .Ldec128
lea 0x20(TKEYP), TKEYP
je .Ldec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
AESDEC KEY STATE
movaps -0x50(TKEYP), KEY
AESDEC KEY STATE
.align 4
.Ldec192:
movaps -0x40(TKEYP), KEY
AESDEC KEY STATE
movaps -0x30(TKEYP), KEY
AESDEC KEY STATE
.align 4
.Ldec128:
movaps -0x20(TKEYP), KEY
AESDEC KEY STATE
movaps -0x10(TKEYP), KEY
AESDEC KEY STATE
movaps (TKEYP), KEY
AESDEC KEY STATE
movaps 0x10(TKEYP), KEY
AESDEC KEY STATE
movaps 0x20(TKEYP), KEY
AESDEC KEY STATE
movaps 0x30(TKEYP), KEY
AESDEC KEY STATE
movaps 0x40(TKEYP), KEY
AESDEC KEY STATE
movaps 0x50(TKEYP), KEY
AESDEC KEY STATE
movaps 0x60(TKEYP), KEY
AESDEC KEY STATE
movaps 0x70(TKEYP), KEY
AESDECLAST KEY STATE
ret

/*
* _aesni_dec4: internal ABI
* input:
* KEYP: key struct pointer
* KLEN: key length
* STATE1: initial state (input)
* STATE2
* STATE3
* STATE4
* output:
* STATE1: final state (output)
* STATE2
* STATE3
* STATE4
* changed:
* KEY
* TKEYP (T1)
*/
.align 4
_aesni_dec4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
pxor KEY, STATE1 # round 0
pxor KEY, STATE2
pxor KEY, STATE3
pxor KEY, STATE4
add $0x30, TKEYP
cmp $24, KLEN
jb .L4dec128
lea 0x20(TKEYP), TKEYP
je .L4dec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps -0x50(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
.align 4
.L4dec192:
movaps -0x40(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps -0x30(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
.align 4
.L4dec128:
movaps -0x20(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps -0x10(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps (TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x10(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x20(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x30(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x40(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x50(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x60(TKEYP), KEY
AESDEC KEY STATE1
AESDEC KEY STATE2
AESDEC KEY STATE3
AESDEC KEY STATE4
movaps 0x70(TKEYP), KEY
AESDECLAST KEY STATE1 # last round
AESDECLAST KEY STATE2
AESDECLAST KEY STATE3
AESDECLAST KEY STATE4
ret

/*
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len)
*/
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
movl 16(%esp), KEYP
movl 20(%esp), OUTP
movl 24(%esp), INP
movl 28(%esp), LEN
#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
cmp $16, LEN
jb .Lecb_enc_ret
cmp $64, LEN
jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
movups (INP), STATE1
movups 0x10(INP), STATE2
movups 0x20(INP), STATE3
movups 0x30(INP), STATE4
call _aesni_enc4
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
sub $64, LEN
add $64, INP
add $64, OUTP
cmp $64, LEN
jge .Lecb_enc_loop4
cmp $16, LEN
jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
movups (INP), STATE1
call _aesni_enc1
movups STATE1, (OUTP)
sub $16, LEN
add $16, INP
add $16, OUTP
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
#endif
ret
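
/*
 * aesni_ecb_enc walks the input in 64-byte strides through _aesni_enc4
 * and finishes the remaining whole blocks with _aesni_enc1; a trailing
 * fragment shorter than 16 bytes is not processed. Roughly, in C
 * (aes_enc4()/aes_enc1() are hypothetical stand-ins for the internal
 * routines):
 *
 *    static void ecb_encrypt(const struct crypto_aes_ctx *ctx,
 *                            u8 *dst, const u8 *src, size_t len)
 *    {
 *        while (len >= 64) {            // four blocks per iteration
 *            aes_enc4(ctx, dst, src);
 *            src += 64; dst += 64; len -= 64;
 *        }
 *        while (len >= 16) {            // single-block tail
 *            aes_enc1(ctx, dst, src);
 *            src += 16; dst += 16; len -= 16;
 *        }
 *    }
 */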

/*
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len);
*/
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
movl 16(%esp), KEYP
movl 20(%esp), OUTP
movl 24(%esp), INP
movl 28(%esp), LEN
#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
add $240, KEYP
cmp $16, LEN
jb .Lecb_dec_ret
cmp $64, LEN
jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
movups (INP), STATE1
movups 0x10(INP), STATE2
movups 0x20(INP), STATE3
movups 0x30(INP), STATE4
call _aesni_dec4
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
sub $64, LEN
add $64, INP
add $64, OUTP
cmp $64, LEN
jge .Lecb_dec_loop4
cmp $16, LEN
jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
movups (INP), STATE1
call _aesni_dec1
movups STATE1, (OUTP)
sub $16, LEN
add $16, INP
add $16, OUTP
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
#endif
ret

/*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
movl 20(%esp), KEYP
movl 24(%esp), OUTP
movl 28(%esp), INP
movl 32(%esp), LEN
movl 36(%esp), IVP
#endif
cmp $16, LEN
jb .Lcbc_enc_ret
mov 480(KEYP), KLEN
movups (IVP), STATE # load iv as initial state
.align 4
.Lcbc_enc_loop:
movups (INP), IN # load input
pxor IN, STATE
call _aesni_enc1
movups STATE, (OUTP) # store output
sub $16, LEN
add $16, INP
add $16, OUTP
cmp $16, LEN
jge .Lcbc_enc_loop
movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
popl IVP
#endif
ret
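
/*
 * CBC encryption is inherently serial: each plaintext block is XORed
 * with the previous ciphertext block (the IV for the first one) before
 * it is encrypted, so only the single-block helper can be used.
 * Sketch (aes_enc1() hypothetical; u8/size_t/memcpy as usual):
 *
 *    static void cbc_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                            const u8 *src, size_t len, u8 iv[16])
 *    {
 *        u8 state[16];
 *        int i;
 *
 *        if (len < 16)
 *            return;
 *        memcpy(state, iv, 16);
 *        while (len >= 16) {
 *            for (i = 0; i < 16; i++)
 *                state[i] ^= src[i];        // P ^ previous ciphertext
 *            aes_enc1(ctx, state, state);   // C = E(K, P ^ prev)
 *            memcpy(dst, state, 16);
 *            src += 16; dst += 16; len -= 16;
 *        }
 *        memcpy(iv, state, 16);             // hand back the new IV
 *    }
 */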

/*
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
movl 20(%esp), KEYP
movl 24(%esp), OUTP
movl 28(%esp), INP
movl 32(%esp), LEN
movl 36(%esp), IVP
#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
add $240, KEYP
movups (IVP), IV
cmp $64, LEN
jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
movups (INP), IN1
movaps IN1, STATE1
movups 0x10(INP), IN2
movaps IN2, STATE2
#ifdef __x86_64__
movups 0x20(INP), IN3
movaps IN3, STATE3
movups 0x30(INP), IN4
movaps IN4, STATE4
#else
movups 0x20(INP), IN1
movaps IN1, STATE3
movups 0x30(INP), IN2
movaps IN2, STATE4
#endif
call _aesni_dec4
pxor IV, STATE1
#ifdef __x86_64__
pxor IN1, STATE2
pxor IN2, STATE3
pxor IN3, STATE4
movaps IN4, IV
#else
pxor (INP), STATE2
pxor 0x10(INP), STATE3
pxor IN1, STATE4
movaps IN2, IV
#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
sub $64, LEN
add $64, INP
add $64, OUTP
cmp $64, LEN
jge .Lcbc_dec_loop4
cmp $16, LEN
jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
movups (INP), IN
movaps IN, STATE
call _aesni_dec1
pxor IV, STATE
movups STATE, (OUTP)
movaps IN, IV
sub $16, LEN
add $16, INP
add $16, OUTP
cmp $16, LEN
jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
popl KLEN
popl KEYP
popl LEN
popl IVP
#endif
ret
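
/*
 * CBC decryption has no chain on the cipher input, which is why four
 * ciphertext blocks can go through _aesni_dec4 at once; each plaintext
 * is then recovered by XORing with the preceding ciphertext block
 * (kept in IN1..IN4/IV above). Sketch (aes_dec1() hypothetical):
 *
 *    static void cbc_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                            const u8 *src, size_t len, u8 iv[16])
 *    {
 *        u8 prev[16], tmp[16];
 *        int i;
 *
 *        if (len < 16)
 *            return;
 *        memcpy(prev, iv, 16);
 *        while (len >= 16) {
 *            aes_dec1(ctx, tmp, src);       // D(K, C)
 *            for (i = 0; i < 16; i++)
 *                tmp[i] ^= prev[i];         // P = D(K, C) ^ previous C
 *            memcpy(prev, src, 16);         // chain this ciphertext forward
 *            memcpy(dst, tmp, 16);
 *            src += 16; dst += 16; len -= 16;
 *        }
 *        memcpy(iv, prev, 16);
 *    }
 */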

#ifdef __x86_64__
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
* _aesni_inc_init: internal ABI
* setup registers used by _aesni_inc
* input:
* IV
* output:
* CTR: == IV, in little endian
* TCTR_LOW: == lower qword of CTR
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
*/
.align 4
_aesni_inc_init:
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
PSHUFB_XMM BSWAP_MASK CTR
mov $1, TCTR_LOW
MOVQ_R64_XMM TCTR_LOW INC
MOVQ_R64_XMM CTR TCTR_LOW
ret

/*
* _aesni_inc: internal ABI
* Increase IV by 1, IV is in big endian
* input:
* IV
* CTR: == IV, in little endian
* TCTR_LOW: == lower qword of CTR
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
* output:
* IV: Increase by 1
* changed:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
*/
.align 4
_aesni_inc:
paddq INC, CTR
add $1, TCTR_LOW
jnc .Linc_low
pslldq $8, INC
paddq INC, CTR
psrldq $8, INC
.Linc_low:
movaps CTR, IV
PSHUFB_XMM BSWAP_MASK IV
ret
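
/*
 * _aesni_inc keeps the counter little endian in CTR, bumps the low
 * qword and only touches the high qword when that addition carries,
 * then byte-swaps back so IV stays big endian. The same idea on two
 * 64-bit halves (sketch; __builtin_bswap64 stands in for the PSHUFB
 * byte swap):
 *
 *    static void ctr128_inc(u64 ctr_be[2])    // [0] = high, [1] = low half
 *    {
 *        u64 lo = __builtin_bswap64(ctr_be[1]);
 *        u64 hi = __builtin_bswap64(ctr_be[0]);
 *
 *        if (++lo == 0)                       // carry into the high qword
 *            hi++;
 *        ctr_be[1] = __builtin_bswap64(lo);
 *        ctr_be[0] = __builtin_bswap64(hi);
 *    }
 */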

/*
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
*/
ENTRY(aesni_ctr_enc)
cmp $16, LEN
jb .Lctr_enc_just_ret
mov 480(KEYP), KLEN
movups (IVP), IV
call _aesni_inc_init
cmp $64, LEN
jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
movaps IV, STATE1
call _aesni_inc
movups (INP), IN1
movaps IV, STATE2
call _aesni_inc
movups 0x10(INP), IN2
movaps IV, STATE3
call _aesni_inc
movups 0x20(INP), IN3
movaps IV, STATE4
call _aesni_inc
movups 0x30(INP), IN4
call _aesni_enc4
pxor IN1, STATE1
movups STATE1, (OUTP)
pxor IN2, STATE2
movups STATE2, 0x10(OUTP)
pxor IN3, STATE3
movups STATE3, 0x20(OUTP)
pxor IN4, STATE4
movups STATE4, 0x30(OUTP)
sub $64, LEN
add $64, INP
add $64, OUTP
cmp $64, LEN
jge .Lctr_enc_loop4
cmp $16, LEN
jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
movaps IV, STATE
call _aesni_inc
movups (INP), IN
call _aesni_enc1
pxor IN, STATE
movups STATE, (OUTP)
sub $16, LEN
add $16, INP
add $16, OUTP
cmp $16, LEN
jge .Lctr_enc_loop1
.Lctr_enc_ret:
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
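
/*
 * aesni_ctr_enc never feeds the data through AES itself: it encrypts
 * successive counter values to build a keystream and XORs that with
 * the input, four blocks at a time when possible; decryption is the
 * identical operation. Roughly (aes_enc1() and ctr128_inc() are the
 * hypothetical helpers sketched earlier; counter assumed 16-byte
 * aligned):
 *
 *    static void ctr_crypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                          const u8 *src, size_t len, u8 counter[16])
 *    {
 *        u8 ks[16];
 *        int i;
 *
 *        while (len >= 16) {
 *            aes_enc1(ctx, ks, counter);      // keystream = E(K, counter)
 *            ctr128_inc((u64 *)counter);      // next counter block
 *            for (i = 0; i < 16; i++)
 *                dst[i] = src[i] ^ ks[i];
 *            src += 16; dst += 16; len -= 16;
 *        }
 *    }
 */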
#endif