GitHub Repository: torvalds/linux
Path: arch/x86/crypto/aes-gcm-aesni-x86_64.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
//   there is only one AES block (and GHASH block) per register.
//
// - Without AVX512, only 16 SIMD registers are available instead of 32. We
//   work around this by being much more careful about using registers,
//   relying heavily on loads to load values as they are needed.
//
// - Masking is not available either. We work around this by implementing
//   partial block loads and stores using overlapping scalar loads and stores
//   combined with shifts and SSE4.1 insertion and extraction instructions.
//
// - The main loop is organized differently due to the different design
//   constraints. First, with just one AES block per SIMD register, on some
//   CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//   do an 8-register wide loop. Considering that and the fact that we have
//   just 16 SIMD registers to work with, it's not feasible to cache AES
//   round keys and GHASH key powers in registers across loop iterations.
//   That's not ideal, but also not actually that bad, since loads can run in
//   parallel with other instructions. Significantly, this also makes it
//   possible to roll up the inner loops, relying on hardware loop unrolling
//   instead of software loop unrolling, greatly reducing code size.
//
// - We implement the GHASH multiplications in the main loop using Karatsuba
//   multiplication instead of schoolbook multiplication. This saves one
//   pclmulqdq instruction per block, at the cost of one 64-bit load, one
//   pshufd, and 0.25 pxors per block. (This is without the three-argument
//   XOR support that would be provided by AVX512, which would be more
//   beneficial to schoolbook than Karatsuba.)
//
//   As a rough approximation, we can assume that Karatsuba multiplication is
//   faster than schoolbook multiplication in this context if one pshufd and
//   0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//   load is "free" due to running in parallel with arithmetic instructions.)
//   This is true on AMD CPUs, including all that support pclmulqdq up to at
//   least Zen 3. It's also true on older Intel CPUs: Westmere through
//   Haswell on the Core side, and Silvermont through Goldmont Plus on the
//   low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//   benefit of Karatsuba should be substantial. On newer Intel CPUs,
//   schoolbook multiplication should be faster, but only marginally.
//
//   Not all these CPUs were available to be tested. However, benchmarks on
//   available CPUs suggest that this approximation is plausible. Switching
//   to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//   Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//   Considering that and the fact that Karatsuba should be even more
//   beneficial on older Intel CPUs, it seems like the right choice here.
//
//   An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//   saved by using a multiplication-less reduction method. We don't do that
//   because it would require a large number of shift and xor instructions,
//   making it less worthwhile and likely harmful on newer CPUs.
//
//   It does make sense to sometimes use a different reduction optimization
//   that saves a pclmulqdq, though: precompute the hash key times x^64, and
//   multiply the low half of the data block by the hash key with the extra
//   factor of x^64. This eliminates one step of the reduction. However,
//   this is incompatible with Karatsuba multiplication. Therefore, for
//   multi-block processing we use Karatsuba multiplication with a regular
//   reduction. For single-block processing, we use the x^64 optimization.
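//
//   As a rough sketch of the Karatsuba identity used here (addition below is
//   GF(2) addition, i.e. XOR): writing a = a_H*x^64 + a_L and
//   b = b_H*x^64 + b_L, the unreduced 255-bit product is
//
//       a*b = HI*x^128 + (MI + LO + HI)*x^64 + LO
//
//   where LO = a_L*b_L, HI = a_H*b_H, and MI = (a_L + a_H)*(b_L + b_H). The
//   true middle coefficient a_L*b_H + a_H*b_L equals MI + LO + HI, which is
//   why the reduction code first XOR's LO and HI into MI. This costs 3
//   pclmulqdq instructions per block instead of schoolbook's 4, in exchange
//   for the extra XORs and the precomputed XOR'd-together key halves.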

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
        .octa 0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
        .quad 0xc200000000000000
.Lone:
        .quad 1
.Lgfpoly_and_internal_carrybit:
        .octa 0xc2000000000000010000000000000001
// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
        .octa 0xffffffffffffffffffffffffffffffff
        .octa 0
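// For example, loading 16 bytes from '.Lzeropad_mask + 16 - 3' yields 3 bytes
// of 0xff followed by 13 zero bytes, i.e. a mask that keeps only the low 3
// bytes of a block. This is used below for partial-block and partial-tag
// masking.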

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN 0
#define OFFSETOF_AESROUNDKEYS 16
#define OFFSETOF_H_POWERS 272
#define OFFSETOF_H_POWERS_XORED 400
#define OFFSETOF_H_TIMES_X64 464

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro _vpclmulqdq imm, src1, src2, dst
.if USE_AVX
        vpclmulqdq \imm, \src1, \src2, \dst
.else
        movdqa \src2, \dst
        pclmulqdq \imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro _vpshufb src1, src2, dst
.if USE_AVX
        vpshufb \src1, \src2, \dst
.else
        movdqa \src2, \dst
        pshufb \src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro _vpand src1, src2, dst
.if USE_AVX
        vpand \src1, \src2, \dst
.else
        movdqu \src1, \dst
        pand \src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro _xor_mem_to_reg mem, reg, tmp
.if USE_AVX
        vpxor \mem, \reg, \reg
.else
        movdqu \mem, \tmp
        pxor \tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro _test_mem mem, reg, tmp
.if USE_AVX
        vptest \mem, \reg
.else
        movdqu \mem, \tmp
        ptest \tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _load_partial_block src, dst, tmp64, tmp32
        sub $8, %ecx // LEN - 8
        jle .Lle8\@

        // Load 9 <= LEN <= 15 bytes.
        movq (\src), \dst // Load first 8 bytes
        mov (\src, %rcx), %rax // Load last 8 bytes
        neg %ecx
        shl $3, %ecx
        shr %cl, %rax // Discard overlapping bytes
        pinsrq $1, %rax, \dst
        jmp .Ldone\@

.Lle8\@:
        add $4, %ecx // LEN - 4
        jl .Llt4\@

        // Load 4 <= LEN <= 8 bytes.
        mov (\src), %eax // Load first 4 bytes
        mov (\src, %rcx), \tmp32 // Load last 4 bytes
        jmp .Lcombine\@

.Llt4\@:
        // Load 1 <= LEN <= 3 bytes.
        add $2, %ecx // LEN - 2
        movzbl (\src), %eax // Load first byte
        jl .Lmovq\@
        movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
.Lcombine\@:
        shl $3, %ecx
        shl %cl, \tmp64
        or \tmp64, %rax // Combine the two parts
.Lmovq\@:
        movq %rax, \dst
.Ldone\@:
.endm
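// For example, for LEN = 13 the 9-15 byte path above loads bytes 0-7 with the
// movq and bytes 5-12 with the 8-byte load at \src + 5. The computed shift
// count is (8 - 13)*8 = -40, which 'shr' masks to 24, so the 3 overlapping
// bytes are discarded; bytes 8-12 then land in the low 5 bytes of the value
// inserted into the high half of \dst, and the top 3 bytes stay zero.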

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro _store_partial_block src, dst
        sub $8, %ecx // LEN - 8
        jl .Llt8\@

        // Store 8 <= LEN <= 15 bytes.
        pextrq $1, \src, %rax
        mov %ecx, %esi
        shl $3, %ecx
        ror %cl, %rax
        mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
        movq \src, (\dst) // Store first 8 bytes
        jmp .Ldone\@

.Llt8\@:
        add $4, %ecx // LEN - 4
        jl .Llt4\@

        // Store 4 <= LEN <= 7 bytes.
        pextrd $1, \src, %eax
        mov %ecx, %esi
        shl $3, %ecx
        ror %cl, %eax
        mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
        movd \src, (\dst) // Store first 4 bytes
        jmp .Ldone\@

.Llt4\@:
        // Store 1 <= LEN <= 3 bytes.
        pextrb $0, \src, 0(\dst)
        cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
        jl .Ldone\@
        pextrb $1, \src, 1(\dst)
        je .Ldone\@
        pextrb $2, \src, 2(\dst)
.Ldone\@:
.endm
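// Note that in the two overlapping-store paths above, the last LEN-8 (or
// LEN-4) bytes are stored before the first 8 (or 4) bytes: the rotate places
// the wanted bytes at the top of the stored value, and whatever it writes
// below offset 8 (or 4) is then overwritten by the store of the first part,
// so no byte of \dst past offset LEN-1 is ever written.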

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1

        // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
        _vpclmulqdq $0x01, \a, \b, \t0
.elseif \i == 1
        _vpclmulqdq $0x00, \a_times_x64, \b, \t1
.elseif \i == 2
        pxor \t1, \t0

        // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
        _vpclmulqdq $0x11, \a, \b, \t1
.elseif \i == 4
        pclmulqdq $0x10, \a_times_x64, \b
.elseif \i == 5
        pxor \t1, \b
.elseif \i == 6

        // Fold MI into HI.
        pshufd $0x4e, \t0, \t1 // Swap halves of MI
.elseif \i == 7
        pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
        pxor \t1, \b
.elseif \i == 9
        pxor \t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
        _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0

        // LO += a_L * b_L
        _vpclmulqdq $0x00, \a, \b, \t0
        pxor \t0, \lo

        // b_L + b_H
        pshufd $0x4e, \b, \t0
        pxor \b, \t0

        // HI += a_H * b_H
        pclmulqdq $0x11, \a, \b
        pxor \b, \hi

        // MI += (a_L + a_H) * (b_L + b_H)
        pclmulqdq $0x00, \a_xored, \t0
        pxor \t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro _ghash_reduce lo, mi, hi, dst, t0

        movq .Lgfpoly(%rip), \t0

        // MI += LO + HI (needed because we used Karatsuba multiplication)
        pxor \lo, \mi
        pxor \hi, \mi

        // Fold LO into MI.
        pshufd $0x4e, \lo, \dst
        pclmulqdq $0x00, \t0, \lo
        pxor \dst, \mi
        pxor \lo, \mi

        // Fold MI into HI.
        pshufd $0x4e, \mi, \dst
        pclmulqdq $0x00, \t0, \mi
        pxor \hi, \dst
        pxor \mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//      GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//                  blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro _ghash_update_begin_8x enc

        // Initialize the inner block counter.
        xor %eax, %eax

        // Load the highest hash key power, H^8.
        movdqa OFFSETOF_H_POWERS(KEY), TMP0

        // Load the first ciphertext block and byte-reflect it.
.if \enc
        movdqu (DST), TMP1
.else
        movdqu (SRC), TMP1
.endif
        pshufb BSWAP_MASK, TMP1

        // Add the GHASH accumulator to the ciphertext block to get the block
        // 'b' that needs to be multiplied with the hash key power 'a'.
        pxor TMP1, GHASH_ACC

        // b_L + b_H
        pshufd $0x4e, GHASH_ACC, MI
        pxor GHASH_ACC, MI

        // LO = a_L * b_L
        _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO

        // HI = a_H * b_H
        pclmulqdq $0x11, TMP0, GHASH_ACC

        // MI = (a_L + a_H) * (b_L + b_H)
        pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro _ghash_update_continue_8x enc
        add $8, %eax

        // Load the next lowest key power.
        movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

        // Load the next ciphertext block and byte-reflect it.
.if \enc
        movdqu (DST,%rax,2), TMP1
.else
        movdqu (SRC,%rax,2), TMP1
.endif
        pshufb BSWAP_MASK, TMP1

        // LO += a_L * b_L
        _vpclmulqdq $0x00, TMP0, TMP1, TMP2
        pxor TMP2, LO

        // b_L + b_H
        pshufd $0x4e, TMP1, TMP2
        pxor TMP1, TMP2

        // HI += a_H * b_H
        pclmulqdq $0x11, TMP0, TMP1
        pxor TMP1, GHASH_ACC

        // MI += (a_L + a_H) * (b_L + b_H)
        movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
        pclmulqdq $0x00, TMP1, TMP2
        pxor TMP2, MI
.endm
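// For example, when %rax = 8*blknum for blknum = 0..7, the operand
// OFFSETOF_H_POWERS(KEY,%rax,2) addresses the key power H^(8-blknum) (the
// powers are stored highest first; see _aes_gcm_precompute), and
// (DST,%rax,2) or (SRC,%rax,2) addresses ciphertext block number blknum, so
// the single counter covers both the 8*blknum and 16*blknum indexing
// mentioned above.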

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro _ghash_update_end_8x_step i
.if \i == 0
        movq .Lgfpoly(%rip), TMP1
        pxor LO, MI
        pxor GHASH_ACC, MI
        pshufd $0x4e, LO, TMP2
        pclmulqdq $0x00, TMP1, LO
        pxor TMP2, MI
        pxor LO, MI
.elseif \i == 1
        pshufd $0x4e, MI, TMP2
        pclmulqdq $0x00, TMP1, MI
        pxor TMP2, GHASH_ACC
        pxor MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro _aes_gcm_precompute

        // Function arguments
        .set KEY, %rdi

        // Additional local variables.
        // %xmm0-%xmm1 and %rax are used as temporaries.
        .set RNDKEYLAST_PTR, %rsi
        .set H_CUR, %xmm2
        .set H_POW1, %xmm3 // H^1
        .set H_POW1_X64, %xmm4 // H^1 * x^64
        .set GFPOLY, %xmm5

        // Encrypt an all-zeroes block to get the raw hash subkey.
        movl OFFSETOF_AESKEYLEN(KEY), %eax
        lea OFFSETOF_AESROUNDKEYS+6*16(KEY,%rax,4), RNDKEYLAST_PTR
        movdqa OFFSETOF_AESROUNDKEYS(KEY), H_POW1
        lea OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
        aesenc (%rax), H_POW1
        add $16, %rax
        cmp %rax, RNDKEYLAST_PTR
        jne 1b
        aesenclast (RNDKEYLAST_PTR), H_POW1

        // Preprocess the raw hash subkey as needed to operate on GHASH's
        // bit-reflected values directly: reflect its bytes, then multiply it by
        // x^-1 (using the backwards interpretation of polynomial coefficients
        // from the GCM spec) or equivalently x^1 (using the alternative,
        // natural interpretation of polynomial coefficients).
        pshufb .Lbswap_mask(%rip), H_POW1
        movdqa H_POW1, %xmm0
        pshufd $0xd3, %xmm0, %xmm0
        psrad $31, %xmm0
        paddq H_POW1, H_POW1
        pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0
        pxor %xmm0, H_POW1

        // Store H^1.
        movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

        // Compute and store H^1 * x^64.
        movq .Lgfpoly(%rip), GFPOLY
        pshufd $0x4e, H_POW1, %xmm0
        _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64
        pxor %xmm0, H_POW1_X64
        movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

        // Compute and store the halves of H^1 XOR'd together.
        pxor H_POW1, %xmm0
        movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

        // Compute and store the remaining key powers H^2 through H^8.
        movdqa H_POW1, H_CUR
        mov $6*8, %eax
.Lprecompute_next\@:
        // Compute H^i = H^{i-1} * H^1.
        _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
        // Store H^i.
        movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
        // Compute and store the halves of H^i XOR'd together.
        pshufd $0x4e, H_CUR, %xmm0
        pxor H_CUR, %xmm0
        movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
        sub $8, %eax
        jge .Lprecompute_next\@

        RET
.endm
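// The resulting layout in the key struct is, for i = 1 through 8:
//      H^i                  at OFFSETOF_H_POWERS       + (8-i)*16
//      (H^i)_L XOR (H^i)_H  at OFFSETOF_H_POWERS_XORED + (8-i)*8
// i.e. the highest power H^8 comes first, matching the ascending block
// counter used by _ghash_update_begin_8x and _ghash_update_continue_8x.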

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//                               u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
.macro _aes_gcm_aad_update

        // Function arguments
        .set KEY, %rdi
        .set GHASH_ACC_PTR, %rsi
        .set AAD, %rdx
        .set AADLEN, %ecx
        // Note: _load_partial_block relies on AADLEN being in %ecx.

        // Additional local variables.
        // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
        .set BSWAP_MASK, %xmm2
        .set GHASH_ACC, %xmm3
        .set H_POW1, %xmm4 // H^1
        .set H_POW1_X64, %xmm5 // H^1 * x^64
        .set GFPOLY, %xmm6

        movdqa .Lbswap_mask(%rip), BSWAP_MASK
        movdqu (GHASH_ACC_PTR), GHASH_ACC
        movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
        movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
        movq .Lgfpoly(%rip), GFPOLY

        // Process the AAD one full block at a time.
        sub $16, AADLEN
        jl .Laad_loop_1x_done\@
.Laad_loop_1x\@:
        movdqu (AAD), %xmm0
        pshufb BSWAP_MASK, %xmm0
        pxor %xmm0, GHASH_ACC
        _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
        add $16, AAD
        sub $16, AADLEN
        jge .Laad_loop_1x\@
.Laad_loop_1x_done\@:
        // Check whether there is a partial block at the end.
        add $16, AADLEN
        jz .Laad_done\@

        // Process a partial block of length 1 <= AADLEN <= 15.
        // _load_partial_block assumes that %ecx contains AADLEN.
        _load_partial_block AAD, %xmm0, %r10, %r10d
        pshufb BSWAP_MASK, %xmm0
        pxor %xmm0, GHASH_ACC
        _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
        movdqu GHASH_ACC, (GHASH_ACC_PTR)
        RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
.macro _ctr_begin_8x
        movq .Lone(%rip), TMP0
        movdqa OFFSETOF_AESROUNDKEYS(KEY), TMP1 // zero-th round key
.irp i, 0,1,2,3,4,5,6,7
        _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
        pxor TMP1, AESDATA\i
        paddd TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenc_8x round_key
.irp i, 0,1,2,3,4,5,6,7
        aesenc \round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenclast_8x round_key
.irp i, 0,1,2,3,4,5,6,7
        aesenclast \round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.macro _xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
        _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
        movdqu AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//                                        const u32 le_ctr[4], u8 ghash_acc[16],
//                                        const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro _aes_gcm_update enc

        // Function arguments
        .set KEY, %rdi
        .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
        .set GHASH_ACC_PTR, %rdx
        .set SRC, %rcx
        .set DST, %r8
        .set DATALEN, %r9d
        .set DATALEN64, %r9 // Zero-extend DATALEN before using!
        // Note: the code setting up for _load_partial_block assumes that SRC is
        // in %rcx (and that DATALEN is *not* in %rcx).

        // Additional local variables

        // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
        // with LE_CTR_PTR, which is used only at the beginning.

        .set AESKEYLEN, %r10d // AES key length in bytes
        .set AESKEYLEN64, %r10
        .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key

        // Put the most frequently used values in %xmm0-%xmm7 to reduce code
        // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
        .set TMP0, %xmm0
        .set TMP1, %xmm1
        .set TMP2, %xmm2
        .set LO, %xmm3 // Low part of unreduced product
        .set MI, %xmm4 // Middle part of unreduced product
        .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
                              // the high part of unreduced product
        .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
        .set LE_CTR, %xmm7 // Little-endian counter value
        .set AESDATA0, %xmm8
        .set AESDATA1, %xmm9
        .set AESDATA2, %xmm10
        .set AESDATA3, %xmm11
        .set AESDATA4, %xmm12
        .set AESDATA5, %xmm13
        .set AESDATA6, %xmm14
        .set AESDATA7, %xmm15

        movdqa .Lbswap_mask(%rip), BSWAP_MASK
        movdqu (GHASH_ACC_PTR), GHASH_ACC
        movdqu (LE_CTR_PTR), LE_CTR

        movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
        lea OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
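        // (The number of AES rounds is AESKEYLEN/4 + 6, so the last round key
        // is at byte offset OFFSETOF_AESROUNDKEYS + 4*AESKEYLEN + 6*16, which
        // is what this lea computes.)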

        // If there are at least 8*16 bytes of data, then continue into the main
        // loop, which processes 8*16 bytes of data per iteration.
        //
        // The main loop interleaves AES and GHASH to improve performance on
        // CPUs that can execute these instructions in parallel. When
        // decrypting, the GHASH input (the ciphertext) is immediately
        // available. When encrypting, we instead encrypt a set of 8 blocks
        // first and then GHASH those blocks while encrypting the next set of 8,
        // repeat that as needed, and finally GHASH the last set of 8 blocks.
        //
        // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
        // as this makes the immediate fit in a signed byte, saving 3 bytes.
        add $-8*16, DATALEN
        jl .Lcrypt_loop_8x_done\@
.if \enc
        // Encrypt the first 8 plaintext blocks.
        _ctr_begin_8x
        lea OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
        .p2align 4
1:
        movdqa (%rsi), TMP0
        _aesenc_8x TMP0
        add $16, %rsi
        cmp %rsi, RNDKEYLAST_PTR
        jne 1b
        movdqa (%rsi), TMP0
        _aesenclast_8x TMP0
        _xor_data_8x
        // Don't increment DST until the ciphertext blocks have been hashed.
        sub $-8*16, SRC
        add $-8*16, DATALEN
        jl .Lghash_last_ciphertext_8x\@
.endif

        .p2align 4
.Lcrypt_loop_8x\@:

        // Generate the next set of 8 counter blocks and start encrypting them.
        _ctr_begin_8x
        lea OFFSETOF_AESROUNDKEYS+16(KEY), %rsi

        // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
        // by doing the unreduced multiplication for the first ciphertext block.
        movdqa (%rsi), TMP0
        add $16, %rsi
        _aesenc_8x TMP0
        _ghash_update_begin_8x \enc

        // Do 7 more rounds of AES, and continue the GHASH update by doing the
        // unreduced multiplication for the remaining ciphertext blocks.
        .p2align 4
1:
        movdqa (%rsi), TMP0
        add $16, %rsi
        _aesenc_8x TMP0
        _ghash_update_continue_8x \enc
        cmp $7*8, %eax
        jne 1b

        // Do the remaining AES rounds.
        .p2align 4
1:
        movdqa (%rsi), TMP0
        add $16, %rsi
        _aesenc_8x TMP0
        cmp %rsi, RNDKEYLAST_PTR
        jne 1b

        // Do the GHASH reduction and the last round of AES.
        movdqa (RNDKEYLAST_PTR), TMP0
        _ghash_update_end_8x_step 0
        _aesenclast_8x TMP0
        _ghash_update_end_8x_step 1

        // XOR the data with the AES-CTR keystream blocks.
.if \enc
        sub $-8*16, DST
.endif
        _xor_data_8x
        sub $-8*16, SRC
.if !\enc
        sub $-8*16, DST
.endif
        add $-8*16, DATALEN
        jge .Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
        // Update GHASH with the last set of 8 ciphertext blocks.
        _ghash_update_begin_8x \enc
        .p2align 4
1:
        _ghash_update_continue_8x \enc
        cmp $7*8, %eax
        jne 1b
        _ghash_update_end_8x_step 0
        _ghash_update_end_8x_step 1
        sub $-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

        sub $-8*16, DATALEN
        jz .Ldone\@

        // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
        // things simple and keep the code size down by just going one block at
        // a time, again taking advantage of hardware loop unrolling. Since
        // there are enough key powers available for all remaining data, we do
        // the GHASH multiplications unreduced, and only reduce at the very end.

        .set HI, TMP2
        .set H_POW, AESDATA0
        .set H_POW_XORED, AESDATA1
        .set ONE, AESDATA2

        movq .Lone(%rip), ONE

        // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
        pxor LO, LO
        pxor MI, MI
        pxor HI, HI

        // Set up a block counter %rax to contain 8*(8-n), where n is the number
        // of blocks that remain, counting any partial block. This will be used
        // to access the key powers H^n through H^1.
        mov DATALEN, %eax
        neg %eax
        and $~15, %eax
        sar $1, %eax
        add $64, %eax
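        // For example, DATALEN = 40 leaves n = 3 blocks:
        // 40 -> -40 -> -48 -> -24 -> 40 = 8*(8-3). DATALEN = 17 leaves
        // n = 2: 17 -> -17 -> -32 -> -16 -> 48 = 8*(8-2).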

        sub $16, DATALEN
        jl .Lcrypt_loop_1x_done\@

        // Process the data one full block at a time.
.Lcrypt_loop_1x\@:

        // Encrypt the next counter block.
        _vpshufb BSWAP_MASK, LE_CTR, TMP0
        paddd ONE, LE_CTR
        pxor OFFSETOF_AESROUNDKEYS(KEY), TMP0
        lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
        cmp $24, AESKEYLEN
        jl 128f // AES-128?
        je 192f // AES-192?
        // AES-256
        aesenc -7*16(%rsi), TMP0
        aesenc -6*16(%rsi), TMP0
192:
        aesenc -5*16(%rsi), TMP0
        aesenc -4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
        aesenc \i*16(%rsi), TMP0
.endr
        aesenclast (RNDKEYLAST_PTR), TMP0

        // Load the next key power H^i.
        movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
        movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

        // XOR the keystream block that was just generated in TMP0 with the next
        // source data block and store the resulting en/decrypted data to DST.
.if \enc
        _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
        movdqu TMP0, (DST)
.else
        movdqu (SRC), TMP1
        pxor TMP1, TMP0
        movdqu TMP0, (DST)
.endif

        // Update GHASH with the ciphertext block.
.if \enc
        pshufb BSWAP_MASK, TMP0
        pxor TMP0, GHASH_ACC
.else
        pshufb BSWAP_MASK, TMP1
        pxor TMP1, GHASH_ACC
.endif
        _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
        pxor GHASH_ACC, GHASH_ACC

        add $8, %eax
        add $16, SRC
        add $16, DST
        sub $16, DATALEN
        jge .Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
        // Check whether there is a partial block at the end.
        add $16, DATALEN
        jz .Lghash_reduce\@

        // Process a partial block of length 1 <= DATALEN <= 15.

        // Encrypt a counter block for the last time.
        pshufb BSWAP_MASK, LE_CTR
        pxor OFFSETOF_AESROUNDKEYS(KEY), LE_CTR
        lea OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
1:
        aesenc (%rsi), LE_CTR
        add $16, %rsi
        cmp %rsi, RNDKEYLAST_PTR
        jne 1b
        aesenclast (RNDKEYLAST_PTR), LE_CTR

        // Load the lowest key power, H^1.
        movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
        movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

        // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
        // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
        // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
        mov SRC, RNDKEYLAST_PTR
        mov DATALEN, %ecx
        _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi

        // XOR the keystream block that was just generated in LE_CTR with the
        // source data block and store the resulting en/decrypted data to DST.
        pxor TMP0, LE_CTR
        mov DATALEN, %ecx
        _store_partial_block LE_CTR, DST

        // If encrypting, zero-pad the final ciphertext block for GHASH. (If
        // decrypting, this was already done by _load_partial_block.)
.if \enc
        lea .Lzeropad_mask+16(%rip), %rax
        sub DATALEN64, %rax
        _vpand (%rax), LE_CTR, TMP0
.endif

        // Update GHASH with the final ciphertext block.
        pshufb BSWAP_MASK, TMP0
        pxor TMP0, GHASH_ACC
        _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
        // Finally, do the GHASH reduction.
        _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
        // Store the updated GHASH accumulator back to memory.
        movdqu GHASH_ACC, (GHASH_ACC_PTR)

        RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//                                 const u32 le_ctr[4], u8 ghash_acc[16],
//                                 u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//                                 const u32 le_ctr[4], const u8 ghash_acc[16],
//                                 u64 total_aadlen, u64 total_datalen,
//                                 const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro _aes_gcm_final enc

        // Function arguments
        .set KEY, %rdi
        .set LE_CTR_PTR, %rsi
        .set GHASH_ACC_PTR, %rdx
        .set TOTAL_AADLEN, %rcx
        .set TOTAL_DATALEN, %r8
        .set TAG, %r9
        .set TAGLEN, %r10d // Originally at 8(%rsp)
        .set TAGLEN64, %r10

        // Additional local variables.
        // %rax and %xmm0-%xmm2 are used as temporary registers.
        .set AESKEYLEN, %r11d
        .set AESKEYLEN64, %r11
        .set BSWAP_MASK, %xmm3
        .set GHASH_ACC, %xmm4
        .set H_POW1, %xmm5 // H^1
        .set H_POW1_X64, %xmm6 // H^1 * x^64
        .set GFPOLY, %xmm7

        movdqa .Lbswap_mask(%rip), BSWAP_MASK
        movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

        // Set up a counter block with 1 in the low 32-bit word. This is the
        // counter that produces the ciphertext needed to encrypt the auth tag.
        movdqu (LE_CTR_PTR), %xmm0
        mov $1, %eax
        pinsrd $0, %eax, %xmm0

        // Build the lengths block and XOR it into the GHASH accumulator.
        movq TOTAL_DATALEN, GHASH_ACC
        pinsrq $1, TOTAL_AADLEN, GHASH_ACC
        psllq $3, GHASH_ACC // Bytes to bits
        _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1

        movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
        movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
        movq .Lgfpoly(%rip), GFPOLY

        // Make %rax point to the 6th from last AES round key. (Using signed
        // byte offsets -7*16 through 6*16 decreases code size.)
        lea OFFSETOF_AESROUNDKEYS(KEY,AESKEYLEN64,4), %rax

        // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
        // Interleave the AES and GHASH instructions to improve performance.
        pshufb BSWAP_MASK, %xmm0
        pxor OFFSETOF_AESROUNDKEYS(KEY), %xmm0
        cmp $24, AESKEYLEN
        jl 128f // AES-128?
        je 192f // AES-192?
        // AES-256
        aesenc -7*16(%rax), %xmm0
        aesenc -6*16(%rax), %xmm0
192:
        aesenc -5*16(%rax), %xmm0
        aesenc -4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
        aesenc (\i-3)*16(%rax), %xmm0
        _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
        aesenclast 6*16(%rax), %xmm0
        _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

        // Undo the byte reflection of the GHASH accumulator.
        pshufb BSWAP_MASK, GHASH_ACC

        // Encrypt the GHASH accumulator.
        pxor %xmm0, GHASH_ACC

.if \enc
        // Return the computed auth tag.
        movdqu GHASH_ACC, (GHASH_ACC_PTR)
.else
        .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!

        // Verify the auth tag in constant time by XOR'ing the transmitted and
        // computed auth tags together and using the ptest instruction to check
        // whether the first TAGLEN bytes of the result are zero.
        _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
        movl 8(%rsp), TAGLEN
        lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
        sub TAGLEN64, ZEROPAD_MASK_PTR
        xor %eax, %eax
        _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
        sete %al
.endif
        RET
.endm

.set USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
        _aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
        _aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
        _aes_gcm_update 1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
        _aes_gcm_update 0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
        _aes_gcm_final 1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
        _aes_gcm_final 0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
        _aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
        _aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
        _aes_gcm_update 1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
        _aes_gcm_update 0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
        _aes_gcm_final 1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
        _aes_gcm_final 0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
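
// Usage note, summarizing the per-function comments above: a caller runs
// aes_gcm_precompute_* once per expanded AES key, then for each message
// processes the AAD with aes_gcm_aad_update_*, the data with
// aes_gcm_{enc,dec}_update_* (starting with the low word of le_ctr equal to
// 2), and finally produces or verifies the tag with aes_gcm_{enc,dec}_final_*,
// which consumes counter value 1.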