Path: arch/x86/crypto/aes-gcm-aesni-x86_64.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512, only 16 SIMD registers are available instead of 32. We
//      work around this by being much more careful about using registers,
//      relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512, which would be more
//      beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.
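//
//      Illustrative note (not part of the original design discussion): writing
//      a 128-bit GHASH operand as a = a_H*x^64 + a_L and b = b_H*x^64 + b_L,
//      schoolbook multiplication computes the four products a_L*b_L, a_L*b_H,
//      a_H*b_L, and a_H*b_H, i.e. 4 pclmulqdq per block. Karatsuba instead
//      computes a_L*b_L, a_H*b_H, and (a_L + a_H)*(b_L + b_H), recovering the
//      middle term as (a_L + a_H)*(b_L + b_H) + a_L*b_L + a_H*b_H (addition
//      being XOR here), i.e. 3 pclmulqdq per block plus the extra XORs and the
//      precomputed XOR'd halves of the key powers mentioned above.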

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	0
#define OFFSETOF_AESROUNDKEYS	16
#define OFFSETOF_H_POWERS	272
#define OFFSETOF_H_POWERS_XORED	400
#define OFFSETOF_H_TIMES_X64	464

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro	_vpand		src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm
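
// For illustration, with USE_AVX=1 the wrapper macros above expand directly to
// the VEX-coded instruction, e.g.
//	_vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm3
// becomes
//	vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm3
// while with USE_AVX=0 it becomes the two-instruction fallback
//	movdqa		%xmm2, %xmm3
//	pclmulqdq	$0x00, %xmm1, %xmm3
// (hence the requirement that the operands be distinct). The register choices
// here are just an example.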

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm
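
// For example, _aes_gcm_aad_update below multiplies the GHASH accumulator by
// H^1 with:
//	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
// where H_POW1_X64 holds the precomputed H^1 * x^64 and %xmm0-%xmm1 are
// scratch registers.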

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm
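
// Illustration of the indexing scheme used by these macros: the hash key
// powers are stored highest-first (H^8 at OFFSETOF_H_POWERS, H^1 at
// OFFSETOF_H_POWERS+7*16). So when the block counter %rax is 8 (the second
// block of the set), OFFSETOF_H_POWERS(KEY,%rax,2) addresses H^7 (16 bytes per
// power) and OFFSETOF_H_POWERS_XORED(KEY,%rax) addresses the XOR'd-together
// halves of H^7 (8 bytes per power).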

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x	enc
	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm
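
// Note on the 'pxor LO, MI; pxor GHASH_ACC, MI' in step 0 above (and the
// equivalent XORs in _ghash_reduce): Karatsuba gives
//	MI = (a_L + a_H)*(b_L + b_H) = a_L*b_L + a_L*b_H + a_H*b_L + a_H*b_H
// so XOR'ing in LO (the accumulated a_L*b_L terms) and HI (the accumulated
// a_H*b_H terms) cancels the unwanted terms and leaves just the middle part
// a_L*b_H + a_H*b_L that the reduction needs.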

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		OFFSETOF_AESROUNDKEYS(KEY), H_POW1
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm
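
// For reference, the full-block loop above implements GHASH over the AAD by
// Horner's rule: for each block A_i, GHASH_ACC = (GHASH_ACC + A_i) * H^1.
// E.g. after two AAD blocks A0 and A1 the accumulator holds A0*H^2 + A1*H^1;
// the ciphertext blocks and the lengths block later extend the same polynomial.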

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		OFFSETOF_AESROUNDKEYS(KEY), TMP1	// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
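	// For example, with AES-128 (AESKEYLEN = 16) the lea above yields
	// KEY + OFFSETOF_AESROUNDKEYS + 10*16, i.e. the last of the 11 round
	// keys; AES-192 and AES-256 land on round keys 12 and 14 respectively.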

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	add		$-8*16, DATALEN
	jl		.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	_aesenc_8x	TMP0
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	movdqa		(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x
	// Don't increment DST until the ciphertext blocks have been hashed.
	sub		$-8*16, SRC
	add		$-8*16, DATALEN
	jl		.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x	\enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub		$-8*16, DST
.endif
	_xor_data_8x
	sub		$-8*16, SRC
.if !\enc
	sub		$-8*16, DST
.endif
	add		$-8*16, DATALEN
	jge		.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x	\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub		$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

	sub		$-8*16, DATALEN
	jz		.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor		LO, LO
	pxor		MI, MI
	pxor		HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.
	mov		DATALEN, %eax
	neg		%eax
	and		$~15, %eax
	sar		$1, %eax
	add		$64, %eax
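	// E.g. DATALEN = 53 (three full blocks plus a 5-byte partial block, so
	// n = 4) gives %eax = 8*(8-4) = 32, which indexes H^4 via
	// OFFSETOF_H_POWERS(KEY,%rax,2) in the loop below.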

	sub		$16, DATALEN
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd		ONE, LE_CTR
	pxor		OFFSETOF_AESROUNDKEYS(KEY), TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
192:
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0
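
	// Note: the numeric labels 128 and 192 above are just local labels
	// named after the AES key sizes. AES-128 skips the first four aesenc
	// (10 rounds total), AES-192 skips the first two (12 rounds), and
	// AES-256 falls through and does all 14 rounds.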

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu		TMP0, (DST)
.else
	movdqu		(SRC), TMP1
	pxor		TMP1, TMP0
	movdqu		TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
.else
	pshufb		BSWAP_MASK, TMP1
	pxor		TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	add		$8, %eax
	add		$16, SRC
	add		$16, DST
	sub		$16, DATALEN
	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, DATALEN
	jz		.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	pxor		OFFSETOF_AESROUNDKEYS(KEY), LE_CTR
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
1:
	aesenc		(%rsi), LE_CTR
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	mov		DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor		TMP0, LE_CTR
	mov		DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea		.Lzeropad_mask+16(%rip), %rax
	sub		DATALEN64, %rax
	_vpand		(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	mov		$1, %eax
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		OFFSETOF_AESROUNDKEYS(KEY,AESKEYLEN64,4), %rax
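	// E.g. for AES-128 (AESKEYLEN = 16) %rax points to round key 4 of
	// 0..10, so the offsets -3*16 through 5*16 used below cover rounds 1-9
	// and 6*16 is the last round key; for AES-256 it is round key 8 of
	// 0..14.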

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	pxor		OFFSETOF_AESROUNDKEYS(KEY), %xmm0
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
192:
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	xor		%eax, %eax
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete		%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)