Path: arch/x86/crypto/aes-gcm-aesni-x86_64.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512, only 16 SIMD registers are available instead of 32. We
//      work around this by being much more careful about using registers,
//      relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512, which would be more
//      beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.
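//
//      Illustrative note (not part of the original design discussion): writing
//      a 128-bit GHASH operand as a = a_H*x^64 + a_L and b = b_H*x^64 + b_L,
//      schoolbook multiplication computes the four products a_L*b_L, a_L*b_H,
//      a_H*b_L, and a_H*b_H, i.e. 4 pclmulqdq per block. Karatsuba instead
//      computes a_L*b_L, a_H*b_H, and (a_L + a_H)*(b_L + b_H), recovering the
//      middle term as (a_L + a_H)*(b_L + b_H) + a_L*b_L + a_H*b_H (addition
//      being XOR here), i.e. 3 pclmulqdq per block plus the extra XORs and the
//      precomputed XOR'd halves of the key powers mentioned above.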

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	0
#define OFFSETOF_AESROUNDKEYS	16
#define OFFSETOF_H_POWERS	272
#define OFFSETOF_H_POWERS_XORED	400
#define OFFSETOF_H_TIMES_X64	464

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro	_vpand		src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm
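
// For illustration, with USE_AVX=1 the wrapper macros above expand directly to
// the VEX-coded instruction, e.g.
//	_vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm3
// becomes
//	vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm3
// while with USE_AVX=0 it becomes the two-instruction fallback
//	movdqa		%xmm2, %xmm3
//	pclmulqdq	$0x00, %xmm1, %xmm3
// (hence the requirement that the operands be distinct). The register choices
// here are just an example.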

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm
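
// For example, _aes_gcm_aad_update below multiplies the GHASH accumulator by
// H^1 with:
//	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
// where H_POW1_X64 holds the precomputed H^1 * x^64 and %xmm0-%xmm1 are
// scratch registers.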

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm
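
// Illustration of the indexing scheme used by these macros: the hash key
// powers are stored highest-first (H^8 at OFFSETOF_H_POWERS, H^1 at
// OFFSETOF_H_POWERS+7*16). So when the block counter %rax is 8 (the second
// block of the set), OFFSETOF_H_POWERS(KEY,%rax,2) addresses H^7 (16 bytes per
// power) and OFFSETOF_H_POWERS_XORED(KEY,%rax) addresses the XOR'd-together
// halves of H^7 (8 bytes per power).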

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x	enc
	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm
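
// Note on the 'pxor LO, MI; pxor GHASH_ACC, MI' in step 0 above (and the
// equivalent XORs in _ghash_reduce): Karatsuba gives
//	MI = (a_L + a_H)*(b_L + b_H) = a_L*b_L + a_L*b_H + a_H*b_L + a_H*b_H
// so XOR'ing in LO (the accumulated a_L*b_L terms) and HI (the accumulated
// a_H*b_H terms) cancels the unwanted terms and leaves just the middle part
// a_L*b_H + a_H*b_L that the reduction needs.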

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		OFFSETOF_AESROUNDKEYS(KEY), H_POW1
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm
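
// For reference, the full-block loop above implements GHASH over the AAD by
// Horner's rule: for each block A_i, GHASH_ACC = (GHASH_ACC + A_i) * H^1.
// E.g. after two AAD blocks A0 and A1 the accumulator holds A0*H^2 + A1*H^1;
// the ciphertext blocks and the lengths block later extend the same polynomial.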

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		OFFSETOF_AESROUNDKEYS(KEY), TMP1	// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
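	// For example, with AES-128 (AESKEYLEN = 16) the lea above yields
	// KEY + OFFSETOF_AESROUNDKEYS + 10*16, i.e. the last of the 11 round
	// keys; AES-192 and AES-256 land on round keys 12 and 14 respectively.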

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	add		$-8*16, DATALEN
	jl		.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	_aesenc_8x	TMP0
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	movdqa		(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x
	// Don't increment DST until the ciphertext blocks have been hashed.
	sub		$-8*16, SRC
	add		$-8*16, DATALEN
	jl		.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x	\enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub		$-8*16, DST
.endif
	_xor_data_8x
	sub		$-8*16, SRC
.if !\enc
	sub		$-8*16, DST
.endif
	add		$-8*16, DATALEN
	jge		.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x	\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub		$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

	sub		$-8*16, DATALEN
	jz		.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor		LO, LO
	pxor		MI, MI
	pxor		HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.
	mov		DATALEN, %eax
	neg		%eax
	and		$~15, %eax
	sar		$1, %eax
	add		$64, %eax
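	// E.g. DATALEN = 53 (three full blocks plus a 5-byte partial block, so
	// n = 4) gives %eax = 8*(8-4) = 32, which indexes H^4 via
	// OFFSETOF_H_POWERS(KEY,%rax,2) in the loop below.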

	sub		$16, DATALEN
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd		ONE, LE_CTR
	pxor		OFFSETOF_AESROUNDKEYS(KEY), TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
192:
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0
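
	// Note: the numeric labels 128 and 192 above are just local labels
	// named after the AES key sizes. AES-128 skips the first four aesenc
	// (10 rounds total), AES-192 skips the first two (12 rounds), and
	// AES-256 falls through and does all 14 rounds.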

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu		TMP0, (DST)
.else
	movdqu		(SRC), TMP1
	pxor		TMP1, TMP0
	movdqu		TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
.else
	pshufb		BSWAP_MASK, TMP1
	pxor		TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	add		$8, %eax
	add		$16, SRC
	add		$16, DST
	sub		$16, DATALEN
	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, DATALEN
	jz		.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	pxor		OFFSETOF_AESROUNDKEYS(KEY), LE_CTR
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
1:
	aesenc		(%rsi), LE_CTR
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	mov		DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor		TMP0, LE_CTR
	mov		DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea		.Lzeropad_mask+16(%rip), %rax
	sub		DATALEN64, %rax
	_vpand		(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	mov		$1, %eax
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		OFFSETOF_AESROUNDKEYS(KEY,AESKEYLEN64,4), %rax
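	// E.g. for AES-128 (AESKEYLEN = 16) %rax points to round key 4 of
	// 0..10, so the offsets -3*16 through 5*16 used below cover rounds 1-9
	// and 6*16 is the last round key; for AES-256 it is round key 8 of
	// 0..14.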

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	pxor		OFFSETOF_AESROUNDKEYS(KEY), %xmm0
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
192:
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	xor		%eax, %eax
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete		%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)