Path: arch/x86/crypto/aes-gcm-aesni-x86_64.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32. We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.
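//
//      For reference, with a = a_H*x^64 + a_L and b = b_H*x^64 + b_L
//      (coefficients in GF(2), so '+' is XOR), the 256-bit product
//      a*b = HI*x^128 + MI*x^64 + LO is computed by the two strategies as:
//
//          schoolbook: LO = a_L*b_L, HI = a_H*b_H, MI = a_L*b_H + a_H*b_L
//                      (4 pclmulqdq per block)
//          Karatsuba:  LO = a_L*b_L, HI = a_H*b_H,
//                      MI = (a_L + a_H)*(b_L + b_H) + LO + HI
//                      (3 pclmulqdq per block, plus the extra XORs)
//
//      hence the 'MI += LO + HI' step in _ghash_reduce below when Karatsuba
//      multiplication was used.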

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688
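
// As used by this file, the fields that these offsets refer to hold the
// following:
//
//    - OFFSETOF_AESKEYLEN: the AES key length in bytes, read as a 32-bit value
//    - OFFSETOF_H_POWERS: the 8 hash key powers H^8 through H^1, 16 bytes
//      each, stored highest power first
//    - OFFSETOF_H_POWERS_XORED: for each key power in the same order, its two
//      64-bit halves XOR'd together (as needed for Karatsuba multiplication),
//      8 bytes each
//    - OFFSETOF_H_TIMES_X64: H^1 * x^64 in reduced form, 16 bytes
//
// aes_gcm_precompute_##suffix() fills in all of the GHASH-related fields.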

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa	\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb	\src1, \src2, \dst
.else
	movdqa	\src2, \dst
	pshufb	\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro	_vpand	src1, src2, dst
.if USE_AVX
	vpand	\src1, \src2, \dst
.else
	movdqu	\src1, \dst
	pand	\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor	\mem, \reg, \reg
.else
	movdqu	\mem, \tmp
	pxor	\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest	\mem, \reg
.else
	movdqu	\mem, \tmp
	ptest	\tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub	$8, %ecx		// LEN - 8
	jle	.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq	(\src), \dst		// Load first 8 bytes
	mov	(\src, %rcx), %rax	// Load last 8 bytes
	neg	%ecx
	shl	$3, %ecx
	shr	%cl, %rax		// Discard overlapping bytes
	pinsrq	$1, %rax, \dst
	jmp	.Ldone\@

.Lle8\@:
	add	$4, %ecx		// LEN - 4
	jl	.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov	(\src), %eax		// Load first 4 bytes
	mov	(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp	.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add	$2, %ecx		// LEN - 2
	movzbl	(\src), %eax		// Load first byte
	jl	.Lmovq\@
	movzwl	(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl	$3, %ecx
	shl	%cl, \tmp64
	or	\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq	%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro	_store_partial_block	src, dst
	sub	$8, %ecx		// LEN - 8
	jl	.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq	$1, \src, %rax
	mov	%ecx, %esi
	shl	$3, %ecx
	ror	%cl, %rax
	mov	%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq	\src, (\dst)		// Store first 8 bytes
	jmp	.Ldone\@

.Llt8\@:
	add	$4, %ecx		// LEN - 4
	jl	.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd	$1, \src, %eax
	mov	%ecx, %esi
	shl	$3, %ecx
	ror	%cl, %eax
	mov	%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd	\src, (\dst)		// Store first 4 bytes
	jmp	.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb	$0, \src, 0(\dst)
	cmp	$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl	.Ldone\@
	pextrb	$1, \src, 1(\dst)
	je	.Ldone\@
	pextrb	$2, \src, 2(\dst)
.Ldone\@:
.endm
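
// For example, with LEN = 11, _load_partial_block loads bytes 0-7 of the
// source with movq, loads bytes 3-10 into %rax, shifts %rax right by
// (16 - LEN)*8 = 40 bits to discard the 5 bytes that overlap the first load
// (zero-extending the remaining 3 bytes), and inserts the result as the upper
// half of \dst. _store_partial_block works analogously in reverse: it first
// stores the last 8 bytes at offset LEN - 8 (after rotating them into place),
// then overwrites the overlapping region by storing the first 8 bytes at
// offset 0.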

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor	\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor	\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd	$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor	\t1, \b
.elseif \i == 9
	pxor	\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor	\t0, \lo

	// b_L + b_H
	pshufd	$0x4e, \b, \t0
	pxor	\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor	\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor	\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq	.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor	\lo, \mi
	pxor	\hi, \mi

	// Fold LO into MI.
	pshufd	$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor	\dst, \mi
	pxor	\lo, \mi

	// Fold MI into HI.
	pshufd	$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor	\hi, \dst
	pxor	\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor	%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa	OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu	(DST), TMP1
.else
	movdqu	(SRC), TMP1
.endif
	pshufb	BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor	TMP1, GHASH_ACC

	// b_L + b_H
	pshufd	$0x4e, GHASH_ACC, MI
	pxor	GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x	enc
	add	$8, %eax

	// Load the next lowest key power.
	movdqa	OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu	(DST,%rax,2), TMP1
.else
	movdqu	(SRC,%rax,2), TMP1
.endif
	pshufb	BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor	TMP2, LO

	// b_L + b_H
	pshufd	$0x4e, TMP1, TMP2
	pxor	TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor	TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq	OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor	TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq	.Lgfpoly(%rip), TMP1
	pxor	LO, MI
	pxor	GHASH_ACC, MI
	pshufd	$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor	TMP2, MI
	pxor	LO, MI
.elseif \i == 1
	pshufd	$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor	TMP2, GHASH_ACC
	pxor	MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl	OFFSETOF_AESKEYLEN(KEY), %eax
	lea	6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa	(KEY), H_POW1	// Zero-th round key XOR all-zeroes block
	lea	16(KEY), %rax
1:
	aesenc	(%rax), H_POW1
	add	$16, %rax
	cmp	%rax, RNDKEYLAST_PTR
	jne	1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb	.Lbswap_mask(%rip), H_POW1
	movdqa	H_POW1, %xmm0
	pshufd	$0xd3, %xmm0, %xmm0
	psrad	$31, %xmm0
	paddq	H_POW1, H_POW1
	pand	.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor	%xmm0, H_POW1
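	// The above computes that multiplication as a 128-bit left shift by 1
	// with reduction: paddq shifts each 64-bit half left by 1 bit, and the
	// pshufd/psrad/pand mask supplies the carry of bit 63 into bit 64 plus,
	// if a 1 bit was shifted out of bit 127, the XOR with the reduction
	// polynomial (the gfpoly part of .Lgfpoly_and_internal_carrybit).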

	// Store H^1.
	movdqa	H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq	.Lgfpoly(%rip), GFPOLY
	pshufd	$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor	%xmm0, H_POW1_X64
	movdqa	H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor	H_POW1, %xmm0
	movq	%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa	H_POW1, H_CUR
	mov	$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa	H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd	$0x4e, H_CUR, %xmm0
	pxor	H_CUR, %xmm0
	movq	%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub	$8, %eax
	jge	.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK
	movdqu	(GHASH_ACC_PTR), GHASH_ACC
	movdqa	OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa	OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq	.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub	$16, AADLEN
	jl	.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu	(AAD), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add	$16, AAD
	sub	$16, AADLEN
	jge	.Laad_loop_1x\@
.Laad_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add	$16, AADLEN
	jz	.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu	GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq	.Lone(%rip), TMP0
	movdqa	(KEY), TMP1	// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor	TMP1, AESDATA\i
	paddd	TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc	\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu	AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					   const u32 le_ctr[4], u8 ghash_acc[16],
//					   const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK
	movdqu	(GHASH_ACC_PTR), GHASH_ACC
	movdqu	(LE_CTR_PTR), LE_CTR

	movl	OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea	6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
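	// AES uses 6 + aeskeylen/4 rounds (10, 12, or 14 for AES-128, AES-192,
	// or AES-256), and the expanded round keys start at the beginning of
	// the key struct with 16 bytes per round key, so the last round key is
	// at KEY + 4*aeskeylen + 6*16, which is what the lea above computes.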

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	add	$-8*16, DATALEN
	jl	.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea	16(KEY), %rsi
	.p2align 4
1:
	movdqa	(%rsi), TMP0
	_aesenc_8x	TMP0
	add	$16, %rsi
	cmp	%rsi, RNDKEYLAST_PTR
	jne	1b
	movdqa	(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x
	// Don't increment DST until the ciphertext blocks have been hashed.
	sub	$-8*16, SRC
	add	$-8*16, DATALEN
	jl	.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea	16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa	(%rsi), TMP0
	add	$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x	\enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa	(%rsi), TMP0
	add	$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x	\enc
	cmp	$7*8, %eax
	jne	1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa	(%rsi), TMP0
	add	$16, %rsi
	_aesenc_8x	TMP0
	cmp	%rsi, RNDKEYLAST_PTR
	jne	1b

	// Do the GHASH reduction and the last round of AES.
	movdqa	(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub	$-8*16, DST
.endif
	_xor_data_8x
	sub	$-8*16, SRC
.if !\enc
	sub	$-8*16, DST
.endif
	add	$-8*16, DATALEN
	jge	.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x	\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp	$7*8, %eax
	jne	1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub	$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

	sub	$-8*16, DATALEN
	jz	.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq	.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor	LO, LO
	pxor	MI, MI
	pxor	HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.

	mov	DATALEN, %eax
	neg	%eax
	and	$~15, %eax
	sar	$1, %eax
	add	$64, %eax
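	// (For example, DATALEN = 40 means n = 3 blocks remain: neg gives -40,
	// the and with ~15 rounds that down to -48, the arithmetic shift right
	// halves it to -24, and adding 64 gives 40 = 8*(8-3). Scaled by 2 as an
	// index, that selects the H^3 slot in OFFSETOF_H_POWERS.)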

	sub	$16, DATALEN
	jl	.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd	ONE, LE_CTR
	pxor	(KEY), TMP0
	lea	-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp	$24, AESKEYLEN
	jl	128f	// AES-128?
	je	192f	// AES-192?
	// AES-256
	aesenc	-7*16(%rsi), TMP0
	aesenc	-6*16(%rsi), TMP0
192:
	aesenc	-5*16(%rsi), TMP0
	aesenc	-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc	\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa	OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq	OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu	TMP0, (DST)
.else
	movdqu	(SRC), TMP1
	pxor	TMP1, TMP0
	movdqu	TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb	BSWAP_MASK, TMP0
	pxor	TMP0, GHASH_ACC
.else
	pshufb	BSWAP_MASK, TMP1
	pxor	TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor	GHASH_ACC, GHASH_ACC

	add	$8, %eax
	add	$16, SRC
	add	$16, DST
	sub	$16, DATALEN
	jge	.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add	$16, DATALEN
	jz	.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb	BSWAP_MASK, LE_CTR
	pxor	(KEY), LE_CTR
	lea	16(KEY), %rsi
1:
	aesenc	(%rsi), LE_CTR
	add	$16, %rsi
	cmp	%rsi, RNDKEYLAST_PTR
	jne	1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa	OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq	OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov	SRC, RNDKEYLAST_PTR
	mov	DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor	TMP0, LE_CTR
	mov	DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea	.Lzeropad_mask+16(%rip), %rax
	sub	DATALEN64, %rax
	_vpand	(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb	BSWAP_MASK, TMP0
	pxor	TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu	GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK
	movl	OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu	(LE_CTR_PTR), %xmm0
	mov	$1, %eax
	pinsrd	$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq	TOTAL_DATALEN, GHASH_ACC
	pinsrq	$1, TOTAL_AADLEN, GHASH_ACC
	psllq	$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa	OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa	OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq	.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea	(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb	BSWAP_MASK, %xmm0
	pxor	(KEY), %xmm0
	cmp	$24, AESKEYLEN
	jl	128f	// AES-128?
	je	192f	// AES-192?
	// AES-256
	aesenc	-7*16(%rax), %xmm0
	aesenc	-6*16(%rax), %xmm0
192:
	aesenc	-5*16(%rax), %xmm0
	aesenc	-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc	(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb	BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor	%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu	GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl	8(%rsp), TAGLEN
	lea	.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub	TAGLEN64, ZEROPAD_MASK_PTR
	xor	%eax, %eax
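	// ptest sets ZF if and only if the XOR of the two tags is zero in every
	// byte selected by the mask (i.e. the first TAGLEN bytes), so the sete
	// below produces the bool return value without branching on the tag
	// values.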
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete	%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)