Path: arch/x86/crypto/aes-gcm-vaes-avx512.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-GCM implementation for x86_64 CPUs that support the following CPU
// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include <linux/linkage.h>

.section .rodata
.p2align 6

// A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

// This is the GHASH reducing polynomial without its constant term, i.e.
// x^128 + x^7 + x^2 + x, represented using the backwards mapping
// between bits and polynomial coefficients.
//
// Alternatively, it can be interpreted as the naturally-ordered
// representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
// "reversed" GHASH reducing polynomial without its x^128 term.
.Lgfpoly:
	.octa	0xc2000000000000000000000000000001

// Same as above, but with the (1 << 64) bit set.
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001

// Values needed to prepare the initial vector of counter blocks.
.Lctr_pattern:
	.octa	0
	.octa	1
	.octa	2
	.octa	3

// The number of AES blocks per vector, as a 128-bit value.
.Linc_4blocks:
	.octa	4

// Number of powers of the hash key stored in the key struct. The powers are
// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
#define NUM_H_POWERS		16

// Offset to AES key length (in bytes) in the key struct
#define OFFSETOF_AESKEYLEN	0

// Offset to AES round keys in the key struct
#define OFFSETOF_AESROUNDKEYS	16

// Offset to start of hash key powers array in the key struct
#define OFFSETOF_H_POWERS	320

// Offset to end of hash key powers array in the key struct.
//
// This is immediately followed by three zeroized padding blocks, which are
// included so that partial vectors can be handled more easily. E.g. if two
// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding
// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
#define OFFSETOFEND_H_POWERS	(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
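
// For orientation, these offsets describe the layout of the key struct as seen
// from assembly (the C definition lives elsewhere; per the comments further
// below, |key->base.aes_key| holds the key length and round keys,
// |key->h_powers| the hash key powers, and |key->padding| the zero blocks):
//
//	[  0,  16)	AES key length in bytes (32-bit value; rest not used here)
//	[ 16, 320)	expanded AES round keys (up to 15 x 16 bytes are used)
//	[320, 576)	hash key powers H^16, H^15, ..., H^1 (16 x 16 bytes)
//	[576, 624)	three all-zero padding blocks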

.text

// The _ghash_mul_step macro does one step of GHASH multiplication of the
// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and stores the
// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
// same size as \a and \b. To complete all steps, this must be invoked with
// \i=0 through \i=9. The division into steps allows users of this macro to
// optionally interleave the computation with other instructions. Users of this
// macro must preserve the parameter registers across steps.
//
// The multiplications are done in GHASH's representation of the finite field
// GF(2^128). Elements of GF(2^128) are represented as binary polynomials
// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
// just XOR, while multiplication is more complex and has two parts: (a) do
// carryless multiplication of two 128-bit input polynomials to get a 256-bit
// intermediate product polynomial, and (b) reduce the intermediate product to
// 128 bits by adding multiples of G that cancel out terms in it. (Adding
// multiples of G doesn't change which field element the polynomial represents.)
//
// Unfortunately, the GCM specification maps bits to/from polynomial
// coefficients backwards from the natural order. In each byte it specifies the
// highest bit to be the lowest order polynomial coefficient, *not* the highest!
// This makes it nontrivial to work with the GHASH polynomials. We could
// reflect the bits, but x86 doesn't have an instruction that does that.
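//
// (As a concrete example of this convention: the 16-byte block 80 00 ... 00,
// i.e. 0x80 in the first byte and zeroes elsewhere, represents the field
// element 1, and 40 00 ... 00 represents x, since within each byte the most
// significant bit is the *lowest*-order coefficient.)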
//
// Instead, we operate on the values without bit-reflecting them. This *mostly*
// just works, since XOR and carryless multiplication are symmetric with respect
// to bit order, but it has some consequences. First, due to GHASH's byte
// order, by skipping bit reflection, *byte* reflection becomes necessary to
// give the polynomial terms a consistent order. E.g., considering an N-bit
// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
// with. Fortunately, x86's vpshufb instruction can do byte reflection.
//
// Second, forgoing the bit reflection causes an extra multiple of x (still
// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
// multiplication. This is because an M-bit by N-bit carryless multiplication
// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
// to polynomial coefficients backwards, this zero-extension actually changes
// the product by introducing an extra factor of x. Therefore, users of this
// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
// the multiplicative inverse of x, to cancel out the extra x.
//
// Third, the backwards coefficients convention is just confusing to work with,
// since it makes "low" and "high" in the polynomial math mean the opposite of
// their normal meaning in computer programming. This can be solved by using an
// alternative interpretation: the polynomial coefficients are understood to be
// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
// or the implementation at all; it just changes the mathematical interpretation
// of what each instruction is doing. Starting from here, we'll use this
// alternative interpretation, as it's easier to understand the code that way.
//
// Moving on to the implementation, the vpclmulqdq instruction does 64 x 64 =>
// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
// into parts as follows (the _L and _H suffixes denote low and high 64 bits):
//
//	LO = a_L * b_L
//	MI = (a_L * b_H) + (a_H * b_L)
//	HI = a_H * b_H
//
// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
// HI right away, since the way the reduction works makes that unnecessary.
//
// For the reduction, we cancel out the low 128 bits by adding multiples of G =
// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
// which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
// where A and B are 128-bit. Adding B_L*G to that value gives:
//
//	x^64*A + B + B_L*G
//	= x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
//	= x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
//	= x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
//	= x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
//
// So: if we sum A, B with its halves swapped, and the low half of B times x^63
// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
// original value x^64*A + B. I.e., the low 64 bits got canceled out.
//
// We just need to apply this twice: first to fold LO into MI, and second to
// fold the updated MI into HI.
//
// The needed three-argument XORs are done using the vpternlogd instruction with
// immediate 0x96, since this is faster than two vpxord instructions.
//
// A potential optimization, assuming that b is fixed per-key (if a is fixed
// per-key it would work the other way around), is to use one iteration of the
// reduction described above to precompute a value c such that x^64*c = b mod G,
// and then multiply a_L by c (and implicitly by x^64) instead of by b:
//
//	MI = (a_L * c_L) + (a_H * b_L)
//	HI = (a_L * c_H) + (a_H * b_H)
//
// This would eliminate the LO part of the intermediate product, which would
// eliminate the need to fold LO into MI. This would save two instructions,
// including a vpclmulqdq. However, we currently don't use this optimization
// because it would require twice as many per-key precomputed values.
//
// Using Karatsuba multiplication instead of "schoolbook" multiplication
// similarly would save a vpclmulqdq but does not seem to be worth it.
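//
// For reference, the multiplication and reduction described above can be
// modeled in C roughly as follows. This is only an illustrative sketch (the
// helper clmul64() stands in for vpclmulqdq, and all names here are made up);
// the authoritative definition is the macro below.
//
//	struct u128 { u64 lo, hi; };	/* value = hi*x^64 + lo */
//
//	/* 64 x 64 => 128-bit carryless multiplication */
//	static struct u128 clmul64(u64 a, u64 b)
//	{
//		struct u128 p = { 0, 0 };
//		int i;
//
//		for (i = 0; i < 64; i++) {
//			if ((b >> i) & 1) {
//				p.lo ^= a << i;
//				p.hi ^= i ? a >> (64 - i) : 0;
//			}
//		}
//		return p;
//	}
//
//	/* GHASH multiplication: a * b * x^-128 mod the "reversed" G */
//	static struct u128 ghash_mul(struct u128 a, struct u128 b)
//	{
//		const u64 gfpoly_hi = 0xc200000000000000; /* x^63 + x^62 + x^57 */
//		struct u128 lo, mi, hi, t, u;
//
//		lo = clmul64(a.lo, b.lo);
//		t  = clmul64(a.lo, b.hi);
//		u  = clmul64(a.hi, b.lo);
//		mi.lo = t.lo ^ u.lo;
//		mi.hi = t.hi ^ u.hi;
//		hi = clmul64(a.hi, b.hi);
//
//		/* Fold LO into MI: MI ^= swapped LO ^ LO_L*(x^63 + x^62 + x^57) */
//		t = clmul64(lo.lo, gfpoly_hi);
//		mi.lo ^= lo.hi ^ t.lo;
//		mi.hi ^= lo.lo ^ t.hi;
//
//		/* Fold MI into HI: HI ^= swapped MI ^ MI_L*(x^63 + x^62 + x^57) */
//		t = clmul64(mi.lo, gfpoly_hi);
//		hi.lo ^= mi.hi ^ t.lo;
//		hi.hi ^= mi.lo ^ t.hi;
//
//		return hi;	/* the reduced 128-bit product */
//	}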
.macro	_ghash_mul_step	i, a, b, dst, gfpoly, t0, t1, t2
.if \i == 0
	vpclmulqdq	$0x00, \a, \b, \t0		// LO = a_L * b_L
	vpclmulqdq	$0x01, \a, \b, \t1		// MI_0 = a_L * b_H
.elseif \i == 1
	vpclmulqdq	$0x10, \a, \b, \t2		// MI_1 = a_H * b_L
.elseif \i == 2
	vpxord		\t2, \t1, \t1			// MI = MI_0 + MI_1
.elseif \i == 3
	vpclmulqdq	$0x01, \t0, \gfpoly, \t2	// LO_L*(x^63 + x^62 + x^57)
.elseif \i == 4
	vpshufd		$0x4e, \t0, \t0			// Swap halves of LO
.elseif \i == 5
	vpternlogd	$0x96, \t2, \t0, \t1		// Fold LO into MI
.elseif \i == 6
	vpclmulqdq	$0x11, \a, \b, \dst		// HI = a_H * b_H
.elseif \i == 7
	vpclmulqdq	$0x01, \t1, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	vpshufd		$0x4e, \t1, \t1			// Swap halves of MI
.elseif \i == 9
	vpternlogd	$0x96, \t0, \t1, \dst		// Fold MI into HI
.endif
.endm

// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
// the reduced products in \dst. See _ghash_mul_step for full explanation.
.macro	_ghash_mul	a, b, dst, gfpoly, t0, t1, t2
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
.endr
.endm

// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
// *unreduced* products to \lo, \mi, and \hi.
.macro	_ghash_mul_noreduce	a, b, lo, mi, hi, t0, t1, t2, t3
	vpclmulqdq	$0x00, \a, \b, \t0	// a_L * b_L
	vpclmulqdq	$0x01, \a, \b, \t1	// a_L * b_H
	vpclmulqdq	$0x10, \a, \b, \t2	// a_H * b_L
	vpclmulqdq	$0x11, \a, \b, \t3	// a_H * b_H
	vpxord		\t0, \lo, \lo
	vpternlogd	$0x96, \t2, \t1, \mi
	vpxord		\t3, \hi, \hi
.endm

// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
.macro	_ghash_reduce	lo, mi, hi, gfpoly, t0
	vpclmulqdq	$0x01, \lo, \gfpoly, \t0
	vpshufd		$0x4e, \lo, \lo
	vpternlogd	$0x96, \t0, \lo, \mi
	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
	vpshufd		$0x4e, \mi, \mi
	vpternlogd	$0x96, \t0, \mi, \hi
.endm

// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
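// (The cross terms vanish because the field has characteristic 2: writing
// \a = a_H*x^64 + a_L gives \a^2 = a_H^2*x^128 + 2*a_H*a_L*x^64 + a_L^2, and
// 2*a_H*a_L = a_H*a_L + a_H*a_L = 0.)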
.macro	_ghash_square	a, dst, gfpoly, t0, t1
	vpclmulqdq	$0x00, \a, \a, \t0		// LO = a_L * a_L
	vpclmulqdq	$0x11, \a, \a, \dst		// HI = a_H * a_H
	vpclmulqdq	$0x01, \t0, \gfpoly, \t1	// LO_L*(x^63 + x^62 + x^57)
	vpshufd		$0x4e, \t0, \t0			// Swap halves of LO
	vpxord		\t0, \t1, \t1			// Fold LO into MI
	vpclmulqdq	$0x01, \t1, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
	vpshufd		$0x4e, \t1, \t1			// Swap halves of MI
	vpternlogd	$0x96, \t0, \t1, \dst		// Fold MI into HI
.endm

// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
//
// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
// initialize |key->h_powers| and |key->padding|.
SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %zmm[0-2] and %rax are used as temporaries.
	.set	POWERS_PTR,	%rsi
	.set	RNDKEYLAST_PTR,	%rdx
	.set	H_CUR,		%zmm3
	.set	H_CUR_YMM,	%ymm3
	.set	H_CUR_XMM,	%xmm3
	.set	H_INC,		%zmm4
	.set	H_INC_YMM,	%ymm4
	.set	H_INC_XMM,	%xmm4
	.set	GFPOLY,		%zmm5
	.set	GFPOLY_YMM,	%ymm5
	.set	GFPOLY_XMM,	%xmm5

	// Get pointer to lowest set of key powers (located at end of array).
	lea		OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,%rax,4), RNDKEYLAST_PTR
	vmovdqu		OFFSETOF_AESROUNDKEYS(KEY), %xmm0
	add		$OFFSETOF_AESROUNDKEYS+16, KEY
1:
	vaesenc		(KEY), %xmm0, %xmm0
	add		$16, KEY
	cmp		KEY, RNDKEYLAST_PTR
	jne		1b
	vaesenclast	(RNDKEYLAST_PTR), %xmm0, %xmm0

	// Reflect the bytes of the raw hash subkey.
	vpshufb		.Lbswap_mask(%rip), %xmm0, H_CUR_XMM

	// Zeroize the padding blocks.
	vpxor		%xmm0, %xmm0, %xmm0
	vmovdqu		%ymm0, 64(POWERS_PTR)
	vmovdqu		%xmm0, 64+2*16(POWERS_PTR)

	// Finish preprocessing the first key power, H^1. Since this GHASH
	// implementation operates directly on values with the backwards bit
	// order specified by the GCM standard, it's necessary to preprocess the
	// raw key as follows. First, reflect its bytes. Second, multiply it
	// by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
	// interpretation of polynomial coefficients), which can also be
	// interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
	// + 1 using the alternative, natural interpretation of polynomial
	// coefficients. For details, see the comment above _ghash_mul_step.
	//
	// Either way, for the multiplication the concrete operation performed
	// is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
	// << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
	// wide shift instruction, so instead double each of the two 64-bit
	// halves and incorporate the internal carry bit into the value XOR'd.
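	//
	// As a scalar sketch of the intended effect (illustrative only, using
	// hi:lo to denote the two 64-bit halves of H^1):
	//
	//	carry_lo = lo >> 63;	// bit shifted out of the low half
	//	carry_hi = hi >> 63;	// bit shifted out of the whole value
	//	lo = (lo << 1) ^ (carry_hi ? 1 : 0);
	//	hi = (hi << 1) ^ carry_lo ^ (carry_hi ? 0xc200000000000000 : 0);
	//
	// Below, the vpshufd+vpsrad pair builds the condition masks, vpaddq
	// doubles each half (the << 1), and vpternlogd applies the conditional
	// XORs.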
	vpshufd		$0xd3, H_CUR_XMM, %xmm0
	vpsrad		$31, %xmm0, %xmm0
	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
	// H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
	vpternlogd	$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM

	// Load the gfpoly constant.
	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY

	// Square H^1 to get H^2.
	//
	// Note that as with H^1, all higher key powers also need an extra
	// factor of x^-1 (or x using the natural interpretation). Nothing
	// special needs to be done to make this happen, though: H^1 * H^1 would
	// end up with two factors of x^-1, but the multiplication consumes one.
	// So the product H^2 ends up with the desired one factor of x^-1.
	_ghash_square	H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1

	// Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
	vinserti128	$1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
	vinserti128	$1, H_INC_XMM, H_INC_YMM, H_INC_YMM

	// Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
	_ghash_mul	H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
			%ymm0, %ymm1, %ymm2
	vinserti64x4	$1, H_CUR_YMM, H_INC, H_CUR
	vshufi64x2	$0, H_INC, H_INC, H_INC

	// Store the lowest set of key powers.
	vmovdqu8	H_CUR, (POWERS_PTR)

	// Compute and store the remaining key powers.
	// Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
	// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
	mov		$3, %eax
.Lprecompute_next:
	sub		$64, POWERS_PTR
	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
	vmovdqu8	H_CUR, (POWERS_PTR)
	dec		%eax
	jnz		.Lprecompute_next
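
	// For illustration, the loop above fills the powers array back to
	// front, 64 bytes (four powers) at a time:
	//
	//	before the loop:    [H^4,  H^3,  H^2,  H^1 ] at OFFSETOFEND_H_POWERS-64
	//	after iteration 1:  [H^8,  H^7,  H^6,  H^5 ] at OFFSETOFEND_H_POWERS-128
	//	after iteration 2:  [H^12, H^11, H^10, H^9 ] at OFFSETOFEND_H_POWERS-192
	//	after iteration 3:  [H^16, H^15, H^14, H^13] at OFFSETOF_H_POWERS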

	vzeroupper	// This is needed after using ymm or zmm registers.
	RET
SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)

// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
.macro	_horizontal_xor	src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
	vextracti32x4	$1, \src, \t0_xmm
	vextracti32x4	$2, \src, \t1_xmm
	vextracti32x4	$3, \src, \t2_xmm
	vpxord		\t0_xmm, \src_xmm, \dst_xmm
	vpternlogd	$0x96, \t1_xmm, \t2_xmm, \dst_xmm
.endm

// Do one step of the GHASH update of the data blocks given in the vector
// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
// division into steps allows users of this macro to optionally interleave the
// computation with other instructions. This macro uses the vector register
// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
// data blocks. The parameter registers must be preserved across steps.
//
// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
// operations are vectorized operations on 512-bit vectors of 128-bit blocks.
// The vectorized terms correspond to the following non-vectorized terms:
//
//	H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
//		H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
//	H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
//	H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11
//	H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
//
// More concretely, this code does:
//	- Do vectorized "schoolbook" multiplications to compute the intermediate
//	  256-bit product of each block and its corresponding hash key power.
//	- Sum (XOR) the intermediate 256-bit products across vectors.
//	- Do a vectorized reduction of these 256-bit intermediate values to
//	  128-bits each.
//	- Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
//
// See _ghash_mul_step for the full explanation of the operations performed for
// each individual finite field multiplication and reduction.
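//
// For reference, this is just the serial GHASH recurrence
//	GHASH_ACC = (GHASH_ACC + blk) * H
// unrolled 16 times: processing blk0..blk15 serially and collecting terms gives
//	(GHASH_ACC + blk0)*H^16 + blk1*H^15 + ... + blk15*H^1,
// which is what the vectorized update above computes.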
.macro	_ghash_step_4x	i
.if \i == 0
	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
	vpxord		GHASH_ACC, GHASHDATA0, GHASHDATA0
	vpshufb		BSWAP_MASK, GHASHDATA1, GHASHDATA1
	vpshufb		BSWAP_MASK, GHASHDATA2, GHASHDATA2
.elseif \i == 1
	vpshufb		BSWAP_MASK, GHASHDATA3, GHASHDATA3
	vpclmulqdq	$0x00, H_POW4, GHASHDATA0, GHASH_ACC	// LO_0
	vpclmulqdq	$0x00, H_POW3, GHASHDATA1, GHASHTMP0	// LO_1
	vpclmulqdq	$0x00, H_POW2, GHASHDATA2, GHASHTMP1	// LO_2
.elseif \i == 2
	vpxord		GHASHTMP0, GHASH_ACC, GHASH_ACC		// sum(LO_{1,0})
	vpclmulqdq	$0x00, H_POW1, GHASHDATA3, GHASHTMP2	// LO_3
	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC	// LO = sum(LO_{3,2,1,0})
	vpclmulqdq	$0x01, H_POW4, GHASHDATA0, GHASHTMP0	// MI_0
.elseif \i == 3
	vpclmulqdq	$0x01, H_POW3, GHASHDATA1, GHASHTMP1	// MI_1
	vpclmulqdq	$0x01, H_POW2, GHASHDATA2, GHASHTMP2	// MI_2
	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{2,1,0})
	vpclmulqdq	$0x01, H_POW1, GHASHDATA3, GHASHTMP1	// MI_3
.elseif \i == 4
	vpclmulqdq	$0x10, H_POW4, GHASHDATA0, GHASHTMP2	// MI_4
	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{4,3,2,1,0})
	vpclmulqdq	$0x10, H_POW3, GHASHDATA1, GHASHTMP1	// MI_5
	vpclmulqdq	$0x10, H_POW2, GHASHDATA2, GHASHTMP2	// MI_6
.elseif \i == 5
	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{6,5,4,3,2,1,0})
	vpclmulqdq	$0x01, GHASH_ACC, GFPOLY, GHASHTMP2	// LO_L*(x^63 + x^62 + x^57)
	vpclmulqdq	$0x10, H_POW1, GHASHDATA3, GHASHTMP1	// MI_7
	vpxord		GHASHTMP1, GHASHTMP0, GHASHTMP0		// MI = sum(MI_{7,6,5,4,3,2,1,0})
.elseif \i == 6
	vpshufd		$0x4e, GHASH_ACC, GHASH_ACC		// Swap halves of LO
	vpclmulqdq	$0x11, H_POW4, GHASHDATA0, GHASHDATA0	// HI_0
	vpclmulqdq	$0x11, H_POW3, GHASHDATA1, GHASHDATA1	// HI_1
	vpclmulqdq	$0x11, H_POW2, GHASHDATA2, GHASHDATA2	// HI_2
.elseif \i == 7
	vpternlogd	$0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0	// Fold LO into MI
	vpclmulqdq	$0x11, H_POW1, GHASHDATA3, GHASHDATA3	// HI_3
	vpternlogd	$0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0})
	vpclmulqdq	$0x01, GHASHTMP0, GFPOLY, GHASHTMP1	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	vpxord		GHASHDATA3, GHASHDATA0, GHASH_ACC	// HI = sum(HI_{3,2,1,0})
	vpshufd		$0x4e, GHASHTMP0, GHASHTMP0		// Swap halves of MI
	vpternlogd	$0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC	// Fold MI into HI
.elseif \i == 9
	_horizontal_xor	GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
.endif
.endm

// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
// explanation.
.macro	_ghash_4x
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_step_4x	\i
.endr
.endm

// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
//				       u8 ghash_acc[16],
//				       const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
//
// This handles large amounts of AAD efficiently, while also keeping overhead
// low for small amounts, which is the common case. TLS and IPsec use less than
// one block of AAD, but (uncommonly) other use cases may use much more.
SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!

	// Additional local variables.
	// %rax and %k1 are used as temporary registers.
	.set	GHASHDATA0,	%zmm0
	.set	GHASHDATA0_XMM,	%xmm0
	.set	GHASHDATA1,	%zmm1
	.set	GHASHDATA1_XMM,	%xmm1
	.set	GHASHDATA2,	%zmm2
	.set	GHASHDATA2_XMM,	%xmm2
	.set	GHASHDATA3,	%zmm3
	.set	BSWAP_MASK,	%zmm4
	.set	BSWAP_MASK_XMM,	%xmm4
	.set	GHASH_ACC,	%zmm5
	.set	GHASH_ACC_XMM,	%xmm5
	.set	H_POW4,		%zmm6
	.set	H_POW3,		%zmm7
	.set	H_POW2,		%zmm8
	.set	H_POW1,		%zmm9
	.set	H_POW1_XMM,	%xmm9
	.set	GFPOLY,		%zmm10
	.set	GFPOLY_XMM,	%xmm10
	.set	GHASHTMP0,	%zmm11
	.set	GHASHTMP1,	%zmm12
	.set	GHASHTMP2,	%zmm13

	// Load the GHASH accumulator.
	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM

	// Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
	cmp		$16, AADLEN
	jg		.Laad_more_than_16bytes
	test		AADLEN, AADLEN
	jz		.Laad_done

	// Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD.
	vmovdqu		.Lbswap_mask(%rip), BSWAP_MASK_XMM
	vmovdqu		.Lgfpoly(%rip), GFPOLY_XMM
	mov		$-1, %eax
	bzhi		AADLEN, %eax, %eax
	kmovd		%eax, %k1
	vmovdqu8	(AAD), GHASHDATA0_XMM{%k1}{z}
	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM
	vpshufb		BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM
	vpxor		GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
	_ghash_mul	H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
	jmp		.Laad_done

.Laad_more_than_16bytes:
	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY

	// If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time.
	sub		$256, AADLEN
	jl		.Laad_loop_4x_done
	vmovdqu8	OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
	vmovdqu8	OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
	vmovdqu8	OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
.Laad_loop_4x:
	vmovdqu8	0*64(AAD), GHASHDATA0
	vmovdqu8	1*64(AAD), GHASHDATA1
	vmovdqu8	2*64(AAD), GHASHDATA2
	vmovdqu8	3*64(AAD), GHASHDATA3
	_ghash_4x
	add		$256, AAD
	sub		$256, AADLEN
	jge		.Laad_loop_4x
.Laad_loop_4x_done:

	// If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time.
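	//
	// (AADLEN was pre-decremented by 256 above, so here it holds the
	// remaining length minus 256; adding 192 turns that into the remaining
	// length minus 64, so the "jl" below skips the loop when fewer than 64
	// bytes remain. For example, with 250 bytes of AAD total, the 256-byte
	// loop is skipped, the 64-byte loop below runs three times to process
	// 192 bytes, and the masked tail code then handles the last 58 bytes.)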
	add		$192, AADLEN
	jl		.Laad_loop_1x_done
	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
.Laad_loop_1x:
	vmovdqu8	(AAD), GHASHDATA0
	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
	vpxord		GHASHDATA0, GHASH_ACC, GHASH_ACC
	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			GHASHDATA0, GHASHDATA1, GHASHDATA2
	_horizontal_xor	GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
	add		$64, AAD
	sub		$64, AADLEN
	jge		.Laad_loop_1x
.Laad_loop_1x_done:

	// Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD.
	add		$64, AADLEN
	jz		.Laad_done
	mov		$-1, %rax
	bzhi		AADLEN64, %rax, %rax
	kmovq		%rax, %k1
	vmovdqu8	(AAD), GHASHDATA0{%k1}{z}
	neg		AADLEN64
	and		$~15, AADLEN64	// -round_up(AADLEN, 16)
	vmovdqu8	OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
	vpxord		GHASHDATA0, GHASH_ACC, GHASH_ACC
	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			GHASHDATA0, GHASHDATA1, GHASHDATA2
	_horizontal_xor	GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM

.Laad_done:
	// Store the updated GHASH accumulator back to memory.
	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)

	vzeroupper	// This is needed after using ymm or zmm registers.
	RET
SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)

// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
// round key that has been broadcast to all 128-bit lanes of \round_key.
.macro	_vaesenc_4x	round_key
	vaesenc		\round_key, %zmm0, %zmm0
	vaesenc		\round_key, %zmm1, %zmm1
	vaesenc		\round_key, %zmm2, %zmm2
	vaesenc		\round_key, %zmm3, %zmm3
.endm

// Start the AES encryption of four vectors of counter blocks.
.macro	_ctr_begin_4x

	// Increment LE_CTR four times to generate four vectors of little-endian
	// counter blocks, swap each to big-endian, and store them in %zmm[0-3].
	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
	vpshufb		BSWAP_MASK, LE_CTR, %zmm1
	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
	vpshufb		BSWAP_MASK, LE_CTR, %zmm2
	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
	vpshufb		BSWAP_MASK, LE_CTR, %zmm3
	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR

	// AES "round zero": XOR in the zero-th round key.
	vpxord		RNDKEY0, %zmm0, %zmm0
	vpxord		RNDKEY0, %zmm1, %zmm1
	vpxord		RNDKEY0, %zmm2, %zmm2
	vpxord		RNDKEY0, %zmm3, %zmm3
.endm

// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR
// source data with the resulting keystream, and write the result to DST and
// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
.macro	_aesenclast_and_xor_4x
	// XOR the source data with the last round key, saving the result in
	// GHASHDATA[0-3]. This reduces latency by taking advantage of the
	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
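	// (This identity holds because the last AES round has no MixColumns:
	// vaesenclast(key, a) = ShiftRows(SubBytes(a)) ^ key, so XOR'ing b into
	// the key operand or into the result gives the same value.)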
	vpxord		0*64(SRC), RNDKEYLAST, GHASHDATA0
	vpxord		1*64(SRC), RNDKEYLAST, GHASHDATA1
	vpxord		2*64(SRC), RNDKEYLAST, GHASHDATA2
	vpxord		3*64(SRC), RNDKEYLAST, GHASHDATA3

	// Do the last AES round. This handles the XOR with the source data
	// too, as per the optimization described above.
	vaesenclast	GHASHDATA0, %zmm0, GHASHDATA0
	vaesenclast	GHASHDATA1, %zmm1, GHASHDATA1
	vaesenclast	GHASHDATA2, %zmm2, GHASHDATA2
	vaesenclast	GHASHDATA3, %zmm3, GHASHDATA3

	// Store the en/decrypted data to DST.
	vmovdqu8	GHASHDATA0, 0*64(DST)
	vmovdqu8	GHASHDATA1, 1*64(DST)
	vmovdqu8	GHASHDATA2, 2*64(DST)
	vmovdqu8	GHASHDATA3, 3*64(DST)
.endm

// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
//					     const u32 le_ctr[4], u8 ghash_acc[16],
//					     const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one). The function computes the
// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
// and writes the resulting encrypted or decrypted data to |dst|. It also
// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
// bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. This
// function loads the counter from |le_ctr| and increments the loaded counter as
// needed, but it does *not* store the updated counter back to |le_ctr|. The
// caller must update |le_ctr| if any more data segments follow. Internally,
// only the low 32-bit word of the counter is incremented, following the GCM
// standard.
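//
// (Since each 16-byte block consumes one counter value, "updating |le_ctr|"
// amounts to adding datalen/16 to its low 32-bit word; e.g. after a 4096-byte
// segment the caller would add 256 before the next call.)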
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!

	// Additional local variables

	// %rax and %k1 are used as temporary registers. LE_CTR_PTR is also
	// available as a temporary register after the counter is loaded.

	// AES key length in bytes
	.set	AESKEYLEN,	%r10d
	.set	AESKEYLEN64,	%r10

	// Pointer to the last AES round key for the chosen AES variant
	.set	RNDKEYLAST_PTR,	%r11

	// In the main loop, %zmm[0-3] are used as AES input and output.
	// Elsewhere they are used as temporary registers.

	// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
	.set	GHASHDATA0,	%zmm4
	.set	GHASHDATA0_XMM,	%xmm4
	.set	GHASHDATA1,	%zmm5
	.set	GHASHDATA1_XMM,	%xmm5
	.set	GHASHDATA2,	%zmm6
	.set	GHASHDATA2_XMM,	%xmm6
	.set	GHASHDATA3,	%zmm7

	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
	// using vpshufb, copied to all 128-bit lanes.
	.set	BSWAP_MASK,	%zmm8

	// RNDKEY temporarily holds the next AES round key.
	.set	RNDKEY,		%zmm9

	// GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
	// only the lowest 128-bit lane can be nonzero. When not fully reduced,
	// more than one lane may be used, and they need to be XOR'd together.
	.set	GHASH_ACC,	%zmm10
	.set	GHASH_ACC_XMM,	%xmm10

	// LE_CTR_INC is the vector of 32-bit words that need to be added to a
	// vector of little-endian counter blocks to advance it forwards.
	.set	LE_CTR_INC,	%zmm11

	// LE_CTR contains the next set of little-endian counter blocks.
	.set	LE_CTR,		%zmm12

	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
	// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
	.set	RNDKEY0,	%zmm13
	.set	RNDKEYLAST,	%zmm14
	.set	RNDKEY_M9,	%zmm15
	.set	RNDKEY_M8,	%zmm16
	.set	RNDKEY_M7,	%zmm17
	.set	RNDKEY_M6,	%zmm18
	.set	RNDKEY_M5,	%zmm19
	.set	RNDKEY_M4,	%zmm20
	.set	RNDKEY_M3,	%zmm21
	.set	RNDKEY_M2,	%zmm22
	.set	RNDKEY_M1,	%zmm23

	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
	// cannot coincide with anything used for AES encryption, since for
	// performance reasons GHASH and AES encryption are interleaved.
	.set	GHASHTMP0,	%zmm24
	.set	GHASHTMP1,	%zmm25
	.set	GHASHTMP2,	%zmm26

	// H_POW[4-1] contain the powers of the hash key H^16...H^1. The
	// descending numbering reflects the order of the key powers.
	.set	H_POW4,		%zmm27
	.set	H_POW3,		%zmm28
	.set	H_POW2,		%zmm29
	.set	H_POW1,		%zmm30

	// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
	.set	GFPOLY,		%zmm31

	// Load some constants.
	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY

	// Load the GHASH accumulator and the starting counter.
	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
	vbroadcasti32x4	(LE_CTR_PTR), LE_CTR

	// Load the AES key length in bytes.
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Make RNDKEYLAST_PTR point to the last AES round key. This is the
	// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
	// respectively. Then load the zero-th and last round keys.
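	// (The round keys are 16 bytes apart, so the last one lives at byte
	// offset OFFSETOF_AESROUNDKEYS + 16*(6 + AESKEYLEN/4), which the lea
	// below computes as OFFSETOF_AESROUNDKEYS + 6*16 + AESKEYLEN*4; that's
	// 16*10, 16*12, or 16*14 past the round keys for AESKEYLEN = 16, 24,
	// or 32.)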
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
	vbroadcasti32x4	OFFSETOF_AESROUNDKEYS(KEY), RNDKEY0
	vbroadcasti32x4	(RNDKEYLAST_PTR), RNDKEYLAST

	// Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR

	// Load 4 into all 128-bit lanes of LE_CTR_INC.
	vbroadcasti32x4	.Linc_4blocks(%rip), LE_CTR_INC

	// If there are at least 256 bytes of data, then continue into the loop
	// that processes 256 bytes of data at a time. Otherwise skip it.
	//
	// Pre-subtracting 256 from DATALEN saves an instruction from the main
	// loop and also ensures that at least one write always occurs to
	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
	sub		$256, DATALEN
	jl		.Lcrypt_loop_4x_done\@

	// Load powers of the hash key.
	vmovdqu8	OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
	vmovdqu8	OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
	vmovdqu8	OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1

	// Main loop: en/decrypt and hash 4 vectors at a time.
	//
	// When possible, interleave the AES encryption of the counter blocks
	// with the GHASH update of the ciphertext blocks. This improves
	// performance on many CPUs because the execution ports used by the VAES
	// instructions often differ from those used by vpclmulqdq and other
	// instructions used in GHASH. For example, many Intel CPUs dispatch
	// vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
	//
	// The interleaving is easiest to do during decryption, since during
	// decryption the ciphertext blocks are immediately available. For
	// encryption, instead encrypt the first set of blocks, then hash those
	// blocks while encrypting the next set of blocks, repeat that as
	// needed, and finally hash the last set of blocks.

.if \enc
	// Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
	// ciphertext in GHASHDATA[0-3] for GHASH.
	_ctr_begin_4x
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
	vbroadcasti32x4	(%rax), RNDKEY
	_vaesenc_4x	RNDKEY
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	_aesenclast_and_xor_4x
	add		$256, SRC
	add		$256, DST
	sub		$256, DATALEN
	jl		.Lghash_last_ciphertext_4x\@
.endif

	// Cache as many additional AES round keys as possible.
.irp i, 9,8,7,6,5,4,3,2,1
	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
.endr

.Lcrypt_loop_4x\@:

	// If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
	// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
.if !\enc
	vmovdqu8	0*64(SRC), GHASHDATA0
	vmovdqu8	1*64(SRC), GHASHDATA1
	vmovdqu8	2*64(SRC), GHASHDATA2
	vmovdqu8	3*64(SRC), GHASHDATA3
.endif

	// Start the AES encryption of the counter blocks.
	_ctr_begin_4x
	cmp		$24, AESKEYLEN
	jl		128f		// AES-128?
	je		192f		// AES-192?
	// AES-256
	vbroadcasti32x4	-13*16(RNDKEYLAST_PTR), RNDKEY
	_vaesenc_4x	RNDKEY
	vbroadcasti32x4	-12*16(RNDKEYLAST_PTR), RNDKEY
	_vaesenc_4x	RNDKEY
192:
	vbroadcasti32x4	-11*16(RNDKEYLAST_PTR), RNDKEY
	_vaesenc_4x	RNDKEY
	vbroadcasti32x4	-10*16(RNDKEYLAST_PTR), RNDKEY
	_vaesenc_4x	RNDKEY
128:

	// Finish the AES encryption of the counter blocks in %zmm[0-3],
	// interleaved with the GHASH update of the ciphertext blocks in
	// GHASHDATA[0-3].
.irp i, 9,8,7,6,5,4,3,2,1
	_ghash_step_4x	(9 - \i)
	_vaesenc_4x	RNDKEY_M\i
.endr
	_ghash_step_4x	9
	_aesenclast_and_xor_4x
	add		$256, SRC
	add		$256, DST
	sub		$256, DATALEN
	jge		.Lcrypt_loop_4x\@

.if \enc
.Lghash_last_ciphertext_4x\@:
	// Update GHASH with the last set of ciphertext blocks.
	_ghash_4x
.endif

.Lcrypt_loop_4x_done\@:

	// Undo the extra subtraction by 256 and check whether data remains.
	add		$256, DATALEN
	jz		.Ldone\@

	// The data length isn't a multiple of 256 bytes. Process the remaining
	// data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
	// time. Going one vector at a time may seem inefficient compared to
	// having separate code paths for each possible number of vectors
	// remaining. However, using a loop keeps the code size down, and it
	// performs surprisingly well; modern CPUs will start executing the next
	// iteration before the previous one finishes and also predict the
	// number of loop iterations. For a similar reason, we roll up the AES
	// rounds.
	//
	// On the last iteration, the remaining length may be less than 64
	// bytes. Handle this using masking.
	//
	// Since there are enough key powers available for all remaining data,
	// there is no need to do a GHASH reduction after each iteration.
	// Instead, multiply each remaining block by its own key power, and only
	// do a GHASH reduction at the very end.
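	//
	// For example, if 100 bytes remain, POWERS_PTR is set up below to
	// point at [H^7, H^6, H^5, H^4]: the first iteration handles 64 bytes
	// with those powers, and the second iteration handles the last 36
	// bytes with [H^3, H^2, H^1, 0], where the zeroized padding block
	// pairs up with the all-zero fourth data block.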

	// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
	// is the number of blocks that remain.
	.set	POWERS_PTR, LE_CTR_PTR	// LE_CTR_PTR is free to be reused.
	mov		DATALEN, %eax
	neg		%rax
	and		$~15, %rax	// -round_up(DATALEN, 16)
	lea		OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	.set	LO, GHASHDATA0
	.set	LO_XMM, GHASHDATA0_XMM
	.set	MI, GHASHDATA1
	.set	MI_XMM, GHASHDATA1_XMM
	.set	HI, GHASHDATA2
	.set	HI_XMM, GHASHDATA2_XMM
	vpxor		LO_XMM, LO_XMM, LO_XMM
	vpxor		MI_XMM, MI_XMM, MI_XMM
	vpxor		HI_XMM, HI_XMM, HI_XMM

.Lcrypt_loop_1x\@:

	// Select the appropriate mask for this iteration: all 1's if
	// DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the
	// bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
	mov		$-1, %rax
	bzhi		DATALEN64, %rax, %rax
	kmovq		%rax, %k1

	// Encrypt a vector of counter blocks. This does not need to be masked.
	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
	vpxord		RNDKEY0, %zmm0, %zmm0
	lea		OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
	vbroadcasti32x4	(%rax), RNDKEY
	vaesenc		RNDKEY, %zmm0, %zmm0
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	vaesenclast	RNDKEYLAST, %zmm0, %zmm0

	// XOR the data with the appropriate number of keystream bytes.
	vmovdqu8	(SRC), %zmm1{%k1}{z}
	vpxord		%zmm1, %zmm0, %zmm0
	vmovdqu8	%zmm0, (DST){%k1}

	// Update GHASH with the ciphertext block(s), without reducing.
	//
	// In the case of DATALEN < 64, the ciphertext is zero-padded to 64
	// bytes. (If decrypting, it's done by the above masked load. If
	// encrypting, it's done by the below masked register-to-register move.)
	// Note that if DATALEN <= 48, there will be additional padding beyond
	// the padding of the last block specified by GHASH itself; i.e., there
	// may be whole block(s) that get processed by the GHASH multiplication
	// and reduction instructions but should not actually be included in the
	// GHASH. However, any such blocks are all-zeroes, and the values that
	// they're multiplied with are also all-zeroes. Therefore they just add
	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
	vmovdqu8	(POWERS_PTR), H_POW1
.if \enc
	vmovdqu8	%zmm0, %zmm1{%k1}{z}
.endif
	vpshufb		BSWAP_MASK, %zmm1, %zmm0
	vpxord		GHASH_ACC, %zmm0, %zmm0
	_ghash_mul_noreduce	H_POW1, %zmm0, LO, MI, HI, \
				GHASHDATA3, %zmm1, %zmm2, %zmm3
	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM

	add		$64, POWERS_PTR
	add		$64, SRC
	add		$64, DST
	sub		$64, DATALEN
	jg		.Lcrypt_loop_1x\@

	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GFPOLY, %zmm0
	_horizontal_xor	HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)

	vzeroupper	// This is needed after using ymm or zmm registers.
	RET
.endm

// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
//				      const u32 le_ctr[4], u8 ghash_acc[16],
//				      u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
//				      const u32 le_ctr[4],
//				      const u8 ghash_acc[16],
//				      u64 total_aadlen, u64 total_datalen,
//				      const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
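//
// (For the decryption function, |taglen| is the seventh argument, so per the
// x86_64 calling convention it arrives on the stack at 8(%rsp) rather than in
// a register; it's loaded below only in the \enc == 0 case.)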
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)

	// Additional local variables.
	// %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	GFPOLY,		%xmm4
	.set	BSWAP_MASK,	%xmm5
	.set	LE_CTR,		%xmm6
	.set	GHASH_ACC,	%xmm7
	.set	H_POW1,		%xmm8

	// Load some constants.
	vmovdqa		.Lgfpoly(%rip), GFPOLY
	vmovdqa		.Lbswap_mask(%rip), BSWAP_MASK

	// Load the AES key length in bytes.
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	// GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
	vpblendd	$0xe, (LE_CTR_PTR), GFPOLY, LE_CTR

	// Build the lengths block and XOR it with the GHASH accumulator.
	// Although the lengths block is defined as the AAD length followed by
	// the en/decrypted data length, both in big-endian byte order, a byte
	// reflection of the full block is needed because of the way we compute
	// GHASH (see _ghash_mul_step). By using little-endian values in the
	// opposite order, we avoid having to reflect any bytes here.
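	// (That is, the GCM lengths block is be64(aadlen_bits) ||
	// be64(datalen_bits); byte-reflecting it yields le64(datalen_bits) ||
	// le64(aadlen_bits), which is exactly what the vmovq/vpinsrq/vpsllq
	// sequence below builds.)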
	vmovq		TOTAL_DATALEN, %xmm0
	vpinsrq		$1, TOTAL_AADLEN, %xmm0, %xmm0
	vpsllq		$3, %xmm0, %xmm0	// Bytes to bits
	vpxor		(GHASH_ACC_PTR), %xmm0, GHASH_ACC

	// Load the first hash key power (H^1), which is stored last.
	vmovdqu8	OFFSETOFEND_H_POWERS-16(KEY), H_POW1

.if !\enc
	// Prepare a mask of TAGLEN one bits.
	movl		8(%rsp), TAGLEN
	mov		$-1, %eax
	bzhi		TAGLEN, %eax, %eax
	kmovd		%eax, %k1
.endif

	// Make %rax point to the last AES round key for the chosen AES variant.
	lea		OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), %rax

	// Start the AES encryption of the counter block by swapping the counter
	// block to big-endian and XOR-ing it with the zero-th AES round key.
	vpshufb		BSWAP_MASK, LE_CTR, %xmm0
	vpxor		OFFSETOF_AESROUNDKEYS(KEY), %xmm0, %xmm0

	// Complete the AES encryption and multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	cmp		$24, AESKEYLEN
	jl		128f		// AES-128?
	je		192f		// AES-192?
	// AES-256
	vaesenc		-13*16(%rax), %xmm0, %xmm0
	vaesenc		-12*16(%rax), %xmm0, %xmm0
192:
	vaesenc		-11*16(%rax), %xmm0, %xmm0
	vaesenc		-10*16(%rax), %xmm0, %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	_ghash_mul_step	\i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			%xmm1, %xmm2, %xmm3
	vaesenc		(\i-9)*16(%rax), %xmm0, %xmm0
.endr
	_ghash_mul_step	9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
			%xmm1, %xmm2, %xmm3

	// Undo the byte reflection of the GHASH accumulator.
	vpshufb		BSWAP_MASK, GHASH_ACC, GHASH_ACC

	// Do the last AES round and XOR the resulting keystream block with the
	// GHASH accumulator to produce the full computed authentication tag.
	//
	// Reduce latency by taking advantage of the property vaesenclast(key,
	// a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
	// round key, instead of XOR'ing the final AES output with GHASH_ACC.
	//
	// enc_final then returns the computed auth tag, while dec_final
	// compares it with the transmitted one and returns a bool. To compare
	// the tags, dec_final XORs them together and uses vptest to check
	// whether the result is all-zeroes. This should be constant-time.
	// dec_final applies the vaesenclast optimization to this additional
	// value XOR'd too, using vpternlogd to XOR the last round key, GHASH
	// accumulator, and transmitted auth tag together in one instruction.
.if \enc
	vpxor		(%rax), GHASH_ACC, %xmm1
	vaesenclast	%xmm1, %xmm0, GHASH_ACC
	vmovdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	vmovdqu		(TAG), %xmm1
	vpternlogd	$0x96, (%rax), GHASH_ACC, %xmm1
	vaesenclast	%xmm1, %xmm0, %xmm0
	xor		%eax, %eax
	vmovdqu8	%xmm0, %xmm0{%k1}{z}	// Truncate to TAGLEN bytes
	vptest		%xmm0, %xmm0
	sete		%al
.endif
	// No need for vzeroupper here, since only xmm registers were used.
	RET
.endm

SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)

SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
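
// For orientation, a schematic caller of the functions defined above
// (illustrative only; the real glue code lives on the C side of the kernel and
// also handles key expansion, counter setup, and buffering):
//
//	struct aes_gcm_key_vaes_avx512 *key = ...;	/* AES key already expanded */
//	u8 ghash_acc[16] = {};				/* must start all-zeroes */
//	u32 le_ctr[4] = ...;				/* little-endian counter block */
//
//	aes_gcm_precompute_vaes_avx512(key);
//	aes_gcm_aad_update_vaes_avx512(key, ghash_acc, aad, aadlen);
//	aes_gcm_enc_update_vaes_avx512(key, le_ctr, ghash_acc, src, dst, datalen);
//	/* if more data segments followed, le_ctr[0] would be advanced here */
//	aes_gcm_enc_final_vaes_avx512(key, le_ctr, ghash_acc, aadlen, datalen);
//	/* ghash_acc now holds the 16-byte authentication tag */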