Path: blob/main/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/cmn_err.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

/* Select GCM implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define	IMPL_AVX	(UINT32_MAX-2)
#if CAN_USE_GCM_ASM >= 2
#define	IMPL_AVX2	(UINT32_MAX-3)
#endif
#endif
#define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 */
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define	GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

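/*
 * Note on the mode itself: GCM is CTR encryption plus a GHASH MAC.  Each
 * 16-byte block of plaintext is XORed with the AES encryption of an
 * incrementing counter block, and every ciphertext block is folded into a
 * running GHASH value by multiplication with the hash subkey H in GF(2^128)
 * (the GHASH() macro above).  The bookkeeping in the functions below mostly
 * deals with carrying partial blocks between calls and with scatter/gather
 * output.
 */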
/*
 * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
 * is done in another function.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

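		/*
		 * remainder is recomputed with pointer arithmetic: it is the
		 * number of bytes of the caller's input buffer that have not
		 * been consumed yet in this call.
		 */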
		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}

int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}

/*
 * This will only deal with decrypting the last block of the input that
 * might not be a multiple of block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

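	/*
	 * Note the ordering: GHASH is always computed over the ciphertext,
	 * so for decryption the (zero padded) last block is hashed before it
	 * is run through the counter-mode keystream below.
	 */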
	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to plaintext buffer.
	 * Ciphertext will be decrypted in the final.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}

int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * not expecting any more ciphertext, just
			 * compute plaintext for the remaining input
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

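	/*
	 * The tag stored at the end of gcm_pt_buf is verified below before
	 * any plaintext is handed back to the caller.
	 */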
	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}

static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There's not a block full of data, pad rest of
			 * buffer with zero
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}

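/*
 * Illustrative sketch (not part of this file's API): a caller supplies the
 * GCM parameters through a PKCS#11 style CK_AES_GCM_PARAMS, e.g.
 *
 *	CK_AES_GCM_PARAMS p = {
 *		.pIv = iv, .ulIvLen = 12,
 *		.pAAD = aad, .ulAADLen = aad_len,
 *		.ulTagBits = 128,
 *	};
 *	rv = gcm_init_ctx(ctx, (char *)&p, AES_BLOCK_LEN,
 *	    aes_encrypt_block, aes_copy_block, aes_xor_block);
 *
 * The block-size macro and callback names above are assumptions for the
 * example only; the field names are the ones gcm_init_ctx() reads below.
 */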
/*
 * Init the GCM context struct.  Handle the cycle and avx implementations here.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->impl = GCM_IMPL_USED;
	} else {
		/*
		 * Handle the "cycle" implementation by creating different
		 * contexts, one per implementation.
		 */
		gcm_ctx->impl = gcm_toggle_impl();

		/* The AVX impl. doesn't handle byte swapped key schedules. */
		if (needs_bswap == B_TRUE) {
			gcm_ctx->impl = GCM_IMPL_GENERIC;
		}
		/*
		 * If this is an AVX context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->impl == GCM_IMPL_AVX &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * but they could still be created by the generic aes implementation.
	 * Make sure not to use them since we'll corrupt data if we do.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
		gcm_ctx->impl = GCM_IMPL_GENERIC;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx or avx2-vaes "
		    "implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/*
	 * AVX implementations use Htable with sizes depending on
	 * implementation.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
		rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size);
	}
	else
#endif /* ifdef CAN_USE_GCM_ASM */
	if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
	    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
		rv = CRYPTO_MECHANISM_PARAM_INVALID;
	}

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}

/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, then
 * fallback to the fastest generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
#if CAN_USE_GCM_ASM >= 2
	case IMPL_AVX2:
#endif
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation since there still
		 * may be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work()) {
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX2);
		}
	} else
#endif
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}

static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
		{ "avx2-vaes",	IMPL_AVX2 },
#endif
};

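/*
 * Example (illustrative): with the Linux kernel module loaded, the selected
 * implementation can typically be inspected and changed at runtime via
 *
 *	cat /sys/module/icp/parameters/icp_gcm_impl
 *	echo fastest > /sys/module/icp/parameters/icp_gcm_impl
 *
 * or set at load time with the icp_gcm_impl= module parameter.  The exact
 * sysfs path depends on how the ICP is built and packaged.
 */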
/*
 * Function sets desired gcm implementation.
 *
 * If we are called before init(), user preference will be saved in
 * user_sel_impl, and applied in later init() call. This occurs when module
 * parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val		Name of gcm implementation to use
 * @param	Unused.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
#if CAN_USE_GCM_ASM >= 2
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work() == B_TRUE &&
	    (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX2);
	} else
#endif
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX);
	} else {
		gcm_use_impl(GCM_IMPL_GENERIC);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
#if CAN_USE_GCM_ASM >= 2
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(__KERNEL) */

#ifdef CAN_USE_GCM_ASM
#define	GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need that many bytes
 * at minimum.
 */
#define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
 */
#define	GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define	clear_fpu_regs() clear_fpu_regs_avx()

#define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

/*
 * GCM definitions: uint128_t is copied from include/crypto/modes.h
 * Avoiding u128 because it is already defined in kernel sources.
 */
typedef struct {
	uint64_t hi, lo;
} uint128_t;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
    const uint64_t H[2]);
#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
    const uint64_t *Htable, const uint8_t *in, size_t len);
#endif
static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	switch (ctx->impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;
#endif

	case GCM_IMPL_AVX:
		gcm_ghash_avx(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;

	default:
		VERIFY(B_FALSE);
	}
}

typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

static inline boolean_t
gcm_avx2_will_work(void)
{
	return (kfpu_allowed() &&
	    zfs_avx2_available() && zfs_vaes_available() &&
	    zfs_vpclmulqdq_available());
}

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_use_impl(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		if (gcm_avx2_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;
#endif

	case GCM_IMPL_AVX:
		if (gcm_avx_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;

	default:
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
	}
}

static inline boolean_t
gcm_impl_will_work(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		return (gcm_avx2_will_work());
#endif

	case GCM_IMPL_AVX:
		return (gcm_avx_will_work());

	default:
		return (B_TRUE);
	}
}

static inline gcm_impl
gcm_toggle_impl(void)
{
	gcm_impl current_impl, new_impl;
	do { /* handle races */
		current_impl = atomic_load_32(&gcm_impl_used);
		new_impl = current_impl;
		while (B_TRUE) { /* handle incompatible implementations */
			new_impl = (new_impl + 1) % GCM_IMPL_MAX;
			if (gcm_impl_will_work(new_impl)) {
				break;
			}
		}

	} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
	    current_impl);

	return (new_impl);
}


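/*
 * The 32-bit GCM counter lives in the last four bytes of the 16-byte counter
 * block (the high half of gcm_cb[1] in big-endian terms); the byte-order
 * aware mask below isolates it so it can be incremented without disturbing
 * the IV-derived part of the block.
 */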
/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;

/* The following CRYPTO methods are from boringssl/crypto/internal.h */
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
	return (__builtin_bswap32(x));
}

static inline uint32_t CRYPTO_load_u32_be(const void *in) {
	uint32_t v;
	memcpy(&v, in, sizeof (v));
	return (CRYPTO_bswap4(v));
}

static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
	v = CRYPTO_bswap4(v);
	memcpy(out, &v, sizeof (v));
}

static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */

/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
 */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;
	size_t need = 0;
	size_t done = 0;
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_encrypt_avx2 :
#endif
	    aesni_gcm_encrypt_avx;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = encrypt_blocks(
		    datap, ct_buf, chunk_size, key, cb, htable, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
		    ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}

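/*
 * Tag construction follows the GCM spec: the lengths of the AAD and the
 * ciphertext (in bits) are folded into GHASH, the saved pre-counter block J0
 * is encrypted with AES, and the XOR of the two yields the tag, of which the
 * first gcm_tag_len bytes are emitted.
 */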
/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}

static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */

/*
 * Finalize decryption: We have just accumulated ciphertext, so now we
 * decrypt it here in place.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_decrypt_avx2 :
#endif
	    aesni_gcm_decrypt_avx;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = decrypt_blocks(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = decrypt_blocks(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}

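/*
 * The Htable allocated below caches precomputed multiples of the hash subkey
 * H so the assembly GHASH routines can aggregate several blocks per
 * reduction; its size differs between the AVX and AVX2/VAES code paths.
 */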
Save the1594* initial counter block.1595*/1596static int1597gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,1598const uint8_t *auth_data, size_t auth_data_len, size_t block_size)1599{1600uint8_t *cb = (uint8_t *)ctx->gcm_cb;1601uint64_t *H = ctx->gcm_H;1602const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;1603int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;1604const uint8_t *datap = auth_data;1605size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;1606size_t bleft;16071608ASSERT(block_size == GCM_BLOCK_LEN);1609ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,1610B_FALSE);16111612size_t htab_len = 0;1613#if CAN_USE_GCM_ASM >= 21614if (ctx->impl == GCM_IMPL_AVX2) {1615/*1616* BoringSSL's API specifies uint128_t[16] for htab; but only1617* uint128_t[12] are used.1618* See https://github.com/google/boringssl/blob/1619* 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/1620* modes/asm/aes-gcm-avx2-x86_64.pl#L198-L2001621*/1622htab_len = (2 * 8 * sizeof (uint128_t));1623} else1624#endif /* CAN_USE_GCM_ASM >= 2 */1625{1626htab_len = (2 * 6 * sizeof (uint128_t));1627}16281629ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);1630if (ctx->gcm_Htable == NULL) {1631return (CRYPTO_HOST_MEMORY);1632}16331634/* Init H (encrypt zero block) and create the initial counter block. */1635memset(H, 0, sizeof (ctx->gcm_H));1636kfpu_begin();1637aes_encrypt_intel(keysched, aes_rounds,1638(const uint32_t *)H, (uint32_t *)H);16391640#if CAN_USE_GCM_ASM >= 21641if (ctx->impl == GCM_IMPL_AVX2) {1642gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);1643} else1644#endif /* if CAN_USE_GCM_ASM >= 2 */1645{1646gcm_init_htab_avx(ctx->gcm_Htable, H);1647}16481649if (iv_len == 12) {1650memcpy(cb, iv, 12);1651cb[12] = 0;1652cb[13] = 0;1653cb[14] = 0;1654cb[15] = 1;1655/* We need the ICB later. */1656memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));1657} else {1658/*1659* Most consumers use 12 byte IVs, so it's OK to use the1660* original routines for other IV sizes, just avoid nesting1661* kfpu_begin calls.1662*/1663clear_fpu_regs();1664kfpu_end();1665gcm_format_initial_blocks(iv, iv_len, ctx, block_size,1666aes_copy_block, aes_xor_block);1667kfpu_begin();1668}16691670memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));16711672/* Openssl post increments the counter, adjust for that. */1673gcm_incr_counter_block(ctx);16741675/* Ghash AAD in chunk_size blocks. */1676for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {1677GHASH_AVX(ctx, datap, chunk_size);1678datap += chunk_size;1679clear_fpu_regs();1680kfpu_end();1681kfpu_begin();1682}1683/* Ghash the remainder and handle possible incomplete GCM block. */1684if (bleft > 0) {1685size_t incomp = bleft % block_size;16861687bleft -= incomp;1688if (bleft > 0) {1689GHASH_AVX(ctx, datap, bleft);1690datap += bleft;1691}1692if (incomp > 0) {1693/* Zero pad and hash incomplete last block. 
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}

#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(__KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */