// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * POLYVAL library functions
 *
 * Copyright 2025 Google LLC
 */

#include <crypto/polyval.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/*
 * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
 * interprets the message as the coefficients of a polynomial in GF(2^128) and
 * evaluates that polynomial at a secret point. POLYVAL has a simple
 * mathematical relationship with GHASH, but it uses a better field convention
 * which makes it easier and faster to implement.
 *
 * POLYVAL is not a cryptographic hash function, and it should be used only by
 * algorithms that are specifically designed to use it.
 *
 * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
 * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452).
 *
 * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
 * (https://eprint.iacr.org/2021/1441.pdf).
 *
 * This file provides a library API for POLYVAL. This API can delegate to
 * either a generic implementation or an architecture-optimized implementation.
 *
 * For the generic implementation, we don't use the traditional table approach
 * to GF(2^128) multiplication. That approach is not constant-time and requires
 * a lot of memory. Instead, we use a different approach which emulates
 * carryless multiplication using standard multiplications by spreading the
 * data bits apart using "holes". This allows the carries to spill harmlessly.
 * This approach is borrowed from BoringSSL, which in turn credits BearSSL's
 * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
 * "holes" trick and a presentation by Shay Gueron
 * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
 * 256-bit => 128-bit reduction algorithm.
 */

#ifdef CONFIG_ARCH_SUPPORTS_INT128

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large
	 * to fit in 4 bits. Carries would sometimes overflow into the next
	 * term.
	 *
	 * Using one term every 5 bits would work. However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column. Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110;
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/* Multiply the low 4 bits of @a by @b. */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/* Add all the intermediate products together. */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
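
/*
 * A miniature example of the "holes" trick used above (for illustration only):
 * with one term every 4 bits, the column sums in an ordinary multiplication
 * stay within their own 4-bit slot, and the low bit of each slot is the GF(2)
 * (i.e. XOR) sum of that column.  For instance:
 *
 *	0x11 * 0x11 = 0x121, and 0x121 & 0x111 = 0x101
 *
 * which matches the carryless product (x^4 + 1) * (x^4 + 1) = x^8 + 1.  The
 * carry from the doubled x^4 term spilled harmlessly into bit 5, a "hole" that
 * the mask then discards.  The 0x1111... masks applied to @c0 through @c3
 * above extract the carry-free bits in exactly the same way.
 */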

#else /* CONFIG_ARCH_SUPPORTS_INT128 */

/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner. The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/* Add all the intermediate products together. */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	u32 a_lo = (u32)a;
	u32 a_hi = a >> 32;
	u32 b_lo = (u32)b;
	u32 b_hi = b >> 32;

	/* Karatsuba multiplication */
	u64 lo = clmul32(a_lo, b_lo);
	u64 hi = clmul32(a_hi, b_hi);
	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;

	*out_lo = lo ^ (mi << 32);
	*out_hi = hi ^ (mi >> 32);
}
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
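
/*
 * A note on the Karatsuba step used by the fallback clmul64() above and by
 * polyval_mul_generic() below: in GF(2), addition is XOR, so splitting each
 * operand into halves gives
 *
 *	(ah*X + al) * (bh*X + bl)
 *		= ah*bh*X^2
 *		  + ((ah ^ al) * (bh ^ bl) ^ ah*bh ^ al*bl) * X
 *		  + al*bl
 *
 * where X = x^32 (or x^64 in polyval_mul_generic()).  The middle term is
 * derived from the two end products, so only 3 half-width carryless
 * multiplications are needed instead of 4.
 */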

/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication. Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
	 * of which cancels out 64 bits. Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 *	(c0, c1, c2) = (0,
	 *			c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *			c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 *	(c1, c2, c3) = (0,
	 *			c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *			c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3). This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
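
/*
 * Why returning (c2, c3) implicitly multiplies by x^-128 (explanatory note):
 * the two reduction steps above XOR c0 * G(x), and then the updated
 * c1 * x^64 * G(x), into the 256-bit product P(x).  That leaves the value
 * modulo G(x) unchanged but forces the low 128 bits to zero, so the result is
 * R(x) * x^128 with R(x) = c2 + c3*x^64, and therefore
 *
 *	R(x) = P(x) * x^-128  (mod G(x))
 *
 * which is exactly the x^-128 factor that the POLYVAL field multiplication
 * requires.
 */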

static void __maybe_unused
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
		       const u8 *data, size_t nblocks)
{
	do {
		acc->lo ^= get_unaligned((const __le64 *)data);
		acc->hi ^= get_unaligned((const __le64 *)(data + 8));
		polyval_mul_generic(acc, key);
		data += POLYVAL_BLOCK_SIZE;
	} while (--nblocks);
}

/* Include the arch-optimized implementation of POLYVAL, if one is available. */
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
#include "polyval.h" /* $(SRCARCH)/polyval.h */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
	polyval_preparekey_arch(key, raw_key);
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
#endif /* Else, polyval_preparekey() is an inline function. */

/*
 * polyval_mul_generic() and polyval_blocks_generic() take the key as a
 * polyval_elem rather than a polyval_key, so that arch-optimized
 * implementations with a different key format can use them as a fallback (if
 * they have H^1 stored somewhere in their struct). Thus, the following
 * dispatch code is needed to pass the appropriate key argument.
 */

static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_mul_arch(&ctx->acc, ctx->key);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}

static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}

void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	/* Finish filling a previously buffered partial block, if any. */
	if (unlikely(ctx->partial)) {
		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}
	/* Process as many whole blocks as possible. */
	if (len >= POLYVAL_BLOCK_SIZE) {
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
		len &= POLYVAL_BLOCK_SIZE - 1;
	}
	/* Buffer any remaining partial block. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);

void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	/* Process any final partial block. */
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);

#ifdef polyval_mod_init_arch
static int __init polyval_mod_init(void)
{
	polyval_mod_init_arch();
	return 0;
}
subsys_initcall(polyval_mod_init);

static void __exit polyval_mod_exit(void)
{
}
module_exit(polyval_mod_exit);
#endif

MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
MODULE_LICENSE("GPL");
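
/*
 * Example usage of this library (an illustrative sketch; it assumes that
 * polyval_init() is the context-initialization helper declared along with
 * polyval_update() and polyval_final() in <crypto/polyval.h>, and that
 * raw_key, message, and message_len are supplied by the caller):
 *
 *	struct polyval_key key;
 *	struct polyval_ctx ctx;
 *	u8 digest[POLYVAL_BLOCK_SIZE];
 *
 *	polyval_preparekey(&key, raw_key);
 *	polyval_init(&ctx, &key);
 *	polyval_update(&ctx, message, message_len);
 *	polyval_final(&ctx, digest);
 *
 * Note that polyval_final() also zeroizes the context, as seen above.
 */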