/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
 * file must define the following parameters to specify the type of CRC:
 *
 * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *	       mapping between bits and polynomial coefficients
 *	    1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *	       mapping between bits and polynomial coefficients
 */

#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */

/*
 * Zbc clmul: the low BITS_PER_LONG bits of the (2*BITS_PER_LONG - 1)-bit
 * carryless product of @a and @b.
 */
static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

/*
 * Zbc clmulh: the high BITS_PER_LONG bits of the carryless product of @a and
 * @b, i.e. bits [2*BITS_PER_LONG - 1 : BITS_PER_LONG] (the top bit is always
 * 0 since the full product is only 2*BITS_PER_LONG - 1 bits wide).
 */
static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

/*
 * Zbc clmulr: bits [2*BITS_PER_LONG - 2 : BITS_PER_LONG - 1] of the carryless
 * product of @a and @b, i.e. the "reversed" variant.  This provides exactly
 * the floored division by x^(BITS_PER_LONG-1) needed by msb-first Barrett
 * reduction (see crc_clmul_long() below).
 */
static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
#  if LSB_CRC
#    define crc_load_long(x)	le64_to_cpup(x)
#  else
#    define crc_load_long(x)	be64_to_cpup(x)
#  endif
#else
#  if LSB_CRC
#    define crc_load_long(x)	le32_to_cpup(x)
#  else
#    define crc_load_long(x)	be32_to_cpup(x)
#  endif
#endif

/* XOR @crc into the end of @msgpoly that represents the high-order terms.
*/75static inline unsigned long76crc_clmul_prep(crc_t crc, unsigned long msgpoly)77{78#if LSB_CRC79return msgpoly ^ crc;80#else81return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));82#endif83}8485/*86* Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it87* modulo the generator polynomial G. This gives the CRC of @msgpoly.88*/89static inline crc_t90crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)91{92unsigned long tmp;9394/*95* First step of Barrett reduction with integrated multiplication by96* x^n: calculate floor((msgpoly * x^n) / G). This is the value by97* which G needs to be multiplied to cancel out the x^n and higher terms98* of msgpoly * x^n. Do it using the following formula:99*100* msb-first:101* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))102* lsb-first:103* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)104*105* barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),106* which fits a long exactly. Using any lower power of x there would107* not carry enough precision through the calculation, while using any108* higher power of x would require extra instructions to handle a wider109* multiplication. In the msb-first case, using this power of x results110* in needing a floored division by x^(BITS_PER_LONG-1), which matches111* what clmulr produces. 
In the lsb-first case, a factor of x gets112* implicitly introduced by each carryless multiplication (shown as113* '* x' above), and the floored division instead needs to be by114* x^BITS_PER_LONG which matches what clmul produces.115*/116#if LSB_CRC117tmp = clmul(msgpoly, consts->barrett_reduction_const_1);118#else119tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);120#endif121122/*123* Second step of Barrett reduction:124*125* crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))126*127* This reduces (msgpoly * x^n) modulo G by adding the appropriate128* multiple of G to it. The result uses only the x^0..x^(n-1) terms.129* HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those130* terms in the first place, it is more efficient to do the equivalent:131*132* crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n133*134* In the lsb-first case further modify it to the following which avoids135* a shift, as the crc ends up in the physically low n bits from clmulr:136*137* product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x138* crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n139*140* barrett_reduction_const_2 contains the constant multiplier (G - x^n)141* or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The142* cast of the result to crc_t is essential, as it applies the mod x^n!143*/144#if LSB_CRC145return clmulr(tmp, consts->barrett_reduction_const_2);146#else147return clmul(tmp, consts->barrett_reduction_const_2);148#endif149}150151/* Update @crc with the data from @msgpoly. */152static inline crc_t153crc_clmul_update_long(crc_t crc, unsigned long msgpoly,154const struct crc_clmul_consts *consts)155{156return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);157}158159/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. 
*/
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

	/*
	 * Gather the bytes into a long, placing them so that the message's
	 * bit order matches the CRC's bit order: lsb-first data occupies the
	 * physically high end of the long (later bytes shifted in from the
	 * top), msb-first data occupies the physically low end.
	 */
#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
		/*
		 * The CRC fits entirely within the 8*len message bits, so XOR
		 * it into the end representing the high-order terms and
		 * reduce in one step.
		 */
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
	/*
	 * Fewer message bits than CRC bits: only part of the CRC overlaps the
	 * message polynomial.  XOR in the overlapping part, reduce, then XOR
	 * back in the non-overlapping part (which is unaffected by the
	 * reduction since it only multiplies by x^(8*len) < x^n).
	 */
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

/*
 * Update @crc with the @len data bytes at @p, where @p may be misaligned.
 * This is the top-level CRC computation routine generated by this template.
 */
static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	/*
	 * The fold loop below consumes 2 longs per iteration and needs 2
	 * longs of lookahead, hence the threshold of 4 longs.
	 */
	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that
		 * uses just m0 and m1 again.  This is done by multiplying m0
		 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
		 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
		 * then adding the results to m2 and m3 as appropriate.  Each
		 * such multiplication produces a result twice the length of a
		 * long, which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			/*
			 * Which product half holds the terms that align with
			 * the next data long depends on the bit order.
			 */
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		/* Collapse the remaining 2-long state into the CRC. */
		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	/* Process any remaining whole longs, one at a time. */
	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	/* Process the final partial long, if any. */
	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}