/* SPDX-License-Identifier: GPL-2.0 */1/*2* Implementation of POLYVAL using ARMv8 Crypto Extensions.3*4* Copyright 2021 Google LLC5*/6/*7* This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions8* It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,9* ..., h^1 in the POLYVAL finite field. This precomputation allows us to split10* finite field multiplication into two steps.11*12* In the first step, we consider h^i, m_i as normal polynomials of degree less13* than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication14* is simply polynomial multiplication.15*16* In the second step, we compute the reduction of p(x) modulo the finite field17* modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.18*19* This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where20* multiplication is finite field multiplication. The advantage is that the21* two-step process only requires 1 finite field reduction for every 822* polynomial multiplications. Further parallelism is gained by interleaving the23* multiplications and polynomial reductions.24*/2526#include <linux/linkage.h>27#define STRIDE_BLOCKS 82829KEY_POWERS .req x030MSG .req x131BLOCKS_LEFT .req x232ACCUMULATOR .req x333KEY_START .req x1034EXTRA_BYTES .req x1135TMP .req x133637M0 .req v038M1 .req v139M2 .req v240M3 .req v341M4 .req v442M5 .req v543M6 .req v644M7 .req v745KEY8 .req v846KEY7 .req v947KEY6 .req v1048KEY5 .req v1149KEY4 .req v1250KEY3 .req v1351KEY2 .req v1452KEY1 .req v1553PL .req v1654PH .req v1755TMP_V .req v1856LO .req v2057MI .req v2158HI .req v2259SUM .req v2360GSTAR .req v246162.text6364.arch armv8-a+crypto65.align 46667.Lgstar:68.quad 0xc200000000000000, 0xc2000000000000006970/*71* Computes the product of two 128-bit polynomials in X and Y and XORs the72* components of the 256-bit product into LO, MI, HI.73*74* Given:75* X = [X_1 : X_0]76* Y = [Y_1 : Y_0]77*78* We compute:79* LO += X_0 * Y_080* MI += (X_0 + X_1) * (Y_0 + Y_1)81* HI += X_1 * Y_182*83* Later, the 256-bit result can be extracted as:84* [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]85* This step is done when computing the polynomial reduction for efficiency86* reasons.87*88* Karatsuba multiplication is used instead of Schoolbook multiplication because89* it was found to be slightly faster on ARM64 CPUs.90*91*/92.macro karatsuba1 X Y93X .req \X94Y .req \Y95ext v25.16b, X.16b, X.16b, #896ext v26.16b, Y.16b, Y.16b, #897eor v25.16b, v25.16b, X.16b98eor v26.16b, v26.16b, Y.16b99pmull2 v28.1q, X.2d, Y.2d100pmull v29.1q, X.1d, Y.1d101pmull v27.1q, v25.1d, v26.1d102eor HI.16b, HI.16b, v28.16b103eor LO.16b, LO.16b, v29.16b104eor MI.16b, MI.16b, v27.16b105.unreq X106.unreq Y107.endm108109/*110* Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into111* them.112*/113.macro karatsuba1_store X Y114X .req \X115Y .req \Y116ext v25.16b, X.16b, X.16b, #8117ext v26.16b, Y.16b, Y.16b, #8118eor v25.16b, v25.16b, X.16b119eor v26.16b, v26.16b, Y.16b120pmull2 HI.1q, X.2d, Y.2d121pmull LO.1q, X.1d, Y.1d122pmull MI.1q, v25.1d, v26.1d123.unreq X124.unreq Y125.endm126127/*128* Computes the 256-bit polynomial represented by LO, HI, MI. Stores129* the result in PL, PH.130* [PH : PL] =131* [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]132*/133.macro karatsuba2134// v4 = [HI_1 + MI_1 : HI_0 + MI_0]135eor v4.16b, HI.16b, MI.16b136// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]137eor v4.16b, v4.16b, LO.16b138// v5 = [HI_0 : LO_1]139ext v5.16b, LO.16b, HI.16b, #8140// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]141eor v4.16b, v4.16b, v5.16b142// HI = [HI_0 : HI_1]143ext HI.16b, HI.16b, HI.16b, #8144// LO = [LO_0 : LO_1]145ext LO.16b, LO.16b, LO.16b, #8146// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]147ext PH.16b, v4.16b, HI.16b, #8148// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]149ext PL.16b, LO.16b, v4.16b, #8150.endm151152/*153* Computes the 128-bit reduction of PH : PL. Stores the result in dest.154*155* This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =156* x^128 + x^127 + x^126 + x^121 + 1.157*158* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the159* product of two 128-bit polynomials in Montgomery form. We need to reduce it160* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor161* of x^128, this product has two extra factors of x^128. To get it back into162* Montgomery form, we need to remove one of these factors by dividing by x^128.163*164* To accomplish both of these goals, we add multiples of g(x) that cancel out165* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low166* bits are zero, the polynomial division by x^128 can be done by right167* shifting.168*169* Since the only nonzero term in the low 64 bits of g(x) is the constant term,170* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can171* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +172* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to173* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T174* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.175*176* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits177* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1178* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *179* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :180* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).181*182* So our final computation is:183* T = T_1 : T_0 = g*(x) * P_0184* V = V_1 : V_0 = g*(x) * (P_1 + T_0)185* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0186*187* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0188* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :189* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.190*/191.macro montgomery_reduction dest192DEST .req \dest193// TMP_V = T_1 : T_0 = P_0 * g*(x)194pmull TMP_V.1q, PL.1d, GSTAR.1d195// TMP_V = T_0 : T_1196ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8197// TMP_V = P_1 + T_0 : P_0 + T_1198eor TMP_V.16b, PL.16b, TMP_V.16b199// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1200eor PH.16b, PH.16b, TMP_V.16b201// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)202pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d203eor DEST.16b, PH.16b, TMP_V.16b204.unreq DEST205.endm206207/*208* Compute Polyval on 8 blocks.209*210* If reduce is set, also computes the montgomery reduction of the211* previous full_stride call and XORs with the first message block.212* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.213* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.214*215* Sets PL, PH.216*/217.macro full_stride reduce218eor LO.16b, LO.16b, LO.16b219eor MI.16b, MI.16b, MI.16b220eor HI.16b, HI.16b, HI.16b221222ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64223ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64224225karatsuba1 M7 KEY1226.if \reduce227pmull TMP_V.1q, PL.1d, GSTAR.1d228.endif229230karatsuba1 M6 KEY2231.if \reduce232ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8233.endif234235karatsuba1 M5 KEY3236.if \reduce237eor TMP_V.16b, PL.16b, TMP_V.16b238.endif239240karatsuba1 M4 KEY4241.if \reduce242eor PH.16b, PH.16b, TMP_V.16b243.endif244245karatsuba1 M3 KEY5246.if \reduce247pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d248.endif249250karatsuba1 M2 KEY6251.if \reduce252eor SUM.16b, PH.16b, TMP_V.16b253.endif254255karatsuba1 M1 KEY7256eor M0.16b, M0.16b, SUM.16b257258karatsuba1 M0 KEY8259karatsuba2260.endm261262/*263* Handle any extra blocks after full_stride loop.264*/265.macro partial_stride266add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)267sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4268ld1 {KEY1.16b}, [KEY_POWERS], #16269270ld1 {TMP_V.16b}, [MSG], #16271eor SUM.16b, SUM.16b, TMP_V.16b272karatsuba1_store KEY1 SUM273sub BLOCKS_LEFT, BLOCKS_LEFT, #1274275tst BLOCKS_LEFT, #4276beq .Lpartial4BlocksDone277ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64278ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64279karatsuba1 M0 KEY8280karatsuba1 M1 KEY7281karatsuba1 M2 KEY6282karatsuba1 M3 KEY5283.Lpartial4BlocksDone:284tst BLOCKS_LEFT, #2285beq .Lpartial2BlocksDone286ld1 {M0.16b, M1.16b}, [MSG], #32287ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32288karatsuba1 M0 KEY8289karatsuba1 M1 KEY7290.Lpartial2BlocksDone:291tst BLOCKS_LEFT, #1292beq .LpartialDone293ld1 {M0.16b}, [MSG], #16294ld1 {KEY8.16b}, [KEY_POWERS], #16295karatsuba1 M0 KEY8296.LpartialDone:297karatsuba2298montgomery_reduction SUM299.endm300301/*302* Perform montgomery multiplication in GF(2^128) and store result in op1.303*304* Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1305* If op1, op2 are in montgomery form, this computes the montgomery306* form of op1*op2.307*308* void pmull_polyval_mul(u8 *op1, const u8 *op2);309*/310SYM_FUNC_START(pmull_polyval_mul)311adr TMP, .Lgstar312ld1 {GSTAR.2d}, [TMP]313ld1 {v0.16b}, [x0]314ld1 {v1.16b}, [x1]315karatsuba1_store v0 v1316karatsuba2317montgomery_reduction SUM318st1 {SUM.16b}, [x0]319ret320SYM_FUNC_END(pmull_polyval_mul)321322/*323* Perform polynomial evaluation as specified by POLYVAL. This computes:324* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}325* where n=nblocks, h is the hash key, and m_i are the message blocks.326*327* x0 - pointer to precomputed key powers h^8 ... h^1328* x1 - pointer to message blocks329* x2 - number of blocks to hash330* x3 - pointer to accumulator331*332* void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,333* size_t nblocks, u8 *accumulator);334*/335SYM_FUNC_START(pmull_polyval_update)336adr TMP, .Lgstar337mov KEY_START, KEY_POWERS338ld1 {GSTAR.2d}, [TMP]339ld1 {SUM.16b}, [ACCUMULATOR]340subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS341blt .LstrideLoopExit342ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64343ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64344full_stride 0345subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS346blt .LstrideLoopExitReduce347.LstrideLoop:348full_stride 1349subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS350bge .LstrideLoop351.LstrideLoopExitReduce:352montgomery_reduction SUM353.LstrideLoopExit:354adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS355beq .LskipPartial356partial_stride357.LskipPartial:358st1 {SUM.16b}, [ACCUMULATOR]359ret360SYM_FUNC_END(pmull_polyval_update)361362363