Path: blob/master/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define TWEAKP		a4

#define LEN32		a5
#define TAIL_LEN	a6
#define VL		a7
#define VLMAX		t4

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS		v16	// LMUL=4 (most of the time)
#define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
#define MULTS_BREV	v24	// LMUL=4 (most of the time)
#define TMP0		v28
#define TMP1		v29
#define TMP2		v30
#define TMP3		v31

// xts_init initializes the following values:
//
//	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//	TWEAKS_BREV: same as TWEAKS, but bit-reversed
//	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte.  The zvkg extension provides the vgmul
// instruction which does multiplication in this field.  Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring.  Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
.macro	xts_init

	// Load the first tweak T.
	vsetivli	zero, 4, e32, m1, ta, ma
	vle32.v		TWEAKS, (TWEAKP)

	// If there's only one block (or no blocks at all), then skip the tweak
	// sequence computation because (at most) T itself is needed.
	li		t0, 16
	ble		LEN, t0, .Linit_single_block\@

	// Save a copy of T bit-reversed in v12.
	vbrev8.v	v12, TWEAKS

	//
	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
	// that N <= 128.  Though, this code actually requires N < 64 (or
	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
	// values here and in the x^N computation later.
	//
	vsetvli		VL, LEN32, e32, m4, ta, ma
	srli		t0, VL, 2	// t0 = N (num blocks)
	// Generate two sequences, each with N 32-bit values:
	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
	vsetvli		zero, t0, e32, m1, ta, ma
	vmv.v.i		v0, 1
	vid.v		v1
	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
	// as two sequences, each with 2*N 32-bit values:
	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
	vsetvli		zero, t0, e64, m2, ta, ma
	vzext.vf2	v2, v0
	vzext.vf2	v4, v1
	slli		t1, t0, 1	// t1 = 2*N
	vsetvli		zero, t1, e32, m2, ta, ma
	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
	// widening to 64 bits per element.  When reinterpreted as N 128-bit
	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
	vwsll.vv	v8, v2, v4

	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		TWEAKS_BREV, 0
	vaesz.vs	TWEAKS_BREV, v12
	vbrev8.v	v8, v8
	vgmul.vv	TWEAKS_BREV, v8

	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
	li		t1, 1
	sll		t1, t1, t0	// t1 = 1 << N
	vsetivli	zero, 2, e64, m1, ta, ma
	vmv.v.i		v0, 0
	vsetivli	zero, 1, e64, m1, tu, ma
	vmv.v.x		v0, t1
	vbrev8.v	v0, v0
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vaesz.vs	MULTS_BREV, v0

	j		.Linit_done\@

.Linit_single_block\@:
	vbrev8.v	TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm
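
// For reference, a minimal scalar C sketch (illustrative only, not part of this
// file) of the conventional serial tweak update that the vgmul-based approach
// above replaces: multiplying the tweak by x in GF(2^128) with the XTS
// polynomial x^128 + x^7 + x^2 + x + 1, one step per block.  The helper name
// xts_mul_x() is hypothetical:
//
//	static void xts_mul_x(u8 t[16])
//	{
//		u8 carry = 0;
//
//		for (int i = 0; i < 16; i++) {
//			u8 next_carry = t[i] >> 7;
//
//			t[i] = (t[i] << 1) | carry;	/* shift the 128-bit value left */
//			carry = next_carry;
//		}
//		if (carry)
//			t[0] ^= 0x87;	/* reduce modulo the XTS polynomial */
//	}
//
// xts_init instead computes N such tweaks at once using vgmul, which operates
// in the bit-reversed representation required by the Zvkg field convention,
// hence the vbrev8 instructions.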
// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
// the multiplier required to advance the tweak by one.
.macro	load_x
	li		t0, 0x40
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vsetivli	zero, 1, e8, m1, tu, ma
	vmv.v.x		MULTS_BREV, t0
.endm

.macro	__aes_xts_crypt	enc, keylen
	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
	beqz		LEN32, .Lcts_without_main_loop\@

	vsetvli		VLMAX, zero, e32, m4, ta, ma
1:
	vsetvli		VL, LEN32, e32, m4, ta, ma
2:
	// Encrypt or decrypt VL/4 blocks.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TWEAKS
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TWEAKS
	vse32.v		TMP0, (OUTP)

	// Update the pointers and the remaining length.
	slli		t0, VL, 2
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	sub		LEN32, LEN32, VL

	// Check whether more blocks remain.
	beqz		LEN32, .Lmain_loop_done\@

	// Compute the next sequence of tweaks by multiplying the previous
	// sequence by x^N.  Store the result in both bit-reversed order and
	// regular order (i.e. with the bit reversal undone).
	vgmul.vv	TWEAKS_BREV, MULTS_BREV
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Since we compute the tweak multipliers x^N in advance, we require
	// that each iteration process the same length except possibly the last.
	// This conflicts slightly with the behavior allowed by the RISC-V Vector
	// Extension, where CPUs can select a lower length for both of the last
	// two iterations.  E.g., vl might take the sequence of values
	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
	// keep the vl at VLMAX if there is at least VLMAX remaining.
	bge		LEN32, VLMAX, 2b
	j		1b

.Lmain_loop_done\@:
	load_x

	// Compute the next tweak.
	addi		t0, VL, -4
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
	vsetivli	zero, 4, e32, m1, ta, ma
	vgmul.vv	TWEAKS_BREV, MULTS_BREV		// Advance to next tweak

	bnez		TAIL_LEN, .Lcts\@

	// Update *TWEAKP to contain the next tweak.
	vbrev8.v	TWEAKS, TWEAKS_BREV
	vse32.v		TWEAKS, (TWEAKP)
	ret

.Lcts_without_main_loop\@:
	load_x
.Lcts\@:
	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.v		TMP0, TWEAKS_BREV
	vgmul.vv	TMP0, MULTS_BREV
	// Undo the bit reversal of the next two tweaks and store them in TMP1
	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
	vbrev8.v	TMP1, TWEAKS_BREV
	vbrev8.v	TMP2, TMP0
.else
	vbrev8.v	TMP1, TMP0
	vbrev8.v	TMP2, TWEAKS_BREV
.endif

	// Encrypt/decrypt the last full block.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TMP1
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP1

	// Swap the first TAIL_LEN bytes of the above result with the tail.
	// Note that to support in-place encryption/decryption, the load from
	// the input tail must happen before the store to the output tail.
	addi		t0, INP, 16
	addi		t1, OUTP, 16
	vmv.v.v		TMP3, TMP0
	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
	vle8.v		TMP0, (t0)
	vse8.v		TMP3, (t1)

	// Encrypt/decrypt again and store the last full block.
	vsetivli	zero, 4, e32, m1, ta, ma
	vxor.vv		TMP0, TMP0, TMP2
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP2
	vse32.v		TMP0, (OUTP)

	ret
.endm
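
// For reference, the ciphertext stealing tail above corresponds to the
// following scalar C sketch for the encrypt case (illustrative only, not part
// of this file).  xor_block(dst, a, b) is a hypothetical helper that XORs two
// 16-byte blocks into dst, aes_encrypt_block() is a hypothetical single-block
// AES encryption with the expanded data key, and tweak1/tweak2 correspond to
// TMP1/TMP2 above.  'in' and 'out' point to the last full block, with tail_len
// extra bytes following it:
//
//	u8 cc[16], tail[16];
//
//	xor_block(cc, in, tweak1);		/* tweak of the last full block */
//	aes_encrypt_block(key, cc, cc);
//	xor_block(cc, cc, tweak1);		/* cc = "stolen" ciphertext block */
//
//	memcpy(tail, in + 16, tail_len);	/* read the input tail first so */
//	memcpy(out + 16, cc, tail_len);		/*   in-place operation works   */
//	memcpy(cc, tail, tail_len);		/* graft the plaintext tail onto cc */
//
//	xor_block(cc, cc, tweak2);		/* the following tweak */
//	aes_encrypt_block(key, cc, cc);
//	xor_block(cc, cc, tweak2);
//	memcpy(out, cc, 16);			/* last full ciphertext block */
//
// Decryption follows the same structure, but with AES decryption and with the
// two tweaks used in the opposite order, as selected by the .if \enc above.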
.macro	aes_xts_crypt	enc

	// Check whether the length is a multiple of the AES block size.
	andi		TAIL_LEN, LEN, 15
	beqz		TAIL_LEN, 1f

	// The length isn't a multiple of the AES block size, so ciphertext
	// stealing will be required.  Ciphertext stealing involves special
	// handling of the partial block and the last full block, so subtract
	// the length of both from the length to be processed in the main loop.
	sub		LEN, LEN, TAIL_LEN
	addi		LEN, LEN, -16
1:
	srli		LEN32, LEN, 2
	// LEN and LEN32 now contain the total length of the blocks that will be
	// processed in the main loop, in bytes and 32-bit words respectively.

	xts_init
	aes_begin	KEYP, 128f, 192f
	__aes_xts_crypt	\enc, 256
128:
	__aes_xts_crypt	\enc, 128
192:
	__aes_xts_crypt	\enc, 192
.endm
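
// Worked example of the length bookkeeping above (illustrative): for len = 100,
// TAIL_LEN = 100 & 15 = 4, so LEN becomes 100 - 4 - 16 = 80 and LEN32 = 20.
// The main loop then processes 80 bytes (5 full blocks), and the ciphertext
// stealing path handles the remaining full block plus the 4-byte tail.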
// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//                                       const u8 *in, u8 *out, size_t len,
//                                       u8 tweak[16]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function.
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)
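
// Illustrative caller sketch (C, not part of this file): the calling code is
// expected to expand the data key, derive the initial tweak by encrypting the
// IV with the tweak key, and enable kernel-mode vector around the call.  The
// helper names expand_key(), encrypt_block(), and vector_begin()/vector_end()
// below are placeholders for whatever the surrounding driver uses:
//
//	u8 tweak[16];
//
//	expand_key(&data_ctx, data_key, keylen);
//	expand_key(&tweak_ctx, tweak_key, keylen);
//	encrypt_block(&tweak_ctx, tweak, iv);	/* tweak = E_tweakkey(IV) */
//
//	vector_begin();
//	aes_xts_encrypt_zvkned_zvbb_zvkg(&data_ctx, src, dst, len, tweak);
//	vector_end();
//
// On a call where len is a multiple of 16, |tweak| is updated so that a later
// call can continue the message incrementally.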