Path: arch/x86/crypto/sm4-aesni-avx2-asm_64.S

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <[email protected]>
 * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
 * Copyright (c) 2021 Tianjia Zhang <[email protected]>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

/* vector registers */
#define RX0          %ymm0
#define RX1          %ymm1
#define MASK_4BIT    %ymm2
#define RTMP0        %ymm3
#define RTMP1        %ymm4
#define RTMP2        %ymm5
#define RTMP3        %ymm6
#define RTMP4        %ymm7

#define RA0          %ymm8
#define RA1          %ymm9
#define RA2          %ymm10
#define RA3          %ymm11

#define RB0          %ymm12
#define RB1          %ymm13
#define RB2          %ymm14
#define RB3          %ymm15

#define RNOT         %ymm0
#define RBSWAP       %ymm1

#define RX0x         %xmm0
#define RX1x         %xmm1
#define MASK_4BITx   %xmm2

#define RNOTx        %xmm0
#define RBSWAPx      %xmm1

#define RTMP0x       %xmm3
#define RTMP1x       %xmm4
#define RTMP2x       %xmm5
#define RTMP3x       %xmm6
#define RTMP4x       %xmm7


/* helper macros */

/* Transpose four 32-bit words between 128-bit vector lanes. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)           \
        vpunpckhdq x1, x0, t2;                          \
        vpunpckldq x1, x0, x0;                          \
                                                        \
        vpunpckldq x3, x2, t1;                          \
        vpunpckhdq x3, x2, x2;                          \
                                                        \
        vpunpckhqdq t1, x0, x1;                         \
        vpunpcklqdq t1, x0, x0;                         \
                                                        \
        vpunpckhqdq x2, t2, x3;                         \
        vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)    \
        vpand x, mask4bit, tmp0;                        \
        vpandn x, mask4bit, x;                          \
        vpsrld $4, x, x;                                \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction. */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)   \
        vpandn mask4bit, x, tmp0;                       \
        vpsrld $4, x, x;                                \
        vpand x, mask4bit, x;                           \
                                                        \
        vpshufb tmp0, lo_t, tmp0;                       \
        vpshufb x, hi_t, x;                             \
        vpxor tmp0, x, x;


.section .rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
        .quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
        .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
        .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
        .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
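
/*
 * Informal sketch of how the tables above are used (documentation only;
 * pre_affine()/post_affine() are illustrative names, not symbols in this
 * file):
 *
 *      sm4_sbox(x) == post_affine(aes_sbox(pre_affine(x)))
 *
 * The transform_pre()/transform_post() macros apply the affine maps encoded
 * by .Lpre_tf_{lo,hi}_s and .Lpost_tf_{lo,hi}_s with two 4-bit vpshufb
 * look-ups (low and high nibble). vaesenclast supplies the AES SubBytes
 * step; the ShiftRows permutation it also performs is undone by the
 * .Linv_shift_row* masks below, and the XOR with MASK_4BIT (passed to
 * vaesenclast as the round key) is compensated in transform_post().
 */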

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
        .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
        .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
        .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
        .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
        .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
        .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
        .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef

.text
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
        /* input:
         *      %rdi: round key array, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
         *                                              plaintext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
         *                                              ciphertext blocks
         */
        FRAME_BEGIN

        vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)            \
        vpbroadcastd (4*(round))(%rdi), RX0;                    \
        vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4;               \
        vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1;               \
        vmovdqa RX0, RX1;                                       \
        vpxor s1, RX0, RX0;                                     \
        vpxor s2, RX0, RX0;                                     \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */             \
        vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2;              \
        vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3;              \
        vpxor r1, RX1, RX1;                                     \
        vpxor r2, RX1, RX1;                                     \
        vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */             \
                                                                \
        /* sbox, non-linear part */                             \
        transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);     \
        vextracti128 $1, RX0, RTMP4x;                           \
        vextracti128 $1, RX1, RTMP0x;                           \
        vaesenclast MASK_4BITx, RX0x, RX0x;                     \
        vaesenclast MASK_4BITx, RTMP4x, RTMP4x;                 \
        vaesenclast MASK_4BITx, RX1x, RX1x;                     \
        vaesenclast MASK_4BITx, RTMP0x, RTMP0x;                 \
        vinserti128 $1, RTMP4x, RX0, RX0;                       \
        vbroadcasti128 .Linv_shift_row rRIP, RTMP4;             \
        vinserti128 $1, RTMP0x, RX1, RX1;                       \
        transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
        transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);    \
                                                                \
        /* linear part */                                       \
        vpshufb RTMP4, RX0, RTMP0;                              \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                       \
        vpshufb RTMP4, RX1, RTMP2;                              \
        vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4;       \
        vpxor RTMP2, r0, r0; /* r0 ^ x */                       \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4;      \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */           \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4;      \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX0, RTMP1;                              \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */           \
        vpslld $2, RTMP0, RTMP1;                                \
        vpsrld $30, RTMP0, RTMP0;                               \
        vpxor RTMP0, s0, s0;                                    \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP1, s0, s0;                                    \
        vpshufb RTMP4, RX1, RTMP3;                              \
        vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */           \
        vpslld $2, RTMP2, RTMP3;                                \
        vpsrld $30, RTMP2, RTMP2;                               \
        vpxor RTMP2, r0, r0;                                    \
        /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP3, r0, r0;
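
/*
 * What one ROUND() invocation above computes, per 32-bit state word
 * (informal C-like sketch for documentation only; sm4_sbox32() and rol32()
 * are illustrative helpers, not code in this file):
 *
 *      u32 x = s1 ^ s2 ^ s3 ^ rk;
 *      x = sm4_sbox32(x);              // S-box applied to each of the 4 bytes
 *      s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * The r0..r3 group is processed the same way in parallel, and each 256-bit
 * register holds one state word of eight blocks, so sixteen blocks are
 * handled per pass.
 */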

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk8;

#undef ROUND

        vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        FRAME_END
        RET;
SYM_FUNC_END(__sm4_crypt_blk16)

#define inc_le128(x, minus_one, tmp)    \
        vpcmpeqq minus_one, x, tmp;     \
        vpsubq minus_one, x, x;         \
        vpslldq $8, tmp, tmp;           \
        vpsubq tmp, x, x;
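
/*
 * inc_le128() increments each 128-bit lane of x by one, treating the lane
 * as a little-endian 128-bit integer. Informal per-lane C model
 * (documentation only):
 *
 *      lo += 1;
 *      if (lo == 0)            // low 64 bits wrapped, propagate carry
 *              hi += 1;
 *
 * minus_one must hold all-ones in the low qword and zero in the high qword;
 * tmp is clobbered. The CTR code below byte-swaps the big-endian IV with
 * .Lbswap128_mask first, so this little-endian increment steps the counter
 * correctly.
 */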

/*
 * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        movq 8(%rcx), %rax;
        bswapq %rax;

        vzeroupper;

        vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
        vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */

        /* load IV and byteswap */
        vmovdqu (%rcx), RTMP4x;
        vpshufb RTMP3x, RTMP4x, RTMP4x;
        vmovdqa RTMP4x, RTMP0x;
        inc_le128(RTMP4x, RNOTx, RTMP1x);
        vinserti128 $1, RTMP4x, RTMP0, RTMP0;
        vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */

        /* check need for handling 64-bit overflow and carry */
        cmpq $(0xffffffffffffffff - 16), %rax;
        ja .Lhandle_ctr_carry;

        /* construct IVs */
        vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
        vpshufb RTMP3, RTMP0, RA1;
        vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
        vpshufb RTMP3, RTMP0, RA2;
        vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
        vpshufb RTMP3, RTMP0, RA3;
        vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
        vpshufb RTMP3, RTMP0, RB0;
        vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
        vpshufb RTMP3, RTMP0, RB1;
        vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
        vpshufb RTMP3, RTMP0, RB2;
        vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
        vpshufb RTMP3, RTMP0, RB3;
        vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
        vpshufb RTMP3x, RTMP0x, RTMP0x;

        jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
        inc_le128(RTMP0, RNOT, RTMP1);
        vextracti128 $1, RTMP0, RTMP0x;
        vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */

.align 4
.Lctr_carry_done:
        /* store new IV */
        vmovdqu RTMP0x, (%rcx);

        call __sm4_crypt_blk16;

        vpxor (0 * 32)(%rdx), RA0, RA0;
        vpxor (1 * 32)(%rdx), RA1, RA1;
        vpxor (2 * 32)(%rdx), RA2, RA2;
        vpxor (3 * 32)(%rdx), RA3, RA3;
        vpxor (4 * 32)(%rdx), RB0, RB0;
        vpxor (5 * 32)(%rdx), RB1, RB1;
        vpxor (6 * 32)(%rdx), RB2, RB2;
        vpxor (7 * 32)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA3, (3 * 32)(%rsi);
        vmovdqu RB0, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB3, (7 * 32)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)

/*
 * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        vzeroupper;

        vmovdqu (0 * 32)(%rdx), RA0;
        vmovdqu (1 * 32)(%rdx), RA1;
        vmovdqu (2 * 32)(%rdx), RA2;
        vmovdqu (3 * 32)(%rdx), RA3;
        vmovdqu (4 * 32)(%rdx), RB0;
        vmovdqu (5 * 32)(%rdx), RB1;
        vmovdqu (6 * 32)(%rdx), RB2;
        vmovdqu (7 * 32)(%rdx), RB3;

        call __sm4_crypt_blk16;

        vmovdqu (%rcx), RNOTx;
        vinserti128 $1, (%rdx), RNOT, RNOT;
        vpxor RNOT, RA0, RA0;
        vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
        vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
        vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
        vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
        vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
        vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
        vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
        vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
        vmovdqu RNOTx, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA3, (3 * 32)(%rsi);
        vmovdqu RB0, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB3, (7 * 32)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
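
/*
 * Caller-side sketch (illustrative only; the actual glue code lives in C):
 * both entry points process exactly 16 SM4 blocks (256 bytes) per call and
 * clobber YMM state, so kernel callers are expected to bracket the calls
 * with kernel_fpu_begin()/kernel_fpu_end(), roughly:
 *
 *      kernel_fpu_begin();
 *      sm4_aesni_avx2_ctr_enc_blk16(rk, dst, src, iv); // rk: 32 round keys
 *      kernel_fpu_end();
 */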