// Path: blob/main/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
// 108611 views
/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
// Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
//
// NOTE(review): this text was recovered from a listing in which viewer line
// numbers were fused into the code and statements were run together.  The
// instruction stream below is unchanged; only the one-statement-per-line
// layout was restored and explanatory comments were added.
//
// S-box sequence used throughout (see "optimize sbox using AESE"):
//   1. tbl with v26 permutes the input bytes (presumably to cancel the byte
//      shuffle AESE performs -- confirm against the generator script),
//   2. each byte is split into high/low nibbles (ushr #4 / and v31) which
//      index the 16-byte tables v27 (high) and v28 (low); results are XORed,
//   3. aese with an all-zero second operand is applied,
//   4. the nibble split is repeated with tables v29 (high) and v30 (low).
// v26..v31 are loaded from the six 16-byte rows at .Lsbox_magic by every
// public entry point before the round code runs.
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_vpsm4_ex_consts,%object
.align	7
_vpsm4_ex_consts:
// 32 words consumed one per iteration by the key-schedule loop.
.Lck:
.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
// XORed into the user key before the key-schedule loop.
.Lfk:
.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
// tbl index vector that rotates the four 32-bit lanes by one position.
.Lshuffles:
.quad	0x0B0A090807060504,0x030201000F0E0D0C
// Not referenced by any routine in this part of the file.
.Lxts_magic:
#ifndef __AARCH64EB__
.quad	0x0101010101010187,0x0101010101010101
#else
.quad	0x0101010101010101,0x0101010101010187
#endif
// Six 16-byte rows loaded into v26..v31 (row i -> v(26+i)).
.Lsbox_magic:
#ifndef __AARCH64EB__
.quad	0x0b0e0104070a0d00,0x0306090c0f020508
.quad	0x62185a2042387a00,0x22581a6002783a40
.quad	0x15df62a89e54e923,0xc10bb67c4a803df7
.quad	0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad	0x6404462679195b3b,0xe383c1a1fe9edcbc
#else
.quad	0x0306090c0f020508,0x0b0e0104070a0d00
.quad	0x22581a6002783a40,0x62185a2042387a00
.quad	0xc10bb67c4a803df7,0x15df62a89e54e923
.quad	0x1407c6d56c7fbead,0xb9aa6b78c1d21300
.quad	0xe383c1a1fe9edcbc,0x6404462679195b3b
#endif
.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_vpsm4_ex_consts,.-_vpsm4_ex_consts

//----------------------------------------------------------------------
// _vpsm4_ex_set_key -- expand a 16-byte user key into 32 round-key words.
// In:    x0 = user key (16 bytes are read)
//        x1 = round-key buffer (32 words are written)
//        w2 = non-zero: words stored at ascending addresses from x1
//             zero:     first word stored at x1+124, then descending
// Uses:  x1 (advanced), x5-x8, x9, v0-v2, v4-v7, v26-v31, flags
//----------------------------------------------------------------------
.type	_vpsm4_ex_set_key,%function
.align	4
_vpsm4_ex_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b
	mov	x6,#32			// 32 round keys to produce
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck
	movi	v0.16b,#64		// v0 is rewritten by tbl below before any read
	cbnz	w2,1f
	add	x1,x1,124		// w2 == 0: start at word 31 (31*4 bytes)
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4		// next .Lck word
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7
	// optimize sbox using AESE instruction
	mov	v4.s[0],w8
	tbl	v0.16b, {v4.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	mov	w7,v0.s[0]
	eor	w8,w7,w7,ror #19
	eor	w8,w8,w7,ror #9
	mov	w7,v5.s[0]
	eor	w8,w8,w7
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4
	b	3f
2:
	str	w8,[x1],#-4
3:
	tbl	v5.16b,{v5.16b},v7.16b	// rotate lanes for the next iteration
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_ex_set_key,.-_vpsm4_ex_set_key

//----------------------------------------------------------------------
// _vpsm4_ex_enc_4blks -- run 32 rounds (8 iterations x 4) on v4..v7.
// In:    v4,v5,v6,v7 = state vectors B0..B3 (callers fill them with ld4,
//                      i.e. one 32-bit word of each block per lane)
//        x3          = round keys (32 words read sequentially)
//        v26-v31     = S-box tables, preloaded by the caller
// Out:   v0,v1,v2,v3 = v7,v6,v5,v4 (byte-reversed per word on
//                      little-endian builds)
// Uses:  x10, w7, w8, w11, v0-v7, v12-v14, v24, flags
//----------------------------------------------------------------------
.type	_vpsm4_ex_enc_4blks,%function
.align	4
_vpsm4_ex_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b	// re-zeroed: v1 is scratch in the linear step
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v12.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v13.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v13.4s,32-2
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v0.4s,v13.4s,2
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v0.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v12.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	v13.16b,v0.16b

	// linear transformation
	ushr	v0.4s,v13.4s,32-2
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v0.4s,v13.4s,2
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v0.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v7.16b,v7.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// Emit the state in reverse word order: (v0..v3) = (v7..v4).
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
	ret
.size	_vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks

//----------------------------------------------------------------------
// _vpsm4_ex_enc_8blks -- as _vpsm4_ex_enc_4blks, on two state groups.
// In:    v4..v7   = state B0..B3 of the first group
//        v8..v11  = state B0..B3 of the second group
//        x3       = round keys; v26-v31 = S-box tables (preloaded)
// Out:   v0..v3   = v7,v6,v5,v4   (first group, reversed word order)
//        v4..v7   = v11,v10,v9,v8 (second group, reversed word order)
// Uses:  x10, w7, w8, w11, v0-v15, v24, v25, flags.  v8-v15 are written
//        here; the ECB entry point saves d8-d15 before calling.
//----------------------------------------------------------------------
.type	_vpsm4_ex_enc_8blks,%function
.align	4
_vpsm4_ex_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b
	eor	v15.16b,v10.16b,v11.16b
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b	// re-zeroed: v25 is scratch in the linear step
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b
	eor	v15.16b,v8.16b,v9.16b
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// First group out in v0..v3, second group out in v4..v7.
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v8.16b
#else
	mov	v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v9.16b
#else
	mov	v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v10.16b
#else
	mov	v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v11.16b
#else
	mov	v4.16b,v11.16b
#endif
	ret
.size	_vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks

//----------------------------------------------------------------------
// vpsm4_ex_set_encrypt_key -- calls _vpsm4_ex_set_key with w2 = 1.
// In:    x0 = user key, x1 = round-key buffer (passed through unchanged)
//----------------------------------------------------------------------
.globl	vpsm4_ex_set_encrypt_key
.type	vpsm4_ex_set_encrypt_key,%function
.align	5
vpsm4_ex_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key

//----------------------------------------------------------------------
// vpsm4_ex_set_decrypt_key -- calls _vpsm4_ex_set_key with w2 = 0, so the
// round keys are stored in the opposite order to the encrypt schedule.
// In:    x0 = user key, x1 = round-key buffer (passed through unchanged)
//----------------------------------------------------------------------
.globl	vpsm4_ex_set_decrypt_key
.type	vpsm4_ex_set_decrypt_key,%function
.align	5
vpsm4_ex_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key

//----------------------------------------------------------------------
// vpsm4_ex_encrypt -- process one 16-byte block with scalar state.
// In:    x0 = input block, x1 = output block, x2 = round keys
// Uses:  x3, x6-x15 (w views), v0-v4, v26-v31, flags
// State words live in w12..w15; each loop iteration performs four rounds
// and consumes four round-key words, for 32 rounds in total.
//----------------------------------------------------------------------
.globl	vpsm4_ex_encrypt
.type	vpsm4_ex_encrypt,%function
.align	5
vpsm4_ex_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// Output words are written in reverse order (w15 first).
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_encrypt,.-vpsm4_ex_encrypt

//----------------------------------------------------------------------
// vpsm4_ex_decrypt -- instruction-for-instruction the same as
// vpsm4_ex_encrypt; any difference in result comes from the round-key
// order in the buffer passed in x2.
// In:    x0 = input block, x1 = output block, x2 = round keys
// Uses:  x3, x6-x15 (w views), v0-v4, v26-v31, flags
//----------------------------------------------------------------------
.globl	vpsm4_ex_decrypt
.type	vpsm4_ex_decrypt,%function
.align	5
vpsm4_ex_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_decrypt,.-vpsm4_ex_decrypt

//----------------------------------------------------------------------
// vpsm4_ex_ecb_encrypt -- process whole 16-byte blocks independently.
// In:    x0 = input, x1 = output, x2 = length in bytes (shifted right by
//        4 to get a block count; any remainder below 16 bytes is ignored),
//        x3 = round keys
// Flow:  groups of 8 blocks -> one group of 4 -> tail of 1, 2 or 3 blocks.
//        The 2- and 3-block tails load into lanes 0..2 and reuse the
//        4-block routine, storing back only the lanes that were loaded.
// Stack: 80 bytes; d8-d15 and x29/x30 are saved and restored.
// NOTE(review): the block count is compared through w2, so only the low
// 32 bits of the shifted length take part -- confirm callers stay in range.
//----------------------------------------------------------------------
.globl	vpsm4_ex_ecb_encrypt
.type	vpsm4_ex_ecb_encrypt,%function
.align	5
vpsm4_ex_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	b	100f
1:	// process last 2 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f
1:	// process last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt

// vpsm4_ex_cbc_encrypt starts here and continues past the end of this
// block; only its opening statements are reproduced, unchanged, and the
// final statement below is completed by the text that follows.
.globl	vpsm4_ex_cbc_encrypt
.type	vpsm4_ex_cbc_encrypt,%function
.align	5
vpsm4_ex_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, 
#:lo12:.Lsbox_magic+80]1186cbz w5,.Ldec1187ld1 {v3.4s},[x4]1188.Lcbc_4_blocks_enc:1189cmp w2,#41190b.lt 1f1191ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#641192eor v4.16b,v4.16b,v3.16b1193#ifndef __AARCH64EB__1194rev32 v5.16b,v5.16b1195#endif1196#ifndef __AARCH64EB__1197rev32 v4.16b,v4.16b1198#endif1199#ifndef __AARCH64EB__1200rev32 v6.16b,v6.16b1201#endif1202#ifndef __AARCH64EB__1203rev32 v7.16b,v7.16b1204#endif1205mov x10,x31206mov w11,#81207mov w12,v4.s[0]1208mov w13,v4.s[1]1209mov w14,v4.s[2]1210mov w15,v4.s[3]121110:1212ldp w7,w8,[x10],81213// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1214eor w6,w14,w151215eor w9,w7,w131216eor w6,w6,w91217mov v3.s[0],w61218// optimize sbox using AESE instruction1219tbl v0.16b, {v3.16b}, v26.16b1220ushr v2.16b, v0.16b, 41221and v0.16b, v0.16b, v31.16b1222tbl v0.16b, {v28.16b}, v0.16b1223tbl v2.16b, {v27.16b}, v2.16b1224eor v0.16b, v0.16b, v2.16b1225eor v1.16b, v1.16b, v1.16b1226aese v0.16b,v1.16b1227ushr v2.16b, v0.16b, 41228and v0.16b, v0.16b, v31.16b1229tbl v0.16b, {v30.16b}, v0.16b1230tbl v2.16b, {v29.16b}, v2.16b1231eor v0.16b, v0.16b, v2.16b12321233mov w7,v0.s[0]1234eor w6,w7,w7,ror #32-21235eor w6,w6,w7,ror #32-101236eor w6,w6,w7,ror #32-181237eor w6,w6,w7,ror #32-241238eor w12,w12,w61239// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1240eor w6,w14,w151241eor w9,w12,w81242eor w6,w6,w91243mov v3.s[0],w61244// optimize sbox using AESE instruction1245tbl v0.16b, {v3.16b}, v26.16b1246ushr v2.16b, v0.16b, 41247and v0.16b, v0.16b, v31.16b1248tbl v0.16b, {v28.16b}, v0.16b1249tbl v2.16b, {v27.16b}, v2.16b1250eor v0.16b, v0.16b, v2.16b1251eor v1.16b, v1.16b, v1.16b1252aese v0.16b,v1.16b1253ushr v2.16b, v0.16b, 41254and v0.16b, v0.16b, v31.16b1255tbl v0.16b, {v30.16b}, v0.16b1256tbl v2.16b, {v29.16b}, v2.16b1257eor v0.16b, v0.16b, v2.16b12581259mov w7,v0.s[0]1260eor w6,w7,w7,ror #32-21261eor w6,w6,w7,ror #32-101262eor w6,w6,w7,ror #32-181263eor w6,w6,w7,ror #32-241264ldp w7,w8,[x10],81265eor w13,w13,w61266// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1267eor w6,w12,w131268eor 
w9,w7,w151269eor w6,w6,w91270mov v3.s[0],w61271// optimize sbox using AESE instruction1272tbl v0.16b, {v3.16b}, v26.16b1273ushr v2.16b, v0.16b, 41274and v0.16b, v0.16b, v31.16b1275tbl v0.16b, {v28.16b}, v0.16b1276tbl v2.16b, {v27.16b}, v2.16b1277eor v0.16b, v0.16b, v2.16b1278eor v1.16b, v1.16b, v1.16b1279aese v0.16b,v1.16b1280ushr v2.16b, v0.16b, 41281and v0.16b, v0.16b, v31.16b1282tbl v0.16b, {v30.16b}, v0.16b1283tbl v2.16b, {v29.16b}, v2.16b1284eor v0.16b, v0.16b, v2.16b12851286mov w7,v0.s[0]1287eor w6,w7,w7,ror #32-21288eor w6,w6,w7,ror #32-101289eor w6,w6,w7,ror #32-181290eor w6,w6,w7,ror #32-241291eor w14,w14,w61292// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1293eor w6,w12,w131294eor w9,w14,w81295eor w6,w6,w91296mov v3.s[0],w61297// optimize sbox using AESE instruction1298tbl v0.16b, {v3.16b}, v26.16b1299ushr v2.16b, v0.16b, 41300and v0.16b, v0.16b, v31.16b1301tbl v0.16b, {v28.16b}, v0.16b1302tbl v2.16b, {v27.16b}, v2.16b1303eor v0.16b, v0.16b, v2.16b1304eor v1.16b, v1.16b, v1.16b1305aese v0.16b,v1.16b1306ushr v2.16b, v0.16b, 41307and v0.16b, v0.16b, v31.16b1308tbl v0.16b, {v30.16b}, v0.16b1309tbl v2.16b, {v29.16b}, v2.16b1310eor v0.16b, v0.16b, v2.16b13111312mov w7,v0.s[0]1313eor w6,w7,w7,ror #32-21314eor w6,w6,w7,ror #32-101315eor w6,w6,w7,ror #32-181316eor w6,w6,w7,ror #32-241317eor w15,w15,w61318subs w11,w11,#11319b.ne 10b1320mov v4.s[0],w151321mov v4.s[1],w141322mov v4.s[2],w131323mov v4.s[3],w121324eor v5.16b,v5.16b,v4.16b1325mov x10,x31326mov w11,#81327mov w12,v5.s[0]1328mov w13,v5.s[1]1329mov w14,v5.s[2]1330mov w15,v5.s[3]133110:1332ldp w7,w8,[x10],81333// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1334eor w6,w14,w151335eor w9,w7,w131336eor w6,w6,w91337mov v3.s[0],w61338// optimize sbox using AESE instruction1339tbl v0.16b, {v3.16b}, v26.16b1340ushr v2.16b, v0.16b, 41341and v0.16b, v0.16b, v31.16b1342tbl v0.16b, {v28.16b}, v0.16b1343tbl v2.16b, {v27.16b}, v2.16b1344eor v0.16b, v0.16b, v2.16b1345eor v1.16b, v1.16b, v1.16b1346aese v0.16b,v1.16b1347ushr v2.16b, v0.16b, 41348and 
v0.16b, v0.16b, v31.16b1349tbl v0.16b, {v30.16b}, v0.16b1350tbl v2.16b, {v29.16b}, v2.16b1351eor v0.16b, v0.16b, v2.16b13521353mov w7,v0.s[0]1354eor w6,w7,w7,ror #32-21355eor w6,w6,w7,ror #32-101356eor w6,w6,w7,ror #32-181357eor w6,w6,w7,ror #32-241358eor w12,w12,w61359// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1360eor w6,w14,w151361eor w9,w12,w81362eor w6,w6,w91363mov v3.s[0],w61364// optimize sbox using AESE instruction1365tbl v0.16b, {v3.16b}, v26.16b1366ushr v2.16b, v0.16b, 41367and v0.16b, v0.16b, v31.16b1368tbl v0.16b, {v28.16b}, v0.16b1369tbl v2.16b, {v27.16b}, v2.16b1370eor v0.16b, v0.16b, v2.16b1371eor v1.16b, v1.16b, v1.16b1372aese v0.16b,v1.16b1373ushr v2.16b, v0.16b, 41374and v0.16b, v0.16b, v31.16b1375tbl v0.16b, {v30.16b}, v0.16b1376tbl v2.16b, {v29.16b}, v2.16b1377eor v0.16b, v0.16b, v2.16b13781379mov w7,v0.s[0]1380eor w6,w7,w7,ror #32-21381eor w6,w6,w7,ror #32-101382eor w6,w6,w7,ror #32-181383eor w6,w6,w7,ror #32-241384ldp w7,w8,[x10],81385eor w13,w13,w61386// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1387eor w6,w12,w131388eor w9,w7,w151389eor w6,w6,w91390mov v3.s[0],w61391// optimize sbox using AESE instruction1392tbl v0.16b, {v3.16b}, v26.16b1393ushr v2.16b, v0.16b, 41394and v0.16b, v0.16b, v31.16b1395tbl v0.16b, {v28.16b}, v0.16b1396tbl v2.16b, {v27.16b}, v2.16b1397eor v0.16b, v0.16b, v2.16b1398eor v1.16b, v1.16b, v1.16b1399aese v0.16b,v1.16b1400ushr v2.16b, v0.16b, 41401and v0.16b, v0.16b, v31.16b1402tbl v0.16b, {v30.16b}, v0.16b1403tbl v2.16b, {v29.16b}, v2.16b1404eor v0.16b, v0.16b, v2.16b14051406mov w7,v0.s[0]1407eor w6,w7,w7,ror #32-21408eor w6,w6,w7,ror #32-101409eor w6,w6,w7,ror #32-181410eor w6,w6,w7,ror #32-241411eor w14,w14,w61412// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1413eor w6,w12,w131414eor w9,w14,w81415eor w6,w6,w91416mov v3.s[0],w61417// optimize sbox using AESE instruction1418tbl v0.16b, {v3.16b}, v26.16b1419ushr v2.16b, v0.16b, 41420and v0.16b, v0.16b, v31.16b1421tbl v0.16b, {v28.16b}, v0.16b1422tbl v2.16b, {v27.16b}, v2.16b1423eor v0.16b, v0.16b, 
v2.16b1424eor v1.16b, v1.16b, v1.16b1425aese v0.16b,v1.16b1426ushr v2.16b, v0.16b, 41427and v0.16b, v0.16b, v31.16b1428tbl v0.16b, {v30.16b}, v0.16b1429tbl v2.16b, {v29.16b}, v2.16b1430eor v0.16b, v0.16b, v2.16b14311432mov w7,v0.s[0]1433eor w6,w7,w7,ror #32-21434eor w6,w6,w7,ror #32-101435eor w6,w6,w7,ror #32-181436eor w6,w6,w7,ror #32-241437eor w15,w15,w61438subs w11,w11,#11439b.ne 10b1440mov v5.s[0],w151441mov v5.s[1],w141442mov v5.s[2],w131443mov v5.s[3],w121444#ifndef __AARCH64EB__1445rev32 v4.16b,v4.16b1446#endif1447eor v6.16b,v6.16b,v5.16b1448mov x10,x31449mov w11,#81450mov w12,v6.s[0]1451mov w13,v6.s[1]1452mov w14,v6.s[2]1453mov w15,v6.s[3]145410:1455ldp w7,w8,[x10],81456// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1457eor w6,w14,w151458eor w9,w7,w131459eor w6,w6,w91460mov v3.s[0],w61461// optimize sbox using AESE instruction1462tbl v0.16b, {v3.16b}, v26.16b1463ushr v2.16b, v0.16b, 41464and v0.16b, v0.16b, v31.16b1465tbl v0.16b, {v28.16b}, v0.16b1466tbl v2.16b, {v27.16b}, v2.16b1467eor v0.16b, v0.16b, v2.16b1468eor v1.16b, v1.16b, v1.16b1469aese v0.16b,v1.16b1470ushr v2.16b, v0.16b, 41471and v0.16b, v0.16b, v31.16b1472tbl v0.16b, {v30.16b}, v0.16b1473tbl v2.16b, {v29.16b}, v2.16b1474eor v0.16b, v0.16b, v2.16b14751476mov w7,v0.s[0]1477eor w6,w7,w7,ror #32-21478eor w6,w6,w7,ror #32-101479eor w6,w6,w7,ror #32-181480eor w6,w6,w7,ror #32-241481eor w12,w12,w61482// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1483eor w6,w14,w151484eor w9,w12,w81485eor w6,w6,w91486mov v3.s[0],w61487// optimize sbox using AESE instruction1488tbl v0.16b, {v3.16b}, v26.16b1489ushr v2.16b, v0.16b, 41490and v0.16b, v0.16b, v31.16b1491tbl v0.16b, {v28.16b}, v0.16b1492tbl v2.16b, {v27.16b}, v2.16b1493eor v0.16b, v0.16b, v2.16b1494eor v1.16b, v1.16b, v1.16b1495aese v0.16b,v1.16b1496ushr v2.16b, v0.16b, 41497and v0.16b, v0.16b, v31.16b1498tbl v0.16b, {v30.16b}, v0.16b1499tbl v2.16b, {v29.16b}, v2.16b1500eor v0.16b, v0.16b, v2.16b15011502mov w7,v0.s[0]1503eor w6,w7,w7,ror #32-21504eor w6,w6,w7,ror #32-101505eor 
w6,w6,w7,ror #32-181506eor w6,w6,w7,ror #32-241507ldp w7,w8,[x10],81508eor w13,w13,w61509// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1510eor w6,w12,w131511eor w9,w7,w151512eor w6,w6,w91513mov v3.s[0],w61514// optimize sbox using AESE instruction1515tbl v0.16b, {v3.16b}, v26.16b1516ushr v2.16b, v0.16b, 41517and v0.16b, v0.16b, v31.16b1518tbl v0.16b, {v28.16b}, v0.16b1519tbl v2.16b, {v27.16b}, v2.16b1520eor v0.16b, v0.16b, v2.16b1521eor v1.16b, v1.16b, v1.16b1522aese v0.16b,v1.16b1523ushr v2.16b, v0.16b, 41524and v0.16b, v0.16b, v31.16b1525tbl v0.16b, {v30.16b}, v0.16b1526tbl v2.16b, {v29.16b}, v2.16b1527eor v0.16b, v0.16b, v2.16b15281529mov w7,v0.s[0]1530eor w6,w7,w7,ror #32-21531eor w6,w6,w7,ror #32-101532eor w6,w6,w7,ror #32-181533eor w6,w6,w7,ror #32-241534eor w14,w14,w61535// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1536eor w6,w12,w131537eor w9,w14,w81538eor w6,w6,w91539mov v3.s[0],w61540// optimize sbox using AESE instruction1541tbl v0.16b, {v3.16b}, v26.16b1542ushr v2.16b, v0.16b, 41543and v0.16b, v0.16b, v31.16b1544tbl v0.16b, {v28.16b}, v0.16b1545tbl v2.16b, {v27.16b}, v2.16b1546eor v0.16b, v0.16b, v2.16b1547eor v1.16b, v1.16b, v1.16b1548aese v0.16b,v1.16b1549ushr v2.16b, v0.16b, 41550and v0.16b, v0.16b, v31.16b1551tbl v0.16b, {v30.16b}, v0.16b1552tbl v2.16b, {v29.16b}, v2.16b1553eor v0.16b, v0.16b, v2.16b15541555mov w7,v0.s[0]1556eor w6,w7,w7,ror #32-21557eor w6,w6,w7,ror #32-101558eor w6,w6,w7,ror #32-181559eor w6,w6,w7,ror #32-241560eor w15,w15,w61561subs w11,w11,#11562b.ne 10b1563mov v6.s[0],w151564mov v6.s[1],w141565mov v6.s[2],w131566mov v6.s[3],w121567#ifndef __AARCH64EB__1568rev32 v5.16b,v5.16b1569#endif1570eor v7.16b,v7.16b,v6.16b1571mov x10,x31572mov w11,#81573mov w12,v7.s[0]1574mov w13,v7.s[1]1575mov w14,v7.s[2]1576mov w15,v7.s[3]157710:1578ldp w7,w8,[x10],81579// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1580eor w6,w14,w151581eor w9,w7,w131582eor w6,w6,w91583mov v3.s[0],w61584// optimize sbox using AESE instruction1585tbl v0.16b, {v3.16b}, v26.16b1586ushr v2.16b, v0.16b, 41587and 
v0.16b, v0.16b, v31.16b1588tbl v0.16b, {v28.16b}, v0.16b1589tbl v2.16b, {v27.16b}, v2.16b1590eor v0.16b, v0.16b, v2.16b1591eor v1.16b, v1.16b, v1.16b1592aese v0.16b,v1.16b1593ushr v2.16b, v0.16b, 41594and v0.16b, v0.16b, v31.16b1595tbl v0.16b, {v30.16b}, v0.16b1596tbl v2.16b, {v29.16b}, v2.16b1597eor v0.16b, v0.16b, v2.16b15981599mov w7,v0.s[0]1600eor w6,w7,w7,ror #32-21601eor w6,w6,w7,ror #32-101602eor w6,w6,w7,ror #32-181603eor w6,w6,w7,ror #32-241604eor w12,w12,w61605// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1606eor w6,w14,w151607eor w9,w12,w81608eor w6,w6,w91609mov v3.s[0],w61610// optimize sbox using AESE instruction1611tbl v0.16b, {v3.16b}, v26.16b1612ushr v2.16b, v0.16b, 41613and v0.16b, v0.16b, v31.16b1614tbl v0.16b, {v28.16b}, v0.16b1615tbl v2.16b, {v27.16b}, v2.16b1616eor v0.16b, v0.16b, v2.16b1617eor v1.16b, v1.16b, v1.16b1618aese v0.16b,v1.16b1619ushr v2.16b, v0.16b, 41620and v0.16b, v0.16b, v31.16b1621tbl v0.16b, {v30.16b}, v0.16b1622tbl v2.16b, {v29.16b}, v2.16b1623eor v0.16b, v0.16b, v2.16b16241625mov w7,v0.s[0]1626eor w6,w7,w7,ror #32-21627eor w6,w6,w7,ror #32-101628eor w6,w6,w7,ror #32-181629eor w6,w6,w7,ror #32-241630ldp w7,w8,[x10],81631eor w13,w13,w61632// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1633eor w6,w12,w131634eor w9,w7,w151635eor w6,w6,w91636mov v3.s[0],w61637// optimize sbox using AESE instruction1638tbl v0.16b, {v3.16b}, v26.16b1639ushr v2.16b, v0.16b, 41640and v0.16b, v0.16b, v31.16b1641tbl v0.16b, {v28.16b}, v0.16b1642tbl v2.16b, {v27.16b}, v2.16b1643eor v0.16b, v0.16b, v2.16b1644eor v1.16b, v1.16b, v1.16b1645aese v0.16b,v1.16b1646ushr v2.16b, v0.16b, 41647and v0.16b, v0.16b, v31.16b1648tbl v0.16b, {v30.16b}, v0.16b1649tbl v2.16b, {v29.16b}, v2.16b1650eor v0.16b, v0.16b, v2.16b16511652mov w7,v0.s[0]1653eor w6,w7,w7,ror #32-21654eor w6,w6,w7,ror #32-101655eor w6,w6,w7,ror #32-181656eor w6,w6,w7,ror #32-241657eor w14,w14,w61658// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1659eor w6,w12,w131660eor w9,w14,w81661eor w6,w6,w91662mov v3.s[0],w61663// optimize sbox using 
AESE instruction1664tbl v0.16b, {v3.16b}, v26.16b1665ushr v2.16b, v0.16b, 41666and v0.16b, v0.16b, v31.16b1667tbl v0.16b, {v28.16b}, v0.16b1668tbl v2.16b, {v27.16b}, v2.16b1669eor v0.16b, v0.16b, v2.16b1670eor v1.16b, v1.16b, v1.16b1671aese v0.16b,v1.16b1672ushr v2.16b, v0.16b, 41673and v0.16b, v0.16b, v31.16b1674tbl v0.16b, {v30.16b}, v0.16b1675tbl v2.16b, {v29.16b}, v2.16b1676eor v0.16b, v0.16b, v2.16b16771678mov w7,v0.s[0]1679eor w6,w7,w7,ror #32-21680eor w6,w6,w7,ror #32-101681eor w6,w6,w7,ror #32-181682eor w6,w6,w7,ror #32-241683eor w15,w15,w61684subs w11,w11,#11685b.ne 10b1686mov v7.s[0],w151687mov v7.s[1],w141688mov v7.s[2],w131689mov v7.s[3],w121690#ifndef __AARCH64EB__1691rev32 v6.16b,v6.16b1692#endif1693#ifndef __AARCH64EB__1694rev32 v7.16b,v7.16b1695#endif1696orr v3.16b,v7.16b,v7.16b1697st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#641698subs w2,w2,#41699b.ne .Lcbc_4_blocks_enc1700b 2f17011:1702subs w2,w2,#11703b.lt 2f1704ld1 {v4.4s},[x0],#161705eor v3.16b,v3.16b,v4.16b1706#ifndef __AARCH64EB__1707rev32 v3.16b,v3.16b1708#endif1709mov x10,x31710mov w11,#81711mov w12,v3.s[0]1712mov w13,v3.s[1]1713mov w14,v3.s[2]1714mov w15,v3.s[3]171510:1716ldp w7,w8,[x10],81717// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1718eor w6,w14,w151719eor w9,w7,w131720eor w6,w6,w91721mov v3.s[0],w61722// optimize sbox using AESE instruction1723tbl v0.16b, {v3.16b}, v26.16b1724ushr v2.16b, v0.16b, 41725and v0.16b, v0.16b, v31.16b1726tbl v0.16b, {v28.16b}, v0.16b1727tbl v2.16b, {v27.16b}, v2.16b1728eor v0.16b, v0.16b, v2.16b1729eor v1.16b, v1.16b, v1.16b1730aese v0.16b,v1.16b1731ushr v2.16b, v0.16b, 41732and v0.16b, v0.16b, v31.16b1733tbl v0.16b, {v30.16b}, v0.16b1734tbl v2.16b, {v29.16b}, v2.16b1735eor v0.16b, v0.16b, v2.16b17361737mov w7,v0.s[0]1738eor w6,w7,w7,ror #32-21739eor w6,w6,w7,ror #32-101740eor w6,w6,w7,ror #32-181741eor w6,w6,w7,ror #32-241742eor w12,w12,w61743// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1744eor w6,w14,w151745eor w9,w12,w81746eor w6,w6,w91747mov v3.s[0],w61748// optimize sbox using AESE 
instruction1749tbl v0.16b, {v3.16b}, v26.16b1750ushr v2.16b, v0.16b, 41751and v0.16b, v0.16b, v31.16b1752tbl v0.16b, {v28.16b}, v0.16b1753tbl v2.16b, {v27.16b}, v2.16b1754eor v0.16b, v0.16b, v2.16b1755eor v1.16b, v1.16b, v1.16b1756aese v0.16b,v1.16b1757ushr v2.16b, v0.16b, 41758and v0.16b, v0.16b, v31.16b1759tbl v0.16b, {v30.16b}, v0.16b1760tbl v2.16b, {v29.16b}, v2.16b1761eor v0.16b, v0.16b, v2.16b17621763mov w7,v0.s[0]1764eor w6,w7,w7,ror #32-21765eor w6,w6,w7,ror #32-101766eor w6,w6,w7,ror #32-181767eor w6,w6,w7,ror #32-241768ldp w7,w8,[x10],81769eor w13,w13,w61770// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1771eor w6,w12,w131772eor w9,w7,w151773eor w6,w6,w91774mov v3.s[0],w61775// optimize sbox using AESE instruction1776tbl v0.16b, {v3.16b}, v26.16b1777ushr v2.16b, v0.16b, 41778and v0.16b, v0.16b, v31.16b1779tbl v0.16b, {v28.16b}, v0.16b1780tbl v2.16b, {v27.16b}, v2.16b1781eor v0.16b, v0.16b, v2.16b1782eor v1.16b, v1.16b, v1.16b1783aese v0.16b,v1.16b1784ushr v2.16b, v0.16b, 41785and v0.16b, v0.16b, v31.16b1786tbl v0.16b, {v30.16b}, v0.16b1787tbl v2.16b, {v29.16b}, v2.16b1788eor v0.16b, v0.16b, v2.16b17891790mov w7,v0.s[0]1791eor w6,w7,w7,ror #32-21792eor w6,w6,w7,ror #32-101793eor w6,w6,w7,ror #32-181794eor w6,w6,w7,ror #32-241795eor w14,w14,w61796// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1797eor w6,w12,w131798eor w9,w14,w81799eor w6,w6,w91800mov v3.s[0],w61801// optimize sbox using AESE instruction1802tbl v0.16b, {v3.16b}, v26.16b1803ushr v2.16b, v0.16b, 41804and v0.16b, v0.16b, v31.16b1805tbl v0.16b, {v28.16b}, v0.16b1806tbl v2.16b, {v27.16b}, v2.16b1807eor v0.16b, v0.16b, v2.16b1808eor v1.16b, v1.16b, v1.16b1809aese v0.16b,v1.16b1810ushr v2.16b, v0.16b, 41811and v0.16b, v0.16b, v31.16b1812tbl v0.16b, {v30.16b}, v0.16b1813tbl v2.16b, {v29.16b}, v2.16b1814eor v0.16b, v0.16b, v2.16b18151816mov w7,v0.s[0]1817eor w6,w7,w7,ror #32-21818eor w6,w6,w7,ror #32-101819eor w6,w6,w7,ror #32-181820eor w6,w6,w7,ror #32-241821eor w15,w15,w61822subs w11,w11,#11823b.ne 10b1824mov 
v3.s[0],w151825mov v3.s[1],w141826mov v3.s[2],w131827mov v3.s[3],w121828#ifndef __AARCH64EB__1829rev32 v3.16b,v3.16b1830#endif1831st1 {v3.4s},[x1],#161832b 1b18332:1834// save back IV1835st1 {v3.4s},[x4]1836ret18371838.Ldec:1839// decryption mode starts1840AARCH64_SIGN_LINK_REGISTER1841stp d8,d9,[sp,#-80]!1842stp d10,d11,[sp,#16]1843stp d12,d13,[sp,#32]1844stp d14,d15,[sp,#48]1845stp x29,x30,[sp,#64]1846.Lcbc_8_blocks_dec:1847cmp w2,#81848b.lt 1f1849ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]1850add x10,x0,#641851ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]1852#ifndef __AARCH64EB__1853rev32 v4.16b,v4.16b1854#endif1855#ifndef __AARCH64EB__1856rev32 v5.16b,v5.16b1857#endif1858#ifndef __AARCH64EB__1859rev32 v6.16b,v6.16b1860#endif1861#ifndef __AARCH64EB__1862rev32 v7.16b,v7.16b1863#endif1864#ifndef __AARCH64EB__1865rev32 v8.16b,v8.16b1866#endif1867#ifndef __AARCH64EB__1868rev32 v9.16b,v9.16b1869#endif1870#ifndef __AARCH64EB__1871rev32 v10.16b,v10.16b1872#endif1873#ifndef __AARCH64EB__1874rev32 v11.16b,v11.16b1875#endif1876bl _vpsm4_ex_enc_8blks1877zip1 v8.4s,v0.4s,v1.4s1878zip2 v9.4s,v0.4s,v1.4s1879zip1 v10.4s,v2.4s,v3.4s1880zip2 v11.4s,v2.4s,v3.4s1881zip1 v0.2d,v8.2d,v10.2d1882zip2 v1.2d,v8.2d,v10.2d1883zip1 v2.2d,v9.2d,v11.2d1884zip2 v3.2d,v9.2d,v11.2d1885zip1 v8.4s,v4.4s,v5.4s1886zip2 v9.4s,v4.4s,v5.4s1887zip1 v10.4s,v6.4s,v7.4s1888zip2 v11.4s,v6.4s,v7.4s1889zip1 v4.2d,v8.2d,v10.2d1890zip2 v5.2d,v8.2d,v10.2d1891zip1 v6.2d,v9.2d,v11.2d1892zip2 v7.2d,v9.2d,v11.2d1893ld1 {v15.4s},[x4]1894ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#641895// note ivec1 and vtmpx[3] are reusing the same register1896// care needs to be taken to avoid conflict1897eor v0.16b,v0.16b,v15.16b1898ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#641899eor v1.16b,v1.16b,v8.16b1900eor v2.16b,v2.16b,v9.16b1901eor v3.16b,v3.16b,v10.16b1902// save back IV1903st1 {v15.4s}, [x4]1904eor v4.16b,v4.16b,v11.16b1905eor v5.16b,v5.16b,v12.16b1906eor v6.16b,v6.16b,v13.16b1907eor v7.16b,v7.16b,v14.16b1908st1 
{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#641909st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#641910subs w2,w2,#81911b.gt .Lcbc_8_blocks_dec1912b.eq 100f19131:1914ld1 {v15.4s},[x4]1915.Lcbc_4_blocks_dec:1916cmp w2,#41917b.lt 1f1918ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]1919#ifndef __AARCH64EB__1920rev32 v4.16b,v4.16b1921#endif1922#ifndef __AARCH64EB__1923rev32 v5.16b,v5.16b1924#endif1925#ifndef __AARCH64EB__1926rev32 v6.16b,v6.16b1927#endif1928#ifndef __AARCH64EB__1929rev32 v7.16b,v7.16b1930#endif1931bl _vpsm4_ex_enc_4blks1932ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#641933zip1 v8.4s,v0.4s,v1.4s1934zip2 v9.4s,v0.4s,v1.4s1935zip1 v10.4s,v2.4s,v3.4s1936zip2 v11.4s,v2.4s,v3.4s1937zip1 v0.2d,v8.2d,v10.2d1938zip2 v1.2d,v8.2d,v10.2d1939zip1 v2.2d,v9.2d,v11.2d1940zip2 v3.2d,v9.2d,v11.2d1941eor v0.16b,v0.16b,v15.16b1942eor v1.16b,v1.16b,v4.16b1943orr v15.16b,v7.16b,v7.16b1944eor v2.16b,v2.16b,v5.16b1945eor v3.16b,v3.16b,v6.16b1946st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#641947subs w2,w2,#41948b.gt .Lcbc_4_blocks_dec1949// save back IV1950st1 {v7.4s}, [x4]1951b 100f19521: // last block1953subs w2,w2,#11954b.lt 100f1955b.gt 1f1956ld1 {v4.4s},[x0],#161957// save back IV1958st1 {v4.4s}, [x4]1959#ifndef __AARCH64EB__1960rev32 v8.16b,v4.16b1961#else1962mov v8.16b,v4.16b1963#endif1964mov x10,x31965mov w11,#81966mov w12,v8.s[0]1967mov w13,v8.s[1]1968mov w14,v8.s[2]1969mov w15,v8.s[3]197010:1971ldp w7,w8,[x10],81972// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1973eor w6,w14,w151974eor w9,w7,w131975eor w6,w6,w91976mov v3.s[0],w61977// optimize sbox using AESE instruction1978tbl v0.16b, {v3.16b}, v26.16b1979ushr v2.16b, v0.16b, 41980and v0.16b, v0.16b, v31.16b1981tbl v0.16b, {v28.16b}, v0.16b1982tbl v2.16b, {v27.16b}, v2.16b1983eor v0.16b, v0.16b, v2.16b1984eor v1.16b, v1.16b, v1.16b1985aese v0.16b,v1.16b1986ushr v2.16b, v0.16b, 41987and v0.16b, v0.16b, v31.16b1988tbl v0.16b, {v30.16b}, v0.16b1989tbl v2.16b, {v29.16b}, v2.16b1990eor v0.16b, v0.16b, v2.16b19911992mov w7,v0.s[0]1993eor w6,w7,w7,ror #32-21994eor w6,w6,w7,ror 
#32-101995eor w6,w6,w7,ror #32-181996eor w6,w6,w7,ror #32-241997eor w12,w12,w61998// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1999eor w6,w14,w152000eor w9,w12,w82001eor w6,w6,w92002mov v3.s[0],w62003// optimize sbox using AESE instruction2004tbl v0.16b, {v3.16b}, v26.16b2005ushr v2.16b, v0.16b, 42006and v0.16b, v0.16b, v31.16b2007tbl v0.16b, {v28.16b}, v0.16b2008tbl v2.16b, {v27.16b}, v2.16b2009eor v0.16b, v0.16b, v2.16b2010eor v1.16b, v1.16b, v1.16b2011aese v0.16b,v1.16b2012ushr v2.16b, v0.16b, 42013and v0.16b, v0.16b, v31.16b2014tbl v0.16b, {v30.16b}, v0.16b2015tbl v2.16b, {v29.16b}, v2.16b2016eor v0.16b, v0.16b, v2.16b20172018mov w7,v0.s[0]2019eor w6,w7,w7,ror #32-22020eor w6,w6,w7,ror #32-102021eor w6,w6,w7,ror #32-182022eor w6,w6,w7,ror #32-242023ldp w7,w8,[x10],82024eor w13,w13,w62025// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)2026eor w6,w12,w132027eor w9,w7,w152028eor w6,w6,w92029mov v3.s[0],w62030// optimize sbox using AESE instruction2031tbl v0.16b, {v3.16b}, v26.16b2032ushr v2.16b, v0.16b, 42033and v0.16b, v0.16b, v31.16b2034tbl v0.16b, {v28.16b}, v0.16b2035tbl v2.16b, {v27.16b}, v2.16b2036eor v0.16b, v0.16b, v2.16b2037eor v1.16b, v1.16b, v1.16b2038aese v0.16b,v1.16b2039ushr v2.16b, v0.16b, 42040and v0.16b, v0.16b, v31.16b2041tbl v0.16b, {v30.16b}, v0.16b2042tbl v2.16b, {v29.16b}, v2.16b2043eor v0.16b, v0.16b, v2.16b20442045mov w7,v0.s[0]2046eor w6,w7,w7,ror #32-22047eor w6,w6,w7,ror #32-102048eor w6,w6,w7,ror #32-182049eor w6,w6,w7,ror #32-242050eor w14,w14,w62051// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)2052eor w6,w12,w132053eor w9,w14,w82054eor w6,w6,w92055mov v3.s[0],w62056// optimize sbox using AESE instruction2057tbl v0.16b, {v3.16b}, v26.16b2058ushr v2.16b, v0.16b, 42059and v0.16b, v0.16b, v31.16b2060tbl v0.16b, {v28.16b}, v0.16b2061tbl v2.16b, {v27.16b}, v2.16b2062eor v0.16b, v0.16b, v2.16b2063eor v1.16b, v1.16b, v1.16b2064aese v0.16b,v1.16b2065ushr v2.16b, v0.16b, 42066and v0.16b, v0.16b, v31.16b2067tbl v0.16b, {v30.16b}, v0.16b2068tbl v2.16b, {v29.16b}, v2.16b2069eor v0.16b, 
v0.16b, v2.16b20702071mov w7,v0.s[0]2072eor w6,w7,w7,ror #32-22073eor w6,w6,w7,ror #32-102074eor w6,w6,w7,ror #32-182075eor w6,w6,w7,ror #32-242076eor w15,w15,w62077subs w11,w11,#12078b.ne 10b2079mov v8.s[0],w152080mov v8.s[1],w142081mov v8.s[2],w132082mov v8.s[3],w122083#ifndef __AARCH64EB__2084rev32 v8.16b,v8.16b2085#endif2086eor v8.16b,v8.16b,v15.16b2087st1 {v8.4s},[x1],#162088b 100f20891: // last two blocks2090ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]2091add x10,x0,#162092ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#162093subs w2,w2,12094b.gt 1f2095#ifndef __AARCH64EB__2096rev32 v4.16b,v4.16b2097#endif2098#ifndef __AARCH64EB__2099rev32 v5.16b,v5.16b2100#endif2101#ifndef __AARCH64EB__2102rev32 v6.16b,v6.16b2103#endif2104#ifndef __AARCH64EB__2105rev32 v7.16b,v7.16b2106#endif2107bl _vpsm4_ex_enc_4blks2108ld1 {v4.4s,v5.4s},[x0],#322109zip1 v8.4s,v0.4s,v1.4s2110zip2 v9.4s,v0.4s,v1.4s2111zip1 v10.4s,v2.4s,v3.4s2112zip2 v11.4s,v2.4s,v3.4s2113zip1 v0.2d,v8.2d,v10.2d2114zip2 v1.2d,v8.2d,v10.2d2115zip1 v2.2d,v9.2d,v11.2d2116zip2 v3.2d,v9.2d,v11.2d2117eor v0.16b,v0.16b,v15.16b2118eor v1.16b,v1.16b,v4.16b2119st1 {v0.4s,v1.4s},[x1],#322120// save back IV2121st1 {v5.4s}, [x4]2122b 100f21231: // last 3 blocks2124ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]2125#ifndef __AARCH64EB__2126rev32 v4.16b,v4.16b2127#endif2128#ifndef __AARCH64EB__2129rev32 v5.16b,v5.16b2130#endif2131#ifndef __AARCH64EB__2132rev32 v6.16b,v6.16b2133#endif2134#ifndef __AARCH64EB__2135rev32 v7.16b,v7.16b2136#endif2137bl _vpsm4_ex_enc_4blks2138ld1 {v4.4s,v5.4s,v6.4s},[x0],#482139zip1 v8.4s,v0.4s,v1.4s2140zip2 v9.4s,v0.4s,v1.4s2141zip1 v10.4s,v2.4s,v3.4s2142zip2 v11.4s,v2.4s,v3.4s2143zip1 v0.2d,v8.2d,v10.2d2144zip2 v1.2d,v8.2d,v10.2d2145zip1 v2.2d,v9.2d,v11.2d2146zip2 v3.2d,v9.2d,v11.2d2147eor v0.16b,v0.16b,v15.16b2148eor v1.16b,v1.16b,v4.16b2149eor v2.16b,v2.16b,v5.16b2150st1 {v0.4s,v1.4s,v2.4s},[x1],#482151// save back IV2152st1 {v6.4s}, [x4]2153100:2154ldp d10,d11,[sp,#16]2155ldp d12,d13,[sp,#32]2156ldp d14,d15,[sp,#48]2157ldp 
x29,x30,[sp,#64]2158ldp d8,d9,[sp],#802159AARCH64_VALIDATE_LINK_REGISTER2160ret2161.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt2162
//
// vpsm4_ex_ctr32_encrypt_blocks
//
// SM4 in counter mode with a 32-bit counter: the 16-byte counter block is
// enciphered and the result is XORed into the input to produce the output.
//
// Register usage, as visible in this routine:
//   x0  input pointer  (16 bytes read per block)
//   x1  output pointer (16 bytes written per block)
//   w2  number of 16-byte blocks; only the low 32 bits of x2 are examined
//   x3  round-key pointer; one block consumes 32 words (8 iterations x 4)
//   x4  pointer to the 16-byte counter block; read once on entry and
//       never written back by this routine
// NOTE(review): presumably matches the C prototype
//   void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in,
//       unsigned char *out, size_t blocks, const void *key,
//       const unsigned char ivec[16]);
// confirm against the declaring header.
//
// Only the last 32-bit word of the counter block is incremented (in w5,
// so it wraps modulo 2^32); the first three words stay fixed.
//
// Constants held for the whole call: v26-v31 = the six 16-byte entries of
// .Lsbox_magic (v31 is the 0x0f nibble mask).
// Scratch: w5-w15, v0-v7; v8-v15 are used only on the multi-block path,
// where d8-d15 and x29/x30 are saved in an 80-byte stack frame.
//
.globl	vpsm4_ex_ctr32_encrypt_blocks
.type	vpsm4_ex_ctr32_encrypt_blocks,%function
.align	5
vpsm4_ex_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.4s},[x4]			// v3 = counter block
#ifndef __AARCH64EB__
	// little-endian host: byte-swap each word so the words can be
	// handled as integers (the last one is incremented below)
	rev32	v3.16b,v3.16b
#endif
	// load the S-box helper constants into v26-v31
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
	cmp	w2,#1
	b.ne	1f				// anything but exactly one block
	// fast processing for one single block without
	// context saving overhead
	mov	x10,x3				// x10 = running round-key pointer
	mov	w11,#8				// 8 iterations x 4 rounds
	mov	w12,v3.s[0]			// B0
	mov	w13,v3.s[1]			// B1
	mov	w14,v3.s[2]			// B2
	mov	w15,v3.s[3]			// B3
10:
	ldp	w7,w8,[x10],8			// w7 = RK0, w8 = RK1
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6			// round input goes into lane 0
	// optimize sbox using AESE instruction
	// byte permute by v26 (presumably pre-compensates the ShiftRows
	// step of AESE - TODO confirm), then a per-nibble table mapping
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4		// high nibbles
	and	v0.16b, v0.16b, v31.16b		// low nibbles
	tbl	v0.16b, {v28.16b}, v0.16b	// low-nibble lookup
	tbl	v2.16b, {v27.16b}, v2.16b	// high-nibble lookup
	eor	v0.16b, v0.16b, v2.16b		// combined input-side mapping
	eor	v1.16b, v1.16b, v1.16b		// v1 = 0, i.e. an all-zero AES round key
	aese	v0.16b,v1.16b
	// same nibble-split mapping on the AESE result, tables v29/v30
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// only lane 0 of the result is used:
	// w6 = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
	// (ror by 32-n is a left rotate by n)
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8			// w7 = RK2, w8 = RK3
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// reassemble the block with the four words in reverse order
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b			// back to memory byte order
#endif
	// out = in ^ enciphered counter; x0/x1 are not advanced here
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	ret
1:
	// general path: zero blocks, or two and more
	AARCH64_SIGN_LINK_REGISTER
	// 80-byte frame: d8-d15 (callee-saved, used below) and x29/x30
	// (x30 is overwritten by the bl instructions)
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	mov	w12,v3.s[0]			// counter words 0-2: constant
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w5,v3.s[3]			// counter word 3: incremented per block
.Lctr32_4_blocks_process:
	cmp	w2,#4
	b.lt	1f				// fewer than 4 left: tail handling
	// word-sliced counters for 4 blocks: v4-v6 broadcast the fixed
	// words, v7 lane i carries the counter value of block i
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	add	w5,w5,#1
	mov	v7.s[2],w5
	add	w5,w5,#1
	mov	v7.s[3],w5
	add	w5,w5,#1
	cmp	w2,#8
	b.ge	.Lctr32_8_blocks_process
	// helper defined elsewhere in this file; used here as
	// input v4-v7, result v0-v3
	bl	_vpsm4_ex_enc_4blks
	// ld4 de-interleaves 32-bit words: v12 = word 0 of each of the
	// four input blocks, v13 = word 1, and so on
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64	// re-interleave on store
	subs	w2,w2,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	// second group of four counters in v8-v11 (counter values +4..+7)
	dup	v8.4s,w12
	dup	v9.4s,w13
	dup	v10.4s,w14
	mov	v11.s[0],w5
	add	w5,w5,#1
	mov	v11.s[1],w5
	add	w5,w5,#1
	mov	v11.s[2],w5
	add	w5,w5,#1
	mov	v11.s[3],w5
	add	w5,w5,#1
	// helper defined elsewhere in this file; used here as
	// input v4-v11, result v0-v7
	bl	_vpsm4_ex_enc_8blks
	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	eor	v4.16b,v4.16b,v8.16b
	eor	v5.16b,v5.16b,v9.16b
	eor	v6.16b,v6.16b,v10.16b
	eor	v7.16b,v7.16b,v11.16b
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.ne	.Lctr32_4_blocks_process	// re-dispatch on what is left
	b	100f
1:	// last block processing
	// here w2 is below 4
	subs	w2,w2,#1
	b.lt	100f				// nothing left
	b.gt	1f				// two or three left
	// exactly one block left: same scalar rounds as the fast path
	mov	v3.s[0],w12
	mov	v3.s[1],w13
	mov	v3.s[2],w14
	mov	v3.s[3],w5
	mov	x10,x3
	mov	w11,#8
	mov	w12,v3.s[0]
	mov	w13,v3.s[1]
	mov	w14,v3.s[2]
	mov	w15,v3.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// reassemble the block with the four words in reverse order
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	b	100f
1:	// last 2 blocks processing
	// only lanes 0 and 1 of v7 are set here; the helper still works on
	// all four lanes, and the unused lanes are never stored
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	subs	w2,w2,#1
	b.ne	1f				// a third block remains
	bl	_vpsm4_ex_enc_4blks
	// per-lane ld4/st4: move one 16-byte block at a time
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b	100f
1:	// last 3 blocks processing
	add	w5,w5,#1
	mov	v7.s[2],w5			// third counter value in lane 2
	bl	_vpsm4_ex_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	// common exit of the general path: undo the 80-byte frame
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks
.globl vpsm4_ex_xts_encrypt_gb2550.type 
vpsm4_ex_xts_encrypt_gb,%function
// NOTE: the operand line above finishes a directive that is opened on the
// preceding source line; it is kept exactly as found.

// ---------------------------------------------------------------------------
// Assembler macros used only by vpsm4_ex_xts_encrypt_gb below.
// Each one expands to the same instruction sequence that was previously
// written out inline, in the same order.
//
// Fixed register roles assumed by the macros:
//   v26-v31  constants loaded from .Lsbox_magic by the function prologue
//   v0-v3    vector scratch
//   w6-w9    integer scratch
//   x10/w11  round-key cursor / iteration counter of the single-block pass
//   w12-w15  the four 32-bit words of the block being processed
// ---------------------------------------------------------------------------

// Byte-swap every 32-bit lane of \blk on little-endian builds only.
.macro sm4ex_gb_rev32_le blk
#ifndef __AARCH64EB__
    rev32   \blk\().16b,\blk\().16b
#endif
.endm

// w6 <- w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24),
// where w7 is the S-box substitution of the incoming w6.
// Clobbers v0-v3 and w7.
.macro sm4ex_gb_sbox_lin
    mov     v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl     v0.16b, {v3.16b}, v26.16b
    ushr    v2.16b, v0.16b, 4
    and     v0.16b, v0.16b, v31.16b
    tbl     v0.16b, {v28.16b}, v0.16b
    tbl     v2.16b, {v27.16b}, v2.16b
    eor     v0.16b, v0.16b, v2.16b
    eor     v1.16b, v1.16b, v1.16b          // all-zero key operand for AESE
    aese    v0.16b,v1.16b
    ushr    v2.16b, v0.16b, 4
    and     v0.16b, v0.16b, v31.16b
    tbl     v0.16b, {v30.16b}, v0.16b
    tbl     v2.16b, {v29.16b}, v2.16b
    eor     v0.16b, v0.16b, v2.16b

    mov     w7,v0.s[0]
    eor     w6,w7,w7,ror #32-2
    eor     w6,w6,w7,ror #32-10
    eor     w6,w6,w7,ror #32-18
    eor     w6,w6,w7,ror #32-24
.endm

// Run one 16-byte block held in \blk through 8 iterations of 4 rounds,
// reading the round keys through x3 (two 32-bit keys per ldp), and write
// the words back in reversed order. \blk is updated in place.
// Clobbers x10, w11-w15, w6-w9, v0-v3 and the flags.
.macro sm4ex_gb_crypt_block blk
    sm4ex_gb_rev32_le \blk
    mov     x10,x3
    mov     w11,#8
    mov     w12,\blk\().s[0]
    mov     w13,\blk\().s[1]
    mov     w14,\blk\().s[2]
    mov     w15,\blk\().s[3]
10:
    ldp     w7,w8,[x10],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor     w6,w14,w15
    eor     w9,w7,w13
    eor     w6,w6,w9
    sm4ex_gb_sbox_lin
    eor     w12,w12,w6
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor     w6,w14,w15
    eor     w9,w12,w8
    eor     w6,w6,w9
    sm4ex_gb_sbox_lin
    ldp     w7,w8,[x10],8                   // fetch RK2/RK3 for the next two rounds
    eor     w13,w13,w6
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor     w6,w12,w13
    eor     w9,w7,w15
    eor     w6,w6,w9
    sm4ex_gb_sbox_lin
    eor     w14,w14,w6
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor     w6,w12,w13
    eor     w9,w14,w8
    eor     w6,w6,w9
    sm4ex_gb_sbox_lin
    eor     w15,w15,w6
    subs    w11,w11,#1
    b.ne    10b
    mov     \blk\().s[0],w15
    mov     \blk\().s[1],w14
    mov     \blk\().s[2],w13
    mov     \blk\().s[3],w12
    sm4ex_gb_rev32_le \blk
.endm

// (out_hi:out_lo) <- (in_hi:in_lo) shifted left by one bit as a 128-bit
// value, with 0x87 XORed into the low word when bit 63 of in_hi was set.
// Clobbers w7, x8, x9. Does not touch the flags.
.macro sm4ex_gb_next_tweak out_lo, out_hi, in_lo, in_hi
    mov     w7,0x87
    extr    x9,\in_hi,\in_hi,#32
    extr    \out_hi,\in_hi,\in_lo,#63
    and     w8,w7,w9,asr#31
    eor     \out_lo,x8,\in_lo,lsl#1
.endm

// Copy the tweak held in (cur_hi:cur_lo) into vector \vt, then replace the
// GPR pair with the tweak that follows (prev_hi:prev_lo).
// Does not touch the flags.
.macro sm4ex_gb_emit_tweak vt, cur_lo, cur_hi, prev_lo, prev_hi
    mov     \vt\().d[0],\cur_lo
    mov     \vt\().d[1],\cur_hi
#ifdef __AARCH64EB__
    rev32   \vt\().16b,\vt\().16b
#endif
    sm4ex_gb_next_tweak \cur_lo, \cur_hi, \prev_lo, \prev_hi
.endm

// In-place 4x4 transpose of 32-bit lanes across \ra..\rd, using \ta..\td
// as temporaries.
.macro sm4ex_gb_transpose ra, rb, rc, rd, ta, tb, tc, td
    zip1    \ta\().4s,\ra\().4s,\rb\().4s
    zip2    \tb\().4s,\ra\().4s,\rb\().4s
    zip1    \tc\().4s,\rc\().4s,\rd\().4s
    zip2    \td\().4s,\rc\().4s,\rd\().4s
    zip1    \ra\().2d,\ta\().2d,\tc\().2d
    zip2    \rb\().2d,\ta\().2d,\tc\().2d
    zip1    \rc\().2d,\tb\().2d,\td\().2d
    zip2    \rd\().2d,\tb\().2d,\td\().2d
.endm

// \dst <- next tweak after \src, computed entirely in vector registers:
// \src is bit-reversed, shifted left by one in every byte, the bit shifted
// out of each byte is carried into its neighbour through ext/ushr and a
// multiply by the .Lxts_magic constant, and the result is bit-reversed back.
// Clobbers v0, v1, v2, x9.
.macro sm4ex_gb_double_tweak dst, src
    rbit    v2.16b,\src\().16b
    adrp    x9, .Lxts_magic
    ldr     q0, [x9, #:lo12:.Lxts_magic]
    shl     \dst\().16b, v2.16b, #1
    ext     v1.16b, v2.16b, v2.16b,#15
    ushr    v1.16b, v1.16b, #7
    mul     v1.16b, v1.16b, v0.16b
    eor     \dst\().16b, \dst\().16b, v1.16b
    rbit    \dst\().16b,\dst\().16b
.endm

// ---------------------------------------------------------------------------
// vpsm4_ex_xts_encrypt_gb
//
// SM4 in XTS mode, "_gb" variant: every tweak is bit-reversed (rbit) before
// it is doubled and bit-reversed again before it is XORed into the data.
//
// Arguments, as consumed by the code below:
//   x0  input pointer (read with post-increment)
//   x1  output pointer (written with post-increment)
//   x2  length in bytes; x2 >> 4 whole blocks, x2 & 15 trailing bytes.
//       Nothing is written when the length is below 16.
//   x3  round keys applied to the data blocks
//   x4  round keys applied to the initial tweak
//   x5  pointer to the 16-byte initial tweak
//   w6  1 selects encryption; any other value takes the decryption path,
//       which swaps the last two tweaks
//
// Register use inside the body:
//   x12..x27  eight consecutive tweaks as (lo,hi) pairs
//   v16..v23  the same tweaks in vector form, v25 the last tweak consumed
//   w28       copy of w6, x29 trailing byte count
//
// All of x15..x30 and d8..d15 are saved on entry and restored on exit.
// NOTE(review): x18 is used as a tweak register here; AAPCS64 reserves x18
// as the platform register on some targets - confirm this is acceptable.
//
// _vpsm4_ex_enc_4blks and _vpsm4_ex_enc_8blks are defined elsewhere in this
// file; as used here they take transposed blocks in v4-v7 (v4-v11) and
// leave their results in v0-v3 (v0-v7).
// ---------------------------------------------------------------------------
.align 5
vpsm4_ex_xts_encrypt_gb:
    AARCH64_SIGN_LINK_REGISTER
    stp     x15, x16, [sp, #-0x10]!
    stp     x17, x18, [sp, #-0x10]!
    stp     x19, x20, [sp, #-0x10]!
    stp     x21, x22, [sp, #-0x10]!
    stp     x23, x24, [sp, #-0x10]!
    stp     x25, x26, [sp, #-0x10]!
    stp     x27, x28, [sp, #-0x10]!
    stp     x29, x30, [sp, #-0x10]!
    stp     d8, d9, [sp, #-0x10]!
    stp     d10, d11, [sp, #-0x10]!
    stp     d12, d13, [sp, #-0x10]!
    stp     d14, d15, [sp, #-0x10]!

    // Park the data key pointer in x26 and select the tweak key in x3.
    mov     x26,x3
    mov     x27,x4
    mov     w28,w6
    ld1     {v16.4s}, [x5]
    mov     x3,x27
    adrp    x9, .Lsbox_magic
    ldr     q26, [x9, #:lo12:.Lsbox_magic]
    ldr     q27, [x9, #:lo12:.Lsbox_magic+16]
    ldr     q28, [x9, #:lo12:.Lsbox_magic+32]
    ldr     q29, [x9, #:lo12:.Lsbox_magic+48]
    ldr     q30, [x9, #:lo12:.Lsbox_magic+64]
    ldr     q31, [x9, #:lo12:.Lsbox_magic+80]

    // Transform the initial tweak in v16 with the key passed in x4.
    sm4ex_gb_crypt_block v16

    // From here on x3 is the data key again.
    mov     x3,x26
    and     x29,x2,#0x0F
    // convert length into blocks
    lsr     x2,x2,4
    cmp     x2,#1
    b.lt    .return_gb

    cmp     x29,0
    // If the length is a multiple of 16, every block is handled in
    // .xts_encrypt_blocks_gb
    b.eq    .xts_encrypt_blocks_gb

    // Otherwise the last full block and the partial block are handled in
    // .last_2blks_tweak_gb or .only_2blks_tweak_gb, and the blocks before
    // them in .xts_encrypt_blocks_gb
    subs    x2,x2,#1
    b.eq    .only_2blks_tweak_gb

.xts_encrypt_blocks_gb:
    // Move the bit-reversed first tweak to x13:x12 and derive the next seven.
    rbit    v16.16b,v16.16b
#ifdef __AARCH64EB__
    rev32   v16.16b,v16.16b
#endif
    mov     x12,v16.d[0]
    mov     x13,v16.d[1]
    sm4ex_gb_next_tweak x14, x15, x12, x13
    sm4ex_gb_next_tweak x16, x17, x14, x15
    sm4ex_gb_next_tweak x18, x19, x16, x17
    sm4ex_gb_next_tweak x20, x21, x18, x19
    sm4ex_gb_next_tweak x22, x23, x20, x21
    sm4ex_gb_next_tweak x24, x25, x22, x23
    sm4ex_gb_next_tweak x26, x27, x24, x25

.Lxts_8_blocks_process_gb:
    // The flags set here are consumed by the b.lt after the tweak updates;
    // none of the instructions in between modify them.
    cmp     x2,#8
    sm4ex_gb_emit_tweak v16, x12, x13, x26, x27
    sm4ex_gb_emit_tweak v17, x14, x15, x12, x13
    sm4ex_gb_emit_tweak v18, x16, x17, x14, x15
    sm4ex_gb_emit_tweak v19, x18, x19, x16, x17
    sm4ex_gb_emit_tweak v20, x20, x21, x18, x19
    sm4ex_gb_emit_tweak v21, x22, x23, x20, x21
    sm4ex_gb_emit_tweak v22, x24, x25, x22, x23
    sm4ex_gb_emit_tweak v23, x26, x27, x24, x25
    b.lt    .Lxts_4_blocks_process_gb

    // Eight blocks: XOR with the bit-reversed tweaks, run the bulk helper,
    // XOR with the same tweaks again.
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
    rbit    v16.16b,v16.16b
    rbit    v17.16b,v17.16b
    rbit    v18.16b,v18.16b
    rbit    v19.16b,v19.16b
    eor     v4.16b, v4.16b, v16.16b
    eor     v5.16b, v5.16b, v17.16b
    eor     v6.16b, v6.16b, v18.16b
    eor     v7.16b, v7.16b, v19.16b
    ld1     {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
    rbit    v20.16b,v20.16b
    rbit    v21.16b,v21.16b
    rbit    v22.16b,v22.16b
    rbit    v23.16b,v23.16b
    eor     v8.16b, v8.16b, v20.16b
    eor     v9.16b, v9.16b, v21.16b
    eor     v10.16b, v10.16b, v22.16b
    eor     v11.16b, v11.16b, v23.16b
    sm4ex_gb_rev32_le v4
    sm4ex_gb_rev32_le v5
    sm4ex_gb_rev32_le v6
    sm4ex_gb_rev32_le v7
    sm4ex_gb_rev32_le v8
    sm4ex_gb_rev32_le v9
    sm4ex_gb_rev32_le v10
    sm4ex_gb_rev32_le v11
    sm4ex_gb_transpose v4, v5, v6, v7, v0, v1, v2, v3
    sm4ex_gb_transpose v8, v9, v10, v11, v0, v1, v2, v3
    bl      _vpsm4_ex_enc_8blks
    sm4ex_gb_transpose v0, v1, v2, v3, v8, v9, v10, v11
    sm4ex_gb_transpose v4, v5, v6, v7, v8, v9, v10, v11
    eor     v0.16b, v0.16b, v16.16b
    eor     v1.16b, v1.16b, v17.16b
    eor     v2.16b, v2.16b, v18.16b
    eor     v3.16b, v3.16b, v19.16b
    eor     v4.16b, v4.16b, v20.16b
    eor     v5.16b, v5.16b, v21.16b
    eor     v6.16b, v6.16b, v22.16b
    eor     v7.16b, v7.16b, v23.16b

    // save the last tweak
    mov     v25.16b,v23.16b
    st1     {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
    st1     {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
    subs    x2,x2,#8
    b.gt    .Lxts_8_blocks_process_gb
    b       100f

.Lxts_4_blocks_process_gb:
    cmp     x2,#4
    b.lt    1f
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
    rbit    v16.16b,v16.16b
    rbit    v17.16b,v17.16b
    rbit    v18.16b,v18.16b
    rbit    v19.16b,v19.16b
    eor     v4.16b, v4.16b, v16.16b
    eor     v5.16b, v5.16b, v17.16b
    eor     v6.16b, v6.16b, v18.16b
    eor     v7.16b, v7.16b, v19.16b
    sm4ex_gb_rev32_le v4
    sm4ex_gb_rev32_le v5
    sm4ex_gb_rev32_le v6
    sm4ex_gb_rev32_le v7
    sm4ex_gb_transpose v4, v5, v6, v7, v0, v1, v2, v3
    bl      _vpsm4_ex_enc_4blks
    sm4ex_gb_transpose v0, v1, v2, v3, v4, v5, v6, v7
    eor     v0.16b, v0.16b, v16.16b
    eor     v1.16b, v1.16b, v17.16b
    eor     v2.16b, v2.16b, v18.16b
    eor     v3.16b, v3.16b, v19.16b
    st1     {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
    sub     x2,x2,#4
    // The next three tweaks (not yet bit-reversed) become the current ones.
    mov     v16.16b,v20.16b
    mov     v17.16b,v21.16b
    mov     v18.16b,v22.16b
    // save the last tweak
    mov     v25.16b,v19.16b
1:
    // process last block
    cmp     x2,#1
    b.lt    100f
    b.gt    1f
    ld1     {v4.4s},[x0],#16
    rbit    v16.16b,v16.16b
    eor     v4.16b, v4.16b, v16.16b
    sm4ex_gb_crypt_block v4
    eor     v4.16b, v4.16b, v16.16b
    st1     {v4.4s},[x1],#16
    // save the last tweak
    mov     v25.16b,v16.16b
    b       100f
1:  // process last 2 blocks
    cmp     x2,#2
    b.gt    1f
    ld1     {v4.4s,v5.4s},[x0],#32
    rbit    v16.16b,v16.16b
    rbit    v17.16b,v17.16b
    eor     v4.16b, v4.16b, v16.16b
    eor     v5.16b, v5.16b, v17.16b
    sm4ex_gb_rev32_le v4
    sm4ex_gb_rev32_le v5
    // NOTE(review): v6 and v7 are not loaded on this path; whatever they
    // hold goes through the helper and only v0/v1 are stored afterwards.
    sm4ex_gb_transpose v4, v5, v6, v7, v0, v1, v2, v3
    bl      _vpsm4_ex_enc_4blks
    sm4ex_gb_transpose v0, v1, v2, v3, v4, v5, v6, v7
    eor     v0.16b, v0.16b, v16.16b
    eor     v1.16b, v1.16b, v17.16b
    st1     {v0.4s,v1.4s},[x1],#32
    // save the last tweak
    mov     v25.16b,v17.16b
    b       100f
1:  // process last 3 blocks
    ld1     {v4.4s,v5.4s,v6.4s},[x0],#48
    rbit    v16.16b,v16.16b
    rbit    v17.16b,v17.16b
    rbit    v18.16b,v18.16b
    eor     v4.16b, v4.16b, v16.16b
    eor     v5.16b, v5.16b, v17.16b
    eor     v6.16b, v6.16b, v18.16b
    sm4ex_gb_rev32_le v4
    sm4ex_gb_rev32_le v5
    sm4ex_gb_rev32_le v6
    // NOTE(review): v7 is not loaded on this path; only v0-v2 are stored.
    sm4ex_gb_transpose v4, v5, v6, v7, v0, v1, v2, v3
    bl      _vpsm4_ex_enc_4blks
    sm4ex_gb_transpose v0, v1, v2, v3, v4, v5, v6, v7
    eor     v0.16b, v0.16b, v16.16b
    eor     v1.16b, v1.16b, v17.16b
    eor     v2.16b, v2.16b, v18.16b
    st1     {v0.4s,v1.4s,v2.4s},[x1],#48
    // save the last tweak
    mov     v25.16b,v18.16b
100:
    cmp     x29,0
    b.eq    .return_gb

// This branch calculates the last two tweaks
// when the length is larger than 32
.last_2blks_tweak_gb:
#ifdef __AARCH64EB__
    rev32   v25.16b,v25.16b
#endif
    sm4ex_gb_double_tweak v17, v25
    sm4ex_gb_double_tweak v18, v17
    b       .check_dec_gb


// This branch calculates the last two tweaks
// when exactly one full block precedes the partial block,
// so only two tweaks are needed
.only_2blks_tweak_gb:
    mov     v17.16b,v16.16b
#ifdef __AARCH64EB__
    rev32   v17.16b,v17.16b
#endif
    sm4ex_gb_double_tweak v18, v17
    b       .check_dec_gb


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
    // encryption:1 decryption:0
    cmp     w28,1
    b.eq    .process_last_2blks_gb
    mov     v0.16B,v17.16b
    mov     v17.16B,v18.16b
    mov     v18.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
    rev32   v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
    rev32   v18.16b,v18.16b
#endif
    // Last full block, processed with tweak v17.
    ld1     {v4.4s},[x0],#16
    eor     v4.16b, v4.16b, v17.16b
    sm4ex_gb_crypt_block v4
    eor     v4.16b, v4.16b, v17.16b
    st1     {v4.4s},[x1],#16

    // x26 points at the block just written. For each trailing byte index,
    // from x29-1 down to 0: the byte of that block moves to the partial
    // output at x1, and the input tail byte from x0 takes its place.
    sub     x26,x1,16
.loop_gb:
    subs    x29,x29,1
    ldrb    w7,[x26,x29]
    ldrb    w8,[x0,x29]
    strb    w8,[x26,x29]
    strb    w7,[x1,x29]
    b.gt    .loop_gb                        // flags still come from the subs above

    // Process the patched block once more, with tweak v18, and store it in place.
    ld1     {v4.4s}, [x26]
    eor     v4.16b, v4.16b, v18.16b
    sm4ex_gb_crypt_block v4
    eor     v4.16b, v4.16b, v18.16b
    st1     {v4.4s}, [x26]

.return_gb:
    ldp     d14, d15, [sp], #0x10
    ldp     d12, d13, [sp], #0x10
    ldp     d10, d11, [sp], #0x10
    ldp     d8, d9, [sp], #0x10
    ldp     x29, x30, [sp], #0x10
    ldp     x27, x28, [sp], #0x10
    ldp     x25, x26, [sp], #0x10
    ldp     x23, x24, [sp], #0x10
    ldp     x21, x22, [sp], #0x10
    ldp     x19, x20, [sp], #0x10
    ldp     x17, x18, [sp], #0x10
    ldp     x15, x16, [sp], #0x10
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb
// The rest of this source line belongs to vpsm4_ex_xts_encrypt, which runs
// past the end of this chunk; it is carried over verbatim and unreviewed.
3553.globl vpsm4_ex_xts_encrypt3554.type vpsm4_ex_xts_encrypt,%function3555.align 53556vpsm4_ex_xts_encrypt:3557AARCH64_SIGN_LINK_REGISTER3558stp x15, x16, [sp, #-0x10]!3559stp x17, x18, [sp, #-0x10]!3560stp x19, x20, [sp, #-0x10]!3561stp x21, x22, [sp, #-0x10]!3562stp x23, x24, [sp, #-0x10]!3563stp x25, x26, [sp, #-0x10]!3564stp x27, x28, [sp, #-0x10]!3565stp x29, x30, [sp, #-0x10]!3566stp d8, d9, [sp, #-0x10]!3567stp d10, d11, [sp, #-0x10]!3568stp d12, d13, [sp, #-0x10]!3569stp d14, d15, [sp, #-0x10]!3570mov x26,x33571mov x27,x43572mov w28,w63573ld1 {v16.4s}, [x5]3574mov x3,x273575adrp x9, .Lsbox_magic3576ldr q26, [x9, #:lo12:.Lsbox_magic]3577ldr q27, [x9, #:lo12:.Lsbox_magic+16]3578ldr q28, [x9, #:lo12:.Lsbox_magic+32]3579ldr q29, [x9, #:lo12:.Lsbox_magic+48]3580ldr q30, [x9, #:lo12:.Lsbox_magic+64]3581ldr q31, [x9, #:lo12:.Lsbox_magic+80]3582#ifndef __AARCH64EB__3583rev32 v16.16b,v16.16b3584#endif3585mov
x10,x33586mov w11,#83587mov w12,v16.s[0]3588mov w13,v16.s[1]3589mov w14,v16.s[2]3590mov w15,v16.s[3]359110:3592ldp w7,w8,[x10],83593// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3594eor w6,w14,w153595eor w9,w7,w133596eor w6,w6,w93597mov v3.s[0],w63598// optimize sbox using AESE instruction3599tbl v0.16b, {v3.16b}, v26.16b3600ushr v2.16b, v0.16b, 43601and v0.16b, v0.16b, v31.16b3602tbl v0.16b, {v28.16b}, v0.16b3603tbl v2.16b, {v27.16b}, v2.16b3604eor v0.16b, v0.16b, v2.16b3605eor v1.16b, v1.16b, v1.16b3606aese v0.16b,v1.16b3607ushr v2.16b, v0.16b, 43608and v0.16b, v0.16b, v31.16b3609tbl v0.16b, {v30.16b}, v0.16b3610tbl v2.16b, {v29.16b}, v2.16b3611eor v0.16b, v0.16b, v2.16b36123613mov w7,v0.s[0]3614eor w6,w7,w7,ror #32-23615eor w6,w6,w7,ror #32-103616eor w6,w6,w7,ror #32-183617eor w6,w6,w7,ror #32-243618eor w12,w12,w63619// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)3620eor w6,w14,w153621eor w9,w12,w83622eor w6,w6,w93623mov v3.s[0],w63624// optimize sbox using AESE instruction3625tbl v0.16b, {v3.16b}, v26.16b3626ushr v2.16b, v0.16b, 43627and v0.16b, v0.16b, v31.16b3628tbl v0.16b, {v28.16b}, v0.16b3629tbl v2.16b, {v27.16b}, v2.16b3630eor v0.16b, v0.16b, v2.16b3631eor v1.16b, v1.16b, v1.16b3632aese v0.16b,v1.16b3633ushr v2.16b, v0.16b, 43634and v0.16b, v0.16b, v31.16b3635tbl v0.16b, {v30.16b}, v0.16b3636tbl v2.16b, {v29.16b}, v2.16b3637eor v0.16b, v0.16b, v2.16b36383639mov w7,v0.s[0]3640eor w6,w7,w7,ror #32-23641eor w6,w6,w7,ror #32-103642eor w6,w6,w7,ror #32-183643eor w6,w6,w7,ror #32-243644ldp w7,w8,[x10],83645eor w13,w13,w63646// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)3647eor w6,w12,w133648eor w9,w7,w153649eor w6,w6,w93650mov v3.s[0],w63651// optimize sbox using AESE instruction3652tbl v0.16b, {v3.16b}, v26.16b3653ushr v2.16b, v0.16b, 43654and v0.16b, v0.16b, v31.16b3655tbl v0.16b, {v28.16b}, v0.16b3656tbl v2.16b, {v27.16b}, v2.16b3657eor v0.16b, v0.16b, v2.16b3658eor v1.16b, v1.16b, v1.16b3659aese v0.16b,v1.16b3660ushr v2.16b, v0.16b, 43661and v0.16b, v0.16b, v31.16b3662tbl v0.16b, {v30.16b}, 
v0.16b3663tbl v2.16b, {v29.16b}, v2.16b3664eor v0.16b, v0.16b, v2.16b36653666mov w7,v0.s[0]3667eor w6,w7,w7,ror #32-23668eor w6,w6,w7,ror #32-103669eor w6,w6,w7,ror #32-183670eor w6,w6,w7,ror #32-243671eor w14,w14,w63672// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)3673eor w6,w12,w133674eor w9,w14,w83675eor w6,w6,w93676mov v3.s[0],w63677// optimize sbox using AESE instruction3678tbl v0.16b, {v3.16b}, v26.16b3679ushr v2.16b, v0.16b, 43680and v0.16b, v0.16b, v31.16b3681tbl v0.16b, {v28.16b}, v0.16b3682tbl v2.16b, {v27.16b}, v2.16b3683eor v0.16b, v0.16b, v2.16b3684eor v1.16b, v1.16b, v1.16b3685aese v0.16b,v1.16b3686ushr v2.16b, v0.16b, 43687and v0.16b, v0.16b, v31.16b3688tbl v0.16b, {v30.16b}, v0.16b3689tbl v2.16b, {v29.16b}, v2.16b3690eor v0.16b, v0.16b, v2.16b36913692mov w7,v0.s[0]3693eor w6,w7,w7,ror #32-23694eor w6,w6,w7,ror #32-103695eor w6,w6,w7,ror #32-183696eor w6,w6,w7,ror #32-243697eor w15,w15,w63698subs w11,w11,#13699b.ne 10b3700mov v16.s[0],w153701mov v16.s[1],w143702mov v16.s[2],w133703mov v16.s[3],w123704#ifndef __AARCH64EB__3705rev32 v16.16b,v16.16b3706#endif3707mov x3,x263708and x29,x2,#0x0F3709// convert length into blocks3710lsr x2,x2,43711cmp x2,#13712b.lt .return37133714cmp x29,03715// If the encryption/decryption Length is N times of 16,3716// the all blocks are encrypted/decrypted in .xts_encrypt_blocks3717b.eq .xts_encrypt_blocks37183719// If the encryption/decryption length is not N times of 16,3720// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak3721// the other blocks are encrypted/decrypted in .xts_encrypt_blocks3722subs x2,x2,#13723b.eq .only_2blks_tweak3724.xts_encrypt_blocks:3725#ifdef __AARCH64EB__3726rev32 v16.16b,v16.16b3727#endif3728mov x12,v16.d[0]3729mov x13,v16.d[1]3730mov w7,0x873731extr x9,x13,x13,#323732extr x15,x13,x12,#633733and w8,w7,w9,asr#313734eor x14,x8,x12,lsl#13735mov w7,0x873736extr x9,x15,x15,#323737extr x17,x15,x14,#633738and w8,w7,w9,asr#313739eor x16,x8,x14,lsl#13740mov w7,0x873741extr 
x9,x17,x17,#323742extr x19,x17,x16,#633743and w8,w7,w9,asr#313744eor x18,x8,x16,lsl#13745mov w7,0x873746extr x9,x19,x19,#323747extr x21,x19,x18,#633748and w8,w7,w9,asr#313749eor x20,x8,x18,lsl#13750mov w7,0x873751extr x9,x21,x21,#323752extr x23,x21,x20,#633753and w8,w7,w9,asr#313754eor x22,x8,x20,lsl#13755mov w7,0x873756extr x9,x23,x23,#323757extr x25,x23,x22,#633758and w8,w7,w9,asr#313759eor x24,x8,x22,lsl#13760mov w7,0x873761extr x9,x25,x25,#323762extr x27,x25,x24,#633763and w8,w7,w9,asr#313764eor x26,x8,x24,lsl#13765.Lxts_8_blocks_process:3766cmp x2,#83767mov v16.d[0],x123768mov v16.d[1],x133769#ifdef __AARCH64EB__3770rev32 v16.16b,v16.16b3771#endif3772mov w7,0x873773extr x9,x27,x27,#323774extr x13,x27,x26,#633775and w8,w7,w9,asr#313776eor x12,x8,x26,lsl#13777mov v17.d[0],x143778mov v17.d[1],x153779#ifdef __AARCH64EB__3780rev32 v17.16b,v17.16b3781#endif3782mov w7,0x873783extr x9,x13,x13,#323784extr x15,x13,x12,#633785and w8,w7,w9,asr#313786eor x14,x8,x12,lsl#13787mov v18.d[0],x163788mov v18.d[1],x173789#ifdef __AARCH64EB__3790rev32 v18.16b,v18.16b3791#endif3792mov w7,0x873793extr x9,x15,x15,#323794extr x17,x15,x14,#633795and w8,w7,w9,asr#313796eor x16,x8,x14,lsl#13797mov v19.d[0],x183798mov v19.d[1],x193799#ifdef __AARCH64EB__3800rev32 v19.16b,v19.16b3801#endif3802mov w7,0x873803extr x9,x17,x17,#323804extr x19,x17,x16,#633805and w8,w7,w9,asr#313806eor x18,x8,x16,lsl#13807mov v20.d[0],x203808mov v20.d[1],x213809#ifdef __AARCH64EB__3810rev32 v20.16b,v20.16b3811#endif3812mov w7,0x873813extr x9,x19,x19,#323814extr x21,x19,x18,#633815and w8,w7,w9,asr#313816eor x20,x8,x18,lsl#13817mov v21.d[0],x223818mov v21.d[1],x233819#ifdef __AARCH64EB__3820rev32 v21.16b,v21.16b3821#endif3822mov w7,0x873823extr x9,x21,x21,#323824extr x23,x21,x20,#633825and w8,w7,w9,asr#313826eor x22,x8,x20,lsl#13827mov v22.d[0],x243828mov v22.d[1],x253829#ifdef __AARCH64EB__3830rev32 v22.16b,v22.16b3831#endif3832mov w7,0x873833extr x9,x23,x23,#323834extr x25,x23,x22,#633835and 
w8,w7,w9,asr#313836eor x24,x8,x22,lsl#13837mov v23.d[0],x263838mov v23.d[1],x273839#ifdef __AARCH64EB__3840rev32 v23.16b,v23.16b3841#endif3842mov w7,0x873843extr x9,x25,x25,#323844extr x27,x25,x24,#633845and w8,w7,w9,asr#313846eor x26,x8,x24,lsl#13847b.lt .Lxts_4_blocks_process3848ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#643849eor v4.16b, v4.16b, v16.16b3850eor v5.16b, v5.16b, v17.16b3851eor v6.16b, v6.16b, v18.16b3852eor v7.16b, v7.16b, v19.16b3853ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#643854eor v8.16b, v8.16b, v20.16b3855eor v9.16b, v9.16b, v21.16b3856eor v10.16b, v10.16b, v22.16b3857eor v11.16b, v11.16b, v23.16b3858#ifndef __AARCH64EB__3859rev32 v4.16b,v4.16b3860#endif3861#ifndef __AARCH64EB__3862rev32 v5.16b,v5.16b3863#endif3864#ifndef __AARCH64EB__3865rev32 v6.16b,v6.16b3866#endif3867#ifndef __AARCH64EB__3868rev32 v7.16b,v7.16b3869#endif3870#ifndef __AARCH64EB__3871rev32 v8.16b,v8.16b3872#endif3873#ifndef __AARCH64EB__3874rev32 v9.16b,v9.16b3875#endif3876#ifndef __AARCH64EB__3877rev32 v10.16b,v10.16b3878#endif3879#ifndef __AARCH64EB__3880rev32 v11.16b,v11.16b3881#endif3882zip1 v0.4s,v4.4s,v5.4s3883zip2 v1.4s,v4.4s,v5.4s3884zip1 v2.4s,v6.4s,v7.4s3885zip2 v3.4s,v6.4s,v7.4s3886zip1 v4.2d,v0.2d,v2.2d3887zip2 v5.2d,v0.2d,v2.2d3888zip1 v6.2d,v1.2d,v3.2d3889zip2 v7.2d,v1.2d,v3.2d3890zip1 v0.4s,v8.4s,v9.4s3891zip2 v1.4s,v8.4s,v9.4s3892zip1 v2.4s,v10.4s,v11.4s3893zip2 v3.4s,v10.4s,v11.4s3894zip1 v8.2d,v0.2d,v2.2d3895zip2 v9.2d,v0.2d,v2.2d3896zip1 v10.2d,v1.2d,v3.2d3897zip2 v11.2d,v1.2d,v3.2d3898bl _vpsm4_ex_enc_8blks3899zip1 v8.4s,v0.4s,v1.4s3900zip2 v9.4s,v0.4s,v1.4s3901zip1 v10.4s,v2.4s,v3.4s3902zip2 v11.4s,v2.4s,v3.4s3903zip1 v0.2d,v8.2d,v10.2d3904zip2 v1.2d,v8.2d,v10.2d3905zip1 v2.2d,v9.2d,v11.2d3906zip2 v3.2d,v9.2d,v11.2d3907zip1 v8.4s,v4.4s,v5.4s3908zip2 v9.4s,v4.4s,v5.4s3909zip1 v10.4s,v6.4s,v7.4s3910zip2 v11.4s,v6.4s,v7.4s3911zip1 v4.2d,v8.2d,v10.2d3912zip2 v5.2d,v8.2d,v10.2d3913zip1 v6.2d,v9.2d,v11.2d3914zip2 v7.2d,v9.2d,v11.2d3915eor v0.16b, v0.16b, v16.16b3916eor 
v1.16b, v1.16b, v17.16b3917eor v2.16b, v2.16b, v18.16b3918eor v3.16b, v3.16b, v19.16b3919eor v4.16b, v4.16b, v20.16b3920eor v5.16b, v5.16b, v21.16b3921eor v6.16b, v6.16b, v22.16b3922eor v7.16b, v7.16b, v23.16b39233924// save the last tweak3925mov v25.16b,v23.16b3926st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#643927st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#643928subs x2,x2,#83929b.gt .Lxts_8_blocks_process3930b 100f3931.Lxts_4_blocks_process:3932cmp x2,#43933b.lt 1f3934ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#643935eor v4.16b, v4.16b, v16.16b3936eor v5.16b, v5.16b, v17.16b3937eor v6.16b, v6.16b, v18.16b3938eor v7.16b, v7.16b, v19.16b3939#ifndef __AARCH64EB__3940rev32 v4.16b,v4.16b3941#endif3942#ifndef __AARCH64EB__3943rev32 v5.16b,v5.16b3944#endif3945#ifndef __AARCH64EB__3946rev32 v6.16b,v6.16b3947#endif3948#ifndef __AARCH64EB__3949rev32 v7.16b,v7.16b3950#endif3951zip1 v0.4s,v4.4s,v5.4s3952zip2 v1.4s,v4.4s,v5.4s3953zip1 v2.4s,v6.4s,v7.4s3954zip2 v3.4s,v6.4s,v7.4s3955zip1 v4.2d,v0.2d,v2.2d3956zip2 v5.2d,v0.2d,v2.2d3957zip1 v6.2d,v1.2d,v3.2d3958zip2 v7.2d,v1.2d,v3.2d3959bl _vpsm4_ex_enc_4blks3960zip1 v4.4s,v0.4s,v1.4s3961zip2 v5.4s,v0.4s,v1.4s3962zip1 v6.4s,v2.4s,v3.4s3963zip2 v7.4s,v2.4s,v3.4s3964zip1 v0.2d,v4.2d,v6.2d3965zip2 v1.2d,v4.2d,v6.2d3966zip1 v2.2d,v5.2d,v7.2d3967zip2 v3.2d,v5.2d,v7.2d3968eor v0.16b, v0.16b, v16.16b3969eor v1.16b, v1.16b, v17.16b3970eor v2.16b, v2.16b, v18.16b3971eor v3.16b, v3.16b, v19.16b3972st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#643973sub x2,x2,#43974mov v16.16b,v20.16b3975mov v17.16b,v21.16b3976mov v18.16b,v22.16b3977// save the last tweak3978mov v25.16b,v19.16b39791:3980// process last block3981cmp x2,#13982b.lt 100f3983b.gt 1f3984ld1 {v4.4s},[x0],#163985eor v4.16b, v4.16b, v16.16b3986#ifndef __AARCH64EB__3987rev32 v4.16b,v4.16b3988#endif3989mov x10,x33990mov w11,#83991mov w12,v4.s[0]3992mov w13,v4.s[1]3993mov w14,v4.s[2]3994mov w15,v4.s[3]399510:3996ldp w7,w8,[x10],83997// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3998eor w6,w14,w153999eor w9,w7,w134000eor 
w6,w6,w94001mov v3.s[0],w64002// optimize sbox using AESE instruction4003tbl v0.16b, {v3.16b}, v26.16b4004ushr v2.16b, v0.16b, 44005and v0.16b, v0.16b, v31.16b4006tbl v0.16b, {v28.16b}, v0.16b4007tbl v2.16b, {v27.16b}, v2.16b4008eor v0.16b, v0.16b, v2.16b4009eor v1.16b, v1.16b, v1.16b4010aese v0.16b,v1.16b4011ushr v2.16b, v0.16b, 44012and v0.16b, v0.16b, v31.16b4013tbl v0.16b, {v30.16b}, v0.16b4014tbl v2.16b, {v29.16b}, v2.16b4015eor v0.16b, v0.16b, v2.16b40164017mov w7,v0.s[0]4018eor w6,w7,w7,ror #32-24019eor w6,w6,w7,ror #32-104020eor w6,w6,w7,ror #32-184021eor w6,w6,w7,ror #32-244022eor w12,w12,w64023// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4024eor w6,w14,w154025eor w9,w12,w84026eor w6,w6,w94027mov v3.s[0],w64028// optimize sbox using AESE instruction4029tbl v0.16b, {v3.16b}, v26.16b4030ushr v2.16b, v0.16b, 44031and v0.16b, v0.16b, v31.16b4032tbl v0.16b, {v28.16b}, v0.16b4033tbl v2.16b, {v27.16b}, v2.16b4034eor v0.16b, v0.16b, v2.16b4035eor v1.16b, v1.16b, v1.16b4036aese v0.16b,v1.16b4037ushr v2.16b, v0.16b, 44038and v0.16b, v0.16b, v31.16b4039tbl v0.16b, {v30.16b}, v0.16b4040tbl v2.16b, {v29.16b}, v2.16b4041eor v0.16b, v0.16b, v2.16b40424043mov w7,v0.s[0]4044eor w6,w7,w7,ror #32-24045eor w6,w6,w7,ror #32-104046eor w6,w6,w7,ror #32-184047eor w6,w6,w7,ror #32-244048ldp w7,w8,[x10],84049eor w13,w13,w64050// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4051eor w6,w12,w134052eor w9,w7,w154053eor w6,w6,w94054mov v3.s[0],w64055// optimize sbox using AESE instruction4056tbl v0.16b, {v3.16b}, v26.16b4057ushr v2.16b, v0.16b, 44058and v0.16b, v0.16b, v31.16b4059tbl v0.16b, {v28.16b}, v0.16b4060tbl v2.16b, {v27.16b}, v2.16b4061eor v0.16b, v0.16b, v2.16b4062eor v1.16b, v1.16b, v1.16b4063aese v0.16b,v1.16b4064ushr v2.16b, v0.16b, 44065and v0.16b, v0.16b, v31.16b4066tbl v0.16b, {v30.16b}, v0.16b4067tbl v2.16b, {v29.16b}, v2.16b4068eor v0.16b, v0.16b, v2.16b40694070mov w7,v0.s[0]4071eor w6,w7,w7,ror #32-24072eor w6,w6,w7,ror #32-104073eor w6,w6,w7,ror #32-184074eor w6,w6,w7,ror #32-244075eor 
w14,w14,w64076// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4077eor w6,w12,w134078eor w9,w14,w84079eor w6,w6,w94080mov v3.s[0],w64081// optimize sbox using AESE instruction4082tbl v0.16b, {v3.16b}, v26.16b4083ushr v2.16b, v0.16b, 44084and v0.16b, v0.16b, v31.16b4085tbl v0.16b, {v28.16b}, v0.16b4086tbl v2.16b, {v27.16b}, v2.16b4087eor v0.16b, v0.16b, v2.16b4088eor v1.16b, v1.16b, v1.16b4089aese v0.16b,v1.16b4090ushr v2.16b, v0.16b, 44091and v0.16b, v0.16b, v31.16b4092tbl v0.16b, {v30.16b}, v0.16b4093tbl v2.16b, {v29.16b}, v2.16b4094eor v0.16b, v0.16b, v2.16b40954096mov w7,v0.s[0]4097eor w6,w7,w7,ror #32-24098eor w6,w6,w7,ror #32-104099eor w6,w6,w7,ror #32-184100eor w6,w6,w7,ror #32-244101eor w15,w15,w64102subs w11,w11,#14103b.ne 10b4104mov v4.s[0],w154105mov v4.s[1],w144106mov v4.s[2],w134107mov v4.s[3],w124108#ifndef __AARCH64EB__4109rev32 v4.16b,v4.16b4110#endif4111eor v4.16b, v4.16b, v16.16b4112st1 {v4.4s},[x1],#164113// save the last tweak4114mov v25.16b,v16.16b4115b 100f41161: // process last 2 blocks4117cmp x2,#24118b.gt 1f4119ld1 {v4.4s,v5.4s},[x0],#324120eor v4.16b, v4.16b, v16.16b4121eor v5.16b, v5.16b, v17.16b4122#ifndef __AARCH64EB__4123rev32 v4.16b,v4.16b4124#endif4125#ifndef __AARCH64EB__4126rev32 v5.16b,v5.16b4127#endif4128zip1 v0.4s,v4.4s,v5.4s4129zip2 v1.4s,v4.4s,v5.4s4130zip1 v2.4s,v6.4s,v7.4s4131zip2 v3.4s,v6.4s,v7.4s4132zip1 v4.2d,v0.2d,v2.2d4133zip2 v5.2d,v0.2d,v2.2d4134zip1 v6.2d,v1.2d,v3.2d4135zip2 v7.2d,v1.2d,v3.2d4136bl _vpsm4_ex_enc_4blks4137zip1 v4.4s,v0.4s,v1.4s4138zip2 v5.4s,v0.4s,v1.4s4139zip1 v6.4s,v2.4s,v3.4s4140zip2 v7.4s,v2.4s,v3.4s4141zip1 v0.2d,v4.2d,v6.2d4142zip2 v1.2d,v4.2d,v6.2d4143zip1 v2.2d,v5.2d,v7.2d4144zip2 v3.2d,v5.2d,v7.2d4145eor v0.16b, v0.16b, v16.16b4146eor v1.16b, v1.16b, v17.16b4147st1 {v0.4s,v1.4s},[x1],#324148// save the last tweak4149mov v25.16b,v17.16b4150b 100f41511: // process last 3 blocks4152ld1 {v4.4s,v5.4s,v6.4s},[x0],#484153eor v4.16b, v4.16b, v16.16b4154eor v5.16b, v5.16b, v17.16b4155eor v6.16b, v6.16b, 
v18.16b4156#ifndef __AARCH64EB__4157rev32 v4.16b,v4.16b4158#endif4159#ifndef __AARCH64EB__4160rev32 v5.16b,v5.16b4161#endif4162#ifndef __AARCH64EB__4163rev32 v6.16b,v6.16b4164#endif4165zip1 v0.4s,v4.4s,v5.4s4166zip2 v1.4s,v4.4s,v5.4s4167zip1 v2.4s,v6.4s,v7.4s4168zip2 v3.4s,v6.4s,v7.4s4169zip1 v4.2d,v0.2d,v2.2d4170zip2 v5.2d,v0.2d,v2.2d4171zip1 v6.2d,v1.2d,v3.2d4172zip2 v7.2d,v1.2d,v3.2d4173bl _vpsm4_ex_enc_4blks4174zip1 v4.4s,v0.4s,v1.4s4175zip2 v5.4s,v0.4s,v1.4s4176zip1 v6.4s,v2.4s,v3.4s4177zip2 v7.4s,v2.4s,v3.4s4178zip1 v0.2d,v4.2d,v6.2d4179zip2 v1.2d,v4.2d,v6.2d4180zip1 v2.2d,v5.2d,v7.2d4181zip2 v3.2d,v5.2d,v7.2d4182eor v0.16b, v0.16b, v16.16b4183eor v1.16b, v1.16b, v17.16b4184eor v2.16b, v2.16b, v18.16b4185st1 {v0.4s,v1.4s,v2.4s},[x1],#484186// save the last tweak4187mov v25.16b,v18.16b4188100:4189cmp x29,04190b.eq .return41914192// This branch calculates the last two tweaks,4193// while the encryption/decryption length is larger than 324194.last_2blks_tweak:4195#ifdef __AARCH64EB__4196rev32 v25.16b,v25.16b4197#endif4198mov v2.16b,v25.16b4199adrp x9, .Lxts_magic4200ldr q0, [x9, #:lo12:.Lxts_magic]4201shl v17.16b, v2.16b, #14202ext v1.16b, v2.16b, v2.16b,#154203ushr v1.16b, v1.16b, #74204mul v1.16b, v1.16b, v0.16b4205eor v17.16b, v17.16b, v1.16b4206mov v2.16b,v17.16b4207adrp x9, .Lxts_magic4208ldr q0, [x9, #:lo12:.Lxts_magic]4209shl v18.16b, v2.16b, #14210ext v1.16b, v2.16b, v2.16b,#154211ushr v1.16b, v1.16b, #74212mul v1.16b, v1.16b, v0.16b4213eor v18.16b, v18.16b, v1.16b4214b .check_dec421542164217// This branch calculates the last two tweaks,4218// while the encryption/decryption length is equal to 32, who only need two tweaks4219.only_2blks_tweak:4220mov v17.16b,v16.16b4221#ifdef __AARCH64EB__4222rev32 v17.16b,v17.16b4223#endif4224mov v2.16b,v17.16b4225adrp x9, .Lxts_magic4226ldr q0, [x9, #:lo12:.Lxts_magic]4227shl v18.16b, v2.16b, #14228ext v1.16b, v2.16b, v2.16b,#154229ushr v1.16b, v1.16b, #74230mul v1.16b, v1.16b, v0.16b4231eor v18.16b, v18.16b, 
v1.16b4232b .check_dec423342344235// Determine whether encryption or decryption is required.4236// The last two tweaks need to be swapped for decryption.4237.check_dec:4238// encryption:1 decryption:04239cmp w28,14240b.eq .process_last_2blks4241mov v0.16B,v17.16b4242mov v17.16B,v18.16b4243mov v18.16B,v0.16b42444245.process_last_2blks:4246#ifdef __AARCH64EB__4247rev32 v17.16b,v17.16b4248#endif4249#ifdef __AARCH64EB__4250rev32 v18.16b,v18.16b4251#endif4252ld1 {v4.4s},[x0],#164253eor v4.16b, v4.16b, v17.16b4254#ifndef __AARCH64EB__4255rev32 v4.16b,v4.16b4256#endif4257mov x10,x34258mov w11,#84259mov w12,v4.s[0]4260mov w13,v4.s[1]4261mov w14,v4.s[2]4262mov w15,v4.s[3]426310:4264ldp w7,w8,[x10],84265// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)4266eor w6,w14,w154267eor w9,w7,w134268eor w6,w6,w94269mov v3.s[0],w64270// optimize sbox using AESE instruction4271tbl v0.16b, {v3.16b}, v26.16b4272ushr v2.16b, v0.16b, 44273and v0.16b, v0.16b, v31.16b4274tbl v0.16b, {v28.16b}, v0.16b4275tbl v2.16b, {v27.16b}, v2.16b4276eor v0.16b, v0.16b, v2.16b4277eor v1.16b, v1.16b, v1.16b4278aese v0.16b,v1.16b4279ushr v2.16b, v0.16b, 44280and v0.16b, v0.16b, v31.16b4281tbl v0.16b, {v30.16b}, v0.16b4282tbl v2.16b, {v29.16b}, v2.16b4283eor v0.16b, v0.16b, v2.16b42844285mov w7,v0.s[0]4286eor w6,w7,w7,ror #32-24287eor w6,w6,w7,ror #32-104288eor w6,w6,w7,ror #32-184289eor w6,w6,w7,ror #32-244290eor w12,w12,w64291// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4292eor w6,w14,w154293eor w9,w12,w84294eor w6,w6,w94295mov v3.s[0],w64296// optimize sbox using AESE instruction4297tbl v0.16b, {v3.16b}, v26.16b4298ushr v2.16b, v0.16b, 44299and v0.16b, v0.16b, v31.16b4300tbl v0.16b, {v28.16b}, v0.16b4301tbl v2.16b, {v27.16b}, v2.16b4302eor v0.16b, v0.16b, v2.16b4303eor v1.16b, v1.16b, v1.16b4304aese v0.16b,v1.16b4305ushr v2.16b, v0.16b, 44306and v0.16b, v0.16b, v31.16b4307tbl v0.16b, {v30.16b}, v0.16b4308tbl v2.16b, {v29.16b}, v2.16b4309eor v0.16b, v0.16b, v2.16b43104311mov w7,v0.s[0]4312eor w6,w7,w7,ror #32-24313eor w6,w6,w7,ror 
#32-104314eor w6,w6,w7,ror #32-184315eor w6,w6,w7,ror #32-244316ldp w7,w8,[x10],84317eor w13,w13,w64318// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4319eor w6,w12,w134320eor w9,w7,w154321eor w6,w6,w94322mov v3.s[0],w64323// optimize sbox using AESE instruction4324tbl v0.16b, {v3.16b}, v26.16b4325ushr v2.16b, v0.16b, 44326and v0.16b, v0.16b, v31.16b4327tbl v0.16b, {v28.16b}, v0.16b4328tbl v2.16b, {v27.16b}, v2.16b4329eor v0.16b, v0.16b, v2.16b4330eor v1.16b, v1.16b, v1.16b4331aese v0.16b,v1.16b4332ushr v2.16b, v0.16b, 44333and v0.16b, v0.16b, v31.16b4334tbl v0.16b, {v30.16b}, v0.16b4335tbl v2.16b, {v29.16b}, v2.16b4336eor v0.16b, v0.16b, v2.16b43374338mov w7,v0.s[0]4339eor w6,w7,w7,ror #32-24340eor w6,w6,w7,ror #32-104341eor w6,w6,w7,ror #32-184342eor w6,w6,w7,ror #32-244343eor w14,w14,w64344// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4345eor w6,w12,w134346eor w9,w14,w84347eor w6,w6,w94348mov v3.s[0],w64349// optimize sbox using AESE instruction4350tbl v0.16b, {v3.16b}, v26.16b4351ushr v2.16b, v0.16b, 44352and v0.16b, v0.16b, v31.16b4353tbl v0.16b, {v28.16b}, v0.16b4354tbl v2.16b, {v27.16b}, v2.16b4355eor v0.16b, v0.16b, v2.16b4356eor v1.16b, v1.16b, v1.16b4357aese v0.16b,v1.16b4358ushr v2.16b, v0.16b, 44359and v0.16b, v0.16b, v31.16b4360tbl v0.16b, {v30.16b}, v0.16b4361tbl v2.16b, {v29.16b}, v2.16b4362eor v0.16b, v0.16b, v2.16b43634364mov w7,v0.s[0]4365eor w6,w7,w7,ror #32-24366eor w6,w6,w7,ror #32-104367eor w6,w6,w7,ror #32-184368eor w6,w6,w7,ror #32-244369eor w15,w15,w64370subs w11,w11,#14371b.ne 10b4372mov v4.s[0],w154373mov v4.s[1],w144374mov v4.s[2],w134375mov v4.s[3],w124376#ifndef __AARCH64EB__4377rev32 v4.16b,v4.16b4378#endif4379eor v4.16b, v4.16b, v17.16b4380st1 {v4.4s},[x1],#1643814382sub x26,x1,164383.loop:4384subs x29,x29,14385ldrb w7,[x26,x29]4386ldrb w8,[x0,x29]4387strb w8,[x26,x29]4388strb w7,[x1,x29]4389b.gt .loop4390ld1 {v4.4s}, [x26]4391eor v4.16b, v4.16b, v18.16b4392#ifndef __AARCH64EB__4393rev32 v4.16b,v4.16b4394#endif4395mov x10,x34396mov w11,#84397mov 
w12,v4.s[0]4398mov w13,v4.s[1]4399mov w14,v4.s[2]4400mov w15,v4.s[3]440110:4402ldp w7,w8,[x10],84403// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)4404eor w6,w14,w154405eor w9,w7,w134406eor w6,w6,w94407mov v3.s[0],w64408// optimize sbox using AESE instruction4409tbl v0.16b, {v3.16b}, v26.16b4410ushr v2.16b, v0.16b, 44411and v0.16b, v0.16b, v31.16b4412tbl v0.16b, {v28.16b}, v0.16b4413tbl v2.16b, {v27.16b}, v2.16b4414eor v0.16b, v0.16b, v2.16b4415eor v1.16b, v1.16b, v1.16b4416aese v0.16b,v1.16b4417ushr v2.16b, v0.16b, 44418and v0.16b, v0.16b, v31.16b4419tbl v0.16b, {v30.16b}, v0.16b4420tbl v2.16b, {v29.16b}, v2.16b4421eor v0.16b, v0.16b, v2.16b44224423mov w7,v0.s[0]4424eor w6,w7,w7,ror #32-24425eor w6,w6,w7,ror #32-104426eor w6,w6,w7,ror #32-184427eor w6,w6,w7,ror #32-244428eor w12,w12,w64429// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4430eor w6,w14,w154431eor w9,w12,w84432eor w6,w6,w94433mov v3.s[0],w64434// optimize sbox using AESE instruction4435tbl v0.16b, {v3.16b}, v26.16b4436ushr v2.16b, v0.16b, 44437and v0.16b, v0.16b, v31.16b4438tbl v0.16b, {v28.16b}, v0.16b4439tbl v2.16b, {v27.16b}, v2.16b4440eor v0.16b, v0.16b, v2.16b4441eor v1.16b, v1.16b, v1.16b4442aese v0.16b,v1.16b4443ushr v2.16b, v0.16b, 44444and v0.16b, v0.16b, v31.16b4445tbl v0.16b, {v30.16b}, v0.16b4446tbl v2.16b, {v29.16b}, v2.16b4447eor v0.16b, v0.16b, v2.16b44484449mov w7,v0.s[0]4450eor w6,w7,w7,ror #32-24451eor w6,w6,w7,ror #32-104452eor w6,w6,w7,ror #32-184453eor w6,w6,w7,ror #32-244454ldp w7,w8,[x10],84455eor w13,w13,w64456// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4457eor w6,w12,w134458eor w9,w7,w154459eor w6,w6,w94460mov v3.s[0],w64461// optimize sbox using AESE instruction4462tbl v0.16b, {v3.16b}, v26.16b4463ushr v2.16b, v0.16b, 44464and v0.16b, v0.16b, v31.16b4465tbl v0.16b, {v28.16b}, v0.16b4466tbl v2.16b, {v27.16b}, v2.16b4467eor v0.16b, v0.16b, v2.16b4468eor v1.16b, v1.16b, v1.16b4469aese v0.16b,v1.16b4470ushr v2.16b, v0.16b, 44471and v0.16b, v0.16b, v31.16b4472tbl v0.16b, {v30.16b}, v0.16b4473tbl v2.16b, {v29.16b}, 
v2.16b4474eor v0.16b, v0.16b, v2.16b44754476mov w7,v0.s[0]4477eor w6,w7,w7,ror #32-24478eor w6,w6,w7,ror #32-104479eor w6,w6,w7,ror #32-184480eor w6,w6,w7,ror #32-244481eor w14,w14,w64482// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4483eor w6,w12,w134484eor w9,w14,w84485eor w6,w6,w94486mov v3.s[0],w64487// optimize sbox using AESE instruction4488tbl v0.16b, {v3.16b}, v26.16b4489ushr v2.16b, v0.16b, 44490and v0.16b, v0.16b, v31.16b4491tbl v0.16b, {v28.16b}, v0.16b4492tbl v2.16b, {v27.16b}, v2.16b4493eor v0.16b, v0.16b, v2.16b4494eor v1.16b, v1.16b, v1.16b4495aese v0.16b,v1.16b4496ushr v2.16b, v0.16b, 44497and v0.16b, v0.16b, v31.16b4498tbl v0.16b, {v30.16b}, v0.16b4499tbl v2.16b, {v29.16b}, v2.16b4500eor v0.16b, v0.16b, v2.16b45014502mov w7,v0.s[0]4503eor w6,w7,w7,ror #32-24504eor w6,w6,w7,ror #32-104505eor w6,w6,w7,ror #32-184506eor w6,w6,w7,ror #32-244507eor w15,w15,w64508subs w11,w11,#14509b.ne 10b4510mov v4.s[0],w154511mov v4.s[1],w144512mov v4.s[2],w134513mov v4.s[3],w124514#ifndef __AARCH64EB__4515rev32 v4.16b,v4.16b4516#endif4517eor v4.16b, v4.16b, v18.16b4518st1 {v4.4s}, [x26]4519.return:4520ldp d14, d15, [sp], #0x104521ldp d12, d13, [sp], #0x104522ldp d10, d11, [sp], #0x104523ldp d8, d9, [sp], #0x104524ldp x29, x30, [sp], #0x104525ldp x27, x28, [sp], #0x104526ldp x25, x26, [sp], #0x104527ldp x23, x24, [sp], #0x104528ldp x21, x22, [sp], #0x104529ldp x19, x20, [sp], #0x104530ldp x17, x18, [sp], #0x104531ldp x15, x16, [sp], #0x104532AARCH64_VALIDATE_LINK_REGISTER4533ret4534.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt453545364537