// Path: blob/main/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
// (web-viewer residue, not part of the source file: "39536 views")
/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file

// NOTE(review): this copy was restored from a line-collapsed listing in
// which viewer line numbers had been fused into the tokens. The instruction
// stream is meant to be the same as the listing, instruction for
// instruction; the repeated S-box / linear-transform / single-block
// sequences are expressed as assembler macros. Diff the assembled object
// against one built from vpsm4_ex-armv8.pl before relying on it.
//
// Syntax: GNU as, AArch64, preprocessed by cpp (.S).
// Register roles shared by all routines in this file:
//   v26-v31  S-box helper constants loaded from .Lsbox_magic (v31 = 0x0f mask)
//   x3       pointer to the 32 round-key words (x10 is the walking copy)
//   w11      loop counter: 8 iterations x 4 rounds = 32 rounds
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_vpsm4_ex_consts,%object
.align	7
_vpsm4_ex_consts:
// 32 words, consumed one per key-schedule iteration by _vpsm4_ex_set_key.
.Lck:
.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
// XORed into the user key before the key schedule starts.
.Lfk:
.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
// tbl index vector: rotates the four 32-bit lanes by one position.
.Lshuffles:
.quad	0x0B0A090807060504,0x030201000F0E0D0C
// Not referenced by the routines visible in this part of the file.
.Lxts_magic:
.quad	0x0101010101010187,0x0101010101010101
// Six 16-byte rows, loaded into v26..v31 in this order.
//   row 0 (v26): byte permutation applied before AESE
//   rows 1-2 (v27/v28): high/low nibble tables applied before AESE
//   rows 3-4 (v29/v30): high/low nibble tables applied after AESE
//   row 5 (v31): 0x0f nibble mask
.Lsbox_magic:
.quad	0x0b0e0104070a0d00,0x0306090c0f020508
.quad	0x62185a2042387a00,0x22581a6002783a40
.quad	0x15df62a89e54e923,0xc10bb67c4a803df7
.quad	0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad	0x6404462679195b3b,0xe383c1a1fe9edcbc
.quad	0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_vpsm4_ex_consts,.-_vpsm4_ex_consts

//----------------------------------------------------------------------
// Helper macros. They emit exactly the instruction sequences that the
// listing repeated inline at every use site.
//----------------------------------------------------------------------

// Load the six .Lsbox_magic rows into v26..v31. Clobbers x9.
.macro	sm4_load_sbox_consts
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
.endm

// Byte-swap every 32-bit lane of \reg in place on little-endian builds.
// Emits nothing on big-endian builds.
.macro	rev32_le	reg
#ifndef __AARCH64EB__
	rev32	\reg\().16b,\reg\().16b
#endif
.endm

// \dst = \src with every 32-bit lane byte-swapped (little-endian builds),
// or a plain register copy (big-endian builds).
.macro	rev32_le_to	dst, src
#ifndef __AARCH64EB__
	rev32	\dst\().16b,\src\().16b
#else
	mov	\dst\().16b,\src\().16b
#endif
.endm

// v0 = SBOX(\src), byte-wise over all 16 bytes. Clobbers v1, v2.
// Structure: permute bytes (v26), nibble-table transform (v27/v28),
// AESE with an all-zero key, nibble-table transform (v29/v30).
// NOTE(review): the v26 permutation presumably cancels the ShiftRows
// step that AESE performs, leaving only SubBytes - confirm against the
// generator script.
.macro	sm4_sbox_1w	src
	tbl	v0.16b, {\src\().16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b		// zero round key for AESE
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
.endm

// Scalar round function: w6 = L(SBOX(w6)) where
// L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
// (ror #32-n is rol n). Clobbers w7, v0-v3.
.macro	sm4_round_1w
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	sm4_sbox_1w	v3

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
.endm

// Run all 32 rounds on the single block held in \dat (four 32-bit words,
// already in host word order) using the round keys at x3. The result is
// written back to \dat with the word order reversed (lane0 = B3 ...
// lane3 = B0). Clobbers x10, w6-w9, w11-w15, v0-v3, flags.
.macro	sm4_enc_1blk	dat
	mov	x10,x3
	mov	w11,#8
	mov	w12,\dat\().s[0]
	mov	w13,\dat\().s[1]
	mov	w14,\dat\().s[2]
	mov	w15,\dat\().s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	sm4_round_1w
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	sm4_round_1w
	ldp	w7,w8,[x10],8			// fetch RK2/RK3 (w7 was scratch above)
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	sm4_round_1w
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	sm4_round_1w
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	\dat\().s[0],w15
	mov	\dat\().s[1],w14
	mov	\dat\().s[2],w13
	mov	\dat\().s[3],w12
.endm

// Four-lane round function: \dat = L(SBOX(\dat)) applied to each 32-bit
// lane. ushr by 32-n followed by sli n builds rol n.
// Clobbers v0-v3, v24.
.macro	sm4_sbox_lt_4blks	dat
	// optimize sbox using AESE instruction
	tbl	v0.16b, {\dat\().16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v1.16b		// zero round key for AESE
	aese	v0.16b,v1.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	mov	\dat\().16b,v0.16b

	// linear transformation
	ushr	v0.4s,\dat\().4s,32-2
	ushr	v1.4s,\dat\().4s,32-10
	ushr	v2.4s,\dat\().4s,32-18
	ushr	v3.4s,\dat\().4s,32-24
	sli	v0.4s,\dat\().4s,2
	sli	v1.4s,\dat\().4s,10
	sli	v2.4s,\dat\().4s,18
	sli	v3.4s,\dat\().4s,24
	eor	v24.16b,v0.16b,\dat\().16b
	eor	v24.16b,v24.16b,v1.16b
	eor	\dat\().16b,v2.16b,v3.16b
	eor	\dat\().16b,\dat\().16b,v24.16b
.endm

// Eight-lane round function: v12 = L(SBOX(v12)), v13 = L(SBOX(v13)).
// Clobbers v0-v3, v24, v25.
.macro	sm4_sbox_lt_8blks
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v12.16b}, v26.16b
	tbl	v1.16b, {v13.16b}, v26.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v28.16b}, v1.16b
	tbl	v24.16b, {v27.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	eor	v25.16b, v25.16b, v25.16b	// zero round key for AESE
	aese	v0.16b,v25.16b
	aese	v1.16b,v25.16b
	ushr	v24.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v0.16b, v0.16b, v24.16b
	ushr	v24.16b, v1.16b, 4
	and	v1.16b, v1.16b, v31.16b
	tbl	v1.16b, {v30.16b}, v1.16b
	tbl	v24.16b, {v29.16b}, v24.16b
	eor	v1.16b, v1.16b, v24.16b
	mov	v12.16b,v0.16b
	mov	v13.16b,v1.16b

	// linear transformation
	ushr	v0.4s,v12.4s,32-2
	ushr	v25.4s,v13.4s,32-2
	ushr	v1.4s,v12.4s,32-10
	ushr	v2.4s,v12.4s,32-18
	ushr	v3.4s,v12.4s,32-24
	sli	v0.4s,v12.4s,2
	sli	v25.4s,v13.4s,2
	sli	v1.4s,v12.4s,10
	sli	v2.4s,v12.4s,18
	sli	v3.4s,v12.4s,24
	eor	v24.16b,v0.16b,v12.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v12.16b,v2.16b,v3.16b
	eor	v12.16b,v12.16b,v24.16b
	ushr	v1.4s,v13.4s,32-10
	ushr	v2.4s,v13.4s,32-18
	ushr	v3.4s,v13.4s,32-24
	sli	v1.4s,v13.4s,10
	sli	v2.4s,v13.4s,18
	sli	v3.4s,v13.4s,24
	eor	v24.16b,v25.16b,v13.16b
	eor	v24.16b,v24.16b,v1.16b
	eor	v13.16b,v2.16b,v3.16b
	eor	v13.16b,v13.16b,v24.16b
.endm

//----------------------------------------------------------------------
// _vpsm4_ex_set_key - expand a 16-byte key into 32 round-key words.
// In:    x0 = pointer to the 16-byte user key
//        x1 = pointer to the 32-word round-key buffer
//        w2 = nonzero: store words in ascending order starting at x1
//             zero:    store words in descending order starting at x1+124
// Out:   round keys written; x1 is advanced/retreated by the stores
// Clobb: x5-x9, v0-v2, v4-v7, v26-v31, flags
//----------------------------------------------------------------------
.type	_vpsm4_ex_set_key,%function
.align	4
_vpsm4_ex_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]
	sm4_load_sbox_consts
	rev32_le	v5
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b		// K = key ^ FK
	mov	x6,#32				// 32 round keys to produce
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck
	movi	v0.16b,#64			// NOTE(review): v0 is overwritten before any read below
	cbnz	w2,1f
	add	x1,x1,124			// decrypt order: start at the last word
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4			// next CK word
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7			// w8 = K1 ^ K2 ^ K3 ^ CK
	// optimize sbox using AESE instruction
	mov	v4.s[0],w8
	sm4_sbox_1w	v4
	mov	w7,v0.s[0]
	eor	w8,w7,w7,ror #19		// x ^ rol(x,13)
	eor	w8,w8,w7,ror #9			//   ^ rol(x,23)
	mov	w7,v5.s[0]
	eor	w8,w8,w7
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4
	b	3f
2:
	str	w8,[x1],#-4
3:
	tbl	v5.16b,{v5.16b},v7.16b		// rotate lanes so the new word moves to lane 3
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_ex_set_key,.-_vpsm4_ex_set_key

//----------------------------------------------------------------------
// _vpsm4_ex_enc_4blks - 32 rounds over four blocks in parallel.
// In:    v4..v7 = word 0..3 of each of four blocks, one block per lane
//                 (callers in this file load them with ld4 and rev32)
//        x3     = pointer to the 32 round-key words
//        v26-v31 = S-box constants (must already be loaded)
// Out:   v0..v3 = v7,v6,v5,v4 after the rounds, byte-swapped per word on
//                 little-endian builds (ready for st4)
// Clobb: x10, w7, w8, w11, v0-v7, v12-v14, v24, flags
//----------------------------------------------------------------------
.type	_vpsm4_ex_enc_4blks,%function
.align	4
_vpsm4_ex_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	sm4_sbox_lt_4blks	v12
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	sm4_sbox_lt_4blks	v13
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	sm4_sbox_lt_4blks	v12
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	sm4_sbox_lt_4blks	v13
	eor	v7.16b,v7.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// Emit the words in reverse order: v0..v3 = B3..B0.
	rev32_le_to	v3, v4
	rev32_le_to	v2, v5
	rev32_le_to	v1, v6
	rev32_le_to	v0, v7
	ret
.size	_vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks

//----------------------------------------------------------------------
// _vpsm4_ex_enc_8blks - 32 rounds over eight blocks in parallel.
// In:    v4..v7  = word 0..3 of blocks 0-3, one block per lane
//        v8..v11 = word 0..3 of blocks 4-7, one block per lane
//        x3      = pointer to the 32 round-key words
//        v26-v31 = S-box constants (must already be loaded)
// Out:   v0..v3 = v7,v6,v5,v4   after the rounds (blocks 0-3)
//        v4..v7 = v11,v10,v9,v8 after the rounds (blocks 4-7)
//        each byte-swapped per word on little-endian builds
// Clobb: x10, w7, w8, w11, v0-v15, v24, v25, flags
//        (v8-v15 are written here; vpsm4_ex_ecb_encrypt saves d8-d15
//        before calling)
//----------------------------------------------------------------------
.type	_vpsm4_ex_enc_8blks,%function
.align	4
_vpsm4_ex_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b
	eor	v15.16b,v10.16b,v11.16b
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	sm4_sbox_lt_8blks
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	sm4_sbox_lt_8blks
	ldp	w7,w8,[x10],8
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b
	eor	v15.16b,v8.16b,v9.16b
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	sm4_sbox_lt_8blks
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	sm4_sbox_lt_8blks
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// Order matters: v4..v7 are read into v3..v0 before being
	// overwritten with the second group's results.
	rev32_le_to	v3, v4
	rev32_le_to	v2, v5
	rev32_le_to	v1, v6
	rev32_le_to	v0, v7
	rev32_le_to	v7, v8
	rev32_le_to	v6, v9
	rev32_le_to	v5, v10
	rev32_le_to	v4, v11
	ret
.size	_vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks

//----------------------------------------------------------------------
// vpsm4_ex_set_encrypt_key
// In:    x0 = pointer to the 16-byte user key
//        x1 = pointer to the round-key buffer
// Calls _vpsm4_ex_set_key with w2 = 1 (ascending round-key order).
//----------------------------------------------------------------------
.globl	vpsm4_ex_set_encrypt_key
.type	vpsm4_ex_set_encrypt_key,%function
.align	5
vpsm4_ex_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key

//----------------------------------------------------------------------
// vpsm4_ex_set_decrypt_key
// In:    x0 = pointer to the 16-byte user key
//        x1 = pointer to the round-key buffer
// Calls _vpsm4_ex_set_key with w2 = 0 (descending round-key order).
//----------------------------------------------------------------------
.globl	vpsm4_ex_set_decrypt_key
.type	vpsm4_ex_set_decrypt_key,%function
.align	5
vpsm4_ex_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_ex_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key

//----------------------------------------------------------------------
// vpsm4_ex_encrypt - process one 16-byte block.
// In:    x0 = pointer to the input block
//        x1 = pointer to the output block
//        x2 = pointer to the 32 round-key words
// Clobb: x3, x9, x10, w6-w8, w11-w15, v0-v4, v26-v31, flags
//----------------------------------------------------------------------
.globl	vpsm4_ex_encrypt
.type	vpsm4_ex_encrypt,%function
.align	5
vpsm4_ex_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	sm4_load_sbox_consts
	rev32_le	v4
	mov	x3,x2				// sm4_enc_1blk reads the keys via x3
	sm4_enc_1blk	v4
	rev32_le	v4
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_encrypt,.-vpsm4_ex_encrypt

//----------------------------------------------------------------------
// vpsm4_ex_decrypt - process one 16-byte block.
// Same instruction sequence as vpsm4_ex_encrypt; the direction comes
// from the order of the round keys at x2 (see vpsm4_ex_set_decrypt_key).
// In:    x0 = pointer to the input block
//        x1 = pointer to the output block
//        x2 = pointer to the 32 round-key words
// Clobb: x3, x9, x10, w6-w8, w11-w15, v0-v4, v26-v31, flags
//----------------------------------------------------------------------
.globl	vpsm4_ex_decrypt
.type	vpsm4_ex_decrypt,%function
.align	5
vpsm4_ex_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	sm4_load_sbox_consts
	rev32_le	v4
	mov	x3,x2				// sm4_enc_1blk reads the keys via x3
	sm4_enc_1blk	v4
	rev32_le	v4
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_ex_decrypt,.-vpsm4_ex_decrypt

//----------------------------------------------------------------------
// vpsm4_ex_ecb_encrypt - process consecutive independent 16-byte blocks.
// In:    x0 = input pointer
//        x1 = output pointer
//        x2 = length in bytes (shifted right by 4; a trailing partial
//             block is not processed)
//        x3 = pointer to the 32 round-key words
// No direction flag is read here; the round-key order decides.
// Stack: 80 bytes (d8-d15, x29, x30).
// NOTE(review): the block count is compared as a 32-bit value (w2), so
// counts of 2^32 blocks or more are not handled - confirm callers never
// pass such lengths.
//----------------------------------------------------------------------
.globl	vpsm4_ex_ecb_encrypt
.type	vpsm4_ex_ecb_encrypt,%function
.align	5
vpsm4_ex_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	sm4_load_sbox_consts
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	// ld4 de-interleaves: v4 = word 0 of four blocks, v5 = word 1, ...
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rev32_le	v4
	rev32_le	v5
	rev32_le	v6
	rev32_le	v7
	rev32_le	v8
	rev32_le	v9
	rev32_le	v10
	rev32_le	v11
	bl	_vpsm4_ex_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rev32_le	v4
	rev32_le	v5
	rev32_le	v6
	rev32_le	v7
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0]
	rev32_le	v4
	sm4_enc_1blk	v4
	rev32_le	v4
	st1	{v4.4s},[x1]
	b	100f
1:	// process last 2 blocks
	// Lanes 0 and 1 are loaded for both the 2- and 3-block cases; the
	// 4-block routine is then run with the unused lanes left as they are.
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
	rev32_le	v4
	rev32_le	v5
	rev32_le	v6
	rev32_le	v7
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f
1:	// process last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
	rev32_le	v4
	rev32_le	v5
	rev32_le	v6
	rev32_le	v7
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt

// NOTE(review): only the opening lines of vpsm4_ex_cbc_encrypt fall inside
// this restored span. They are line-split verbatim; the operands of the
// final rev32 and the rest of the function follow in the unrestored text.
.globl	vpsm4_ex_cbc_encrypt
.type	vpsm4_ex_cbc_encrypt,%function
.align	5
vpsm4_ex_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
	cbz	w5,.Ldec
	ld1	{v3.4s},[x4]
.Lcbc_4_blocks_enc:
	cmp	w2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 
v7.16b,v7.16b1192#endif1193mov x10,x31194mov w11,#81195mov w12,v4.s[0]1196mov w13,v4.s[1]1197mov w14,v4.s[2]1198mov w15,v4.s[3]119910:1200ldp w7,w8,[x10],81201// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1202eor w6,w14,w151203eor w9,w7,w131204eor w6,w6,w91205mov v3.s[0],w61206// optimize sbox using AESE instruction1207tbl v0.16b, {v3.16b}, v26.16b1208ushr v2.16b, v0.16b, 41209and v0.16b, v0.16b, v31.16b1210tbl v0.16b, {v28.16b}, v0.16b1211tbl v2.16b, {v27.16b}, v2.16b1212eor v0.16b, v0.16b, v2.16b1213eor v1.16b, v1.16b, v1.16b1214aese v0.16b,v1.16b1215ushr v2.16b, v0.16b, 41216and v0.16b, v0.16b, v31.16b1217tbl v0.16b, {v30.16b}, v0.16b1218tbl v2.16b, {v29.16b}, v2.16b1219eor v0.16b, v0.16b, v2.16b12201221mov w7,v0.s[0]1222eor w6,w7,w7,ror #32-21223eor w6,w6,w7,ror #32-101224eor w6,w6,w7,ror #32-181225eor w6,w6,w7,ror #32-241226eor w12,w12,w61227// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1228eor w6,w14,w151229eor w9,w12,w81230eor w6,w6,w91231mov v3.s[0],w61232// optimize sbox using AESE instruction1233tbl v0.16b, {v3.16b}, v26.16b1234ushr v2.16b, v0.16b, 41235and v0.16b, v0.16b, v31.16b1236tbl v0.16b, {v28.16b}, v0.16b1237tbl v2.16b, {v27.16b}, v2.16b1238eor v0.16b, v0.16b, v2.16b1239eor v1.16b, v1.16b, v1.16b1240aese v0.16b,v1.16b1241ushr v2.16b, v0.16b, 41242and v0.16b, v0.16b, v31.16b1243tbl v0.16b, {v30.16b}, v0.16b1244tbl v2.16b, {v29.16b}, v2.16b1245eor v0.16b, v0.16b, v2.16b12461247mov w7,v0.s[0]1248eor w6,w7,w7,ror #32-21249eor w6,w6,w7,ror #32-101250eor w6,w6,w7,ror #32-181251eor w6,w6,w7,ror #32-241252ldp w7,w8,[x10],81253eor w13,w13,w61254// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1255eor w6,w12,w131256eor w9,w7,w151257eor w6,w6,w91258mov v3.s[0],w61259// optimize sbox using AESE instruction1260tbl v0.16b, {v3.16b}, v26.16b1261ushr v2.16b, v0.16b, 41262and v0.16b, v0.16b, v31.16b1263tbl v0.16b, {v28.16b}, v0.16b1264tbl v2.16b, {v27.16b}, v2.16b1265eor v0.16b, v0.16b, v2.16b1266eor v1.16b, v1.16b, v1.16b1267aese v0.16b,v1.16b1268ushr v2.16b, v0.16b, 41269and v0.16b, v0.16b, 
v31.16b1270tbl v0.16b, {v30.16b}, v0.16b1271tbl v2.16b, {v29.16b}, v2.16b1272eor v0.16b, v0.16b, v2.16b12731274mov w7,v0.s[0]1275eor w6,w7,w7,ror #32-21276eor w6,w6,w7,ror #32-101277eor w6,w6,w7,ror #32-181278eor w6,w6,w7,ror #32-241279eor w14,w14,w61280// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1281eor w6,w12,w131282eor w9,w14,w81283eor w6,w6,w91284mov v3.s[0],w61285// optimize sbox using AESE instruction1286tbl v0.16b, {v3.16b}, v26.16b1287ushr v2.16b, v0.16b, 41288and v0.16b, v0.16b, v31.16b1289tbl v0.16b, {v28.16b}, v0.16b1290tbl v2.16b, {v27.16b}, v2.16b1291eor v0.16b, v0.16b, v2.16b1292eor v1.16b, v1.16b, v1.16b1293aese v0.16b,v1.16b1294ushr v2.16b, v0.16b, 41295and v0.16b, v0.16b, v31.16b1296tbl v0.16b, {v30.16b}, v0.16b1297tbl v2.16b, {v29.16b}, v2.16b1298eor v0.16b, v0.16b, v2.16b12991300mov w7,v0.s[0]1301eor w6,w7,w7,ror #32-21302eor w6,w6,w7,ror #32-101303eor w6,w6,w7,ror #32-181304eor w6,w6,w7,ror #32-241305eor w15,w15,w61306subs w11,w11,#11307b.ne 10b1308mov v4.s[0],w151309mov v4.s[1],w141310mov v4.s[2],w131311mov v4.s[3],w121312eor v5.16b,v5.16b,v4.16b1313mov x10,x31314mov w11,#81315mov w12,v5.s[0]1316mov w13,v5.s[1]1317mov w14,v5.s[2]1318mov w15,v5.s[3]131910:1320ldp w7,w8,[x10],81321// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1322eor w6,w14,w151323eor w9,w7,w131324eor w6,w6,w91325mov v3.s[0],w61326// optimize sbox using AESE instruction1327tbl v0.16b, {v3.16b}, v26.16b1328ushr v2.16b, v0.16b, 41329and v0.16b, v0.16b, v31.16b1330tbl v0.16b, {v28.16b}, v0.16b1331tbl v2.16b, {v27.16b}, v2.16b1332eor v0.16b, v0.16b, v2.16b1333eor v1.16b, v1.16b, v1.16b1334aese v0.16b,v1.16b1335ushr v2.16b, v0.16b, 41336and v0.16b, v0.16b, v31.16b1337tbl v0.16b, {v30.16b}, v0.16b1338tbl v2.16b, {v29.16b}, v2.16b1339eor v0.16b, v0.16b, v2.16b13401341mov w7,v0.s[0]1342eor w6,w7,w7,ror #32-21343eor w6,w6,w7,ror #32-101344eor w6,w6,w7,ror #32-181345eor w6,w6,w7,ror #32-241346eor w12,w12,w61347// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1348eor w6,w14,w151349eor w9,w12,w81350eor w6,w6,w91351mov 
v3.s[0],w61352// optimize sbox using AESE instruction1353tbl v0.16b, {v3.16b}, v26.16b1354ushr v2.16b, v0.16b, 41355and v0.16b, v0.16b, v31.16b1356tbl v0.16b, {v28.16b}, v0.16b1357tbl v2.16b, {v27.16b}, v2.16b1358eor v0.16b, v0.16b, v2.16b1359eor v1.16b, v1.16b, v1.16b1360aese v0.16b,v1.16b1361ushr v2.16b, v0.16b, 41362and v0.16b, v0.16b, v31.16b1363tbl v0.16b, {v30.16b}, v0.16b1364tbl v2.16b, {v29.16b}, v2.16b1365eor v0.16b, v0.16b, v2.16b13661367mov w7,v0.s[0]1368eor w6,w7,w7,ror #32-21369eor w6,w6,w7,ror #32-101370eor w6,w6,w7,ror #32-181371eor w6,w6,w7,ror #32-241372ldp w7,w8,[x10],81373eor w13,w13,w61374// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1375eor w6,w12,w131376eor w9,w7,w151377eor w6,w6,w91378mov v3.s[0],w61379// optimize sbox using AESE instruction1380tbl v0.16b, {v3.16b}, v26.16b1381ushr v2.16b, v0.16b, 41382and v0.16b, v0.16b, v31.16b1383tbl v0.16b, {v28.16b}, v0.16b1384tbl v2.16b, {v27.16b}, v2.16b1385eor v0.16b, v0.16b, v2.16b1386eor v1.16b, v1.16b, v1.16b1387aese v0.16b,v1.16b1388ushr v2.16b, v0.16b, 41389and v0.16b, v0.16b, v31.16b1390tbl v0.16b, {v30.16b}, v0.16b1391tbl v2.16b, {v29.16b}, v2.16b1392eor v0.16b, v0.16b, v2.16b13931394mov w7,v0.s[0]1395eor w6,w7,w7,ror #32-21396eor w6,w6,w7,ror #32-101397eor w6,w6,w7,ror #32-181398eor w6,w6,w7,ror #32-241399eor w14,w14,w61400// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1401eor w6,w12,w131402eor w9,w14,w81403eor w6,w6,w91404mov v3.s[0],w61405// optimize sbox using AESE instruction1406tbl v0.16b, {v3.16b}, v26.16b1407ushr v2.16b, v0.16b, 41408and v0.16b, v0.16b, v31.16b1409tbl v0.16b, {v28.16b}, v0.16b1410tbl v2.16b, {v27.16b}, v2.16b1411eor v0.16b, v0.16b, v2.16b1412eor v1.16b, v1.16b, v1.16b1413aese v0.16b,v1.16b1414ushr v2.16b, v0.16b, 41415and v0.16b, v0.16b, v31.16b1416tbl v0.16b, {v30.16b}, v0.16b1417tbl v2.16b, {v29.16b}, v2.16b1418eor v0.16b, v0.16b, v2.16b14191420mov w7,v0.s[0]1421eor w6,w7,w7,ror #32-21422eor w6,w6,w7,ror #32-101423eor w6,w6,w7,ror #32-181424eor w6,w6,w7,ror #32-241425eor w15,w15,w61426subs 
w11,w11,#11427b.ne 10b1428mov v5.s[0],w151429mov v5.s[1],w141430mov v5.s[2],w131431mov v5.s[3],w121432#ifndef __AARCH64EB__1433rev32 v4.16b,v4.16b1434#endif1435eor v6.16b,v6.16b,v5.16b1436mov x10,x31437mov w11,#81438mov w12,v6.s[0]1439mov w13,v6.s[1]1440mov w14,v6.s[2]1441mov w15,v6.s[3]144210:1443ldp w7,w8,[x10],81444// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1445eor w6,w14,w151446eor w9,w7,w131447eor w6,w6,w91448mov v3.s[0],w61449// optimize sbox using AESE instruction1450tbl v0.16b, {v3.16b}, v26.16b1451ushr v2.16b, v0.16b, 41452and v0.16b, v0.16b, v31.16b1453tbl v0.16b, {v28.16b}, v0.16b1454tbl v2.16b, {v27.16b}, v2.16b1455eor v0.16b, v0.16b, v2.16b1456eor v1.16b, v1.16b, v1.16b1457aese v0.16b,v1.16b1458ushr v2.16b, v0.16b, 41459and v0.16b, v0.16b, v31.16b1460tbl v0.16b, {v30.16b}, v0.16b1461tbl v2.16b, {v29.16b}, v2.16b1462eor v0.16b, v0.16b, v2.16b14631464mov w7,v0.s[0]1465eor w6,w7,w7,ror #32-21466eor w6,w6,w7,ror #32-101467eor w6,w6,w7,ror #32-181468eor w6,w6,w7,ror #32-241469eor w12,w12,w61470// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1471eor w6,w14,w151472eor w9,w12,w81473eor w6,w6,w91474mov v3.s[0],w61475// optimize sbox using AESE instruction1476tbl v0.16b, {v3.16b}, v26.16b1477ushr v2.16b, v0.16b, 41478and v0.16b, v0.16b, v31.16b1479tbl v0.16b, {v28.16b}, v0.16b1480tbl v2.16b, {v27.16b}, v2.16b1481eor v0.16b, v0.16b, v2.16b1482eor v1.16b, v1.16b, v1.16b1483aese v0.16b,v1.16b1484ushr v2.16b, v0.16b, 41485and v0.16b, v0.16b, v31.16b1486tbl v0.16b, {v30.16b}, v0.16b1487tbl v2.16b, {v29.16b}, v2.16b1488eor v0.16b, v0.16b, v2.16b14891490mov w7,v0.s[0]1491eor w6,w7,w7,ror #32-21492eor w6,w6,w7,ror #32-101493eor w6,w6,w7,ror #32-181494eor w6,w6,w7,ror #32-241495ldp w7,w8,[x10],81496eor w13,w13,w61497// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1498eor w6,w12,w131499eor w9,w7,w151500eor w6,w6,w91501mov v3.s[0],w61502// optimize sbox using AESE instruction1503tbl v0.16b, {v3.16b}, v26.16b1504ushr v2.16b, v0.16b, 41505and v0.16b, v0.16b, v31.16b1506tbl v0.16b, {v28.16b}, v0.16b1507tbl v2.16b, 
{v27.16b}, v2.16b1508eor v0.16b, v0.16b, v2.16b1509eor v1.16b, v1.16b, v1.16b1510aese v0.16b,v1.16b1511ushr v2.16b, v0.16b, 41512and v0.16b, v0.16b, v31.16b1513tbl v0.16b, {v30.16b}, v0.16b1514tbl v2.16b, {v29.16b}, v2.16b1515eor v0.16b, v0.16b, v2.16b15161517mov w7,v0.s[0]1518eor w6,w7,w7,ror #32-21519eor w6,w6,w7,ror #32-101520eor w6,w6,w7,ror #32-181521eor w6,w6,w7,ror #32-241522eor w14,w14,w61523// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1524eor w6,w12,w131525eor w9,w14,w81526eor w6,w6,w91527mov v3.s[0],w61528// optimize sbox using AESE instruction1529tbl v0.16b, {v3.16b}, v26.16b1530ushr v2.16b, v0.16b, 41531and v0.16b, v0.16b, v31.16b1532tbl v0.16b, {v28.16b}, v0.16b1533tbl v2.16b, {v27.16b}, v2.16b1534eor v0.16b, v0.16b, v2.16b1535eor v1.16b, v1.16b, v1.16b1536aese v0.16b,v1.16b1537ushr v2.16b, v0.16b, 41538and v0.16b, v0.16b, v31.16b1539tbl v0.16b, {v30.16b}, v0.16b1540tbl v2.16b, {v29.16b}, v2.16b1541eor v0.16b, v0.16b, v2.16b15421543mov w7,v0.s[0]1544eor w6,w7,w7,ror #32-21545eor w6,w6,w7,ror #32-101546eor w6,w6,w7,ror #32-181547eor w6,w6,w7,ror #32-241548eor w15,w15,w61549subs w11,w11,#11550b.ne 10b1551mov v6.s[0],w151552mov v6.s[1],w141553mov v6.s[2],w131554mov v6.s[3],w121555#ifndef __AARCH64EB__1556rev32 v5.16b,v5.16b1557#endif1558eor v7.16b,v7.16b,v6.16b1559mov x10,x31560mov w11,#81561mov w12,v7.s[0]1562mov w13,v7.s[1]1563mov w14,v7.s[2]1564mov w15,v7.s[3]156510:1566ldp w7,w8,[x10],81567// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1568eor w6,w14,w151569eor w9,w7,w131570eor w6,w6,w91571mov v3.s[0],w61572// optimize sbox using AESE instruction1573tbl v0.16b, {v3.16b}, v26.16b1574ushr v2.16b, v0.16b, 41575and v0.16b, v0.16b, v31.16b1576tbl v0.16b, {v28.16b}, v0.16b1577tbl v2.16b, {v27.16b}, v2.16b1578eor v0.16b, v0.16b, v2.16b1579eor v1.16b, v1.16b, v1.16b1580aese v0.16b,v1.16b1581ushr v2.16b, v0.16b, 41582and v0.16b, v0.16b, v31.16b1583tbl v0.16b, {v30.16b}, v0.16b1584tbl v2.16b, {v29.16b}, v2.16b1585eor v0.16b, v0.16b, v2.16b15861587mov w7,v0.s[0]1588eor w6,w7,w7,ror 
#32-21589eor w6,w6,w7,ror #32-101590eor w6,w6,w7,ror #32-181591eor w6,w6,w7,ror #32-241592eor w12,w12,w61593// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1594eor w6,w14,w151595eor w9,w12,w81596eor w6,w6,w91597mov v3.s[0],w61598// optimize sbox using AESE instruction1599tbl v0.16b, {v3.16b}, v26.16b1600ushr v2.16b, v0.16b, 41601and v0.16b, v0.16b, v31.16b1602tbl v0.16b, {v28.16b}, v0.16b1603tbl v2.16b, {v27.16b}, v2.16b1604eor v0.16b, v0.16b, v2.16b1605eor v1.16b, v1.16b, v1.16b1606aese v0.16b,v1.16b1607ushr v2.16b, v0.16b, 41608and v0.16b, v0.16b, v31.16b1609tbl v0.16b, {v30.16b}, v0.16b1610tbl v2.16b, {v29.16b}, v2.16b1611eor v0.16b, v0.16b, v2.16b16121613mov w7,v0.s[0]1614eor w6,w7,w7,ror #32-21615eor w6,w6,w7,ror #32-101616eor w6,w6,w7,ror #32-181617eor w6,w6,w7,ror #32-241618ldp w7,w8,[x10],81619eor w13,w13,w61620// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1621eor w6,w12,w131622eor w9,w7,w151623eor w6,w6,w91624mov v3.s[0],w61625// optimize sbox using AESE instruction1626tbl v0.16b, {v3.16b}, v26.16b1627ushr v2.16b, v0.16b, 41628and v0.16b, v0.16b, v31.16b1629tbl v0.16b, {v28.16b}, v0.16b1630tbl v2.16b, {v27.16b}, v2.16b1631eor v0.16b, v0.16b, v2.16b1632eor v1.16b, v1.16b, v1.16b1633aese v0.16b,v1.16b1634ushr v2.16b, v0.16b, 41635and v0.16b, v0.16b, v31.16b1636tbl v0.16b, {v30.16b}, v0.16b1637tbl v2.16b, {v29.16b}, v2.16b1638eor v0.16b, v0.16b, v2.16b16391640mov w7,v0.s[0]1641eor w6,w7,w7,ror #32-21642eor w6,w6,w7,ror #32-101643eor w6,w6,w7,ror #32-181644eor w6,w6,w7,ror #32-241645eor w14,w14,w61646// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1647eor w6,w12,w131648eor w9,w14,w81649eor w6,w6,w91650mov v3.s[0],w61651// optimize sbox using AESE instruction1652tbl v0.16b, {v3.16b}, v26.16b1653ushr v2.16b, v0.16b, 41654and v0.16b, v0.16b, v31.16b1655tbl v0.16b, {v28.16b}, v0.16b1656tbl v2.16b, {v27.16b}, v2.16b1657eor v0.16b, v0.16b, v2.16b1658eor v1.16b, v1.16b, v1.16b1659aese v0.16b,v1.16b1660ushr v2.16b, v0.16b, 41661and v0.16b, v0.16b, v31.16b1662tbl v0.16b, {v30.16b}, v0.16b1663tbl v2.16b, 
{v29.16b}, v2.16b1664eor v0.16b, v0.16b, v2.16b16651666mov w7,v0.s[0]1667eor w6,w7,w7,ror #32-21668eor w6,w6,w7,ror #32-101669eor w6,w6,w7,ror #32-181670eor w6,w6,w7,ror #32-241671eor w15,w15,w61672subs w11,w11,#11673b.ne 10b1674mov v7.s[0],w151675mov v7.s[1],w141676mov v7.s[2],w131677mov v7.s[3],w121678#ifndef __AARCH64EB__1679rev32 v6.16b,v6.16b1680#endif1681#ifndef __AARCH64EB__1682rev32 v7.16b,v7.16b1683#endif1684orr v3.16b,v7.16b,v7.16b1685st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#641686subs w2,w2,#41687b.ne .Lcbc_4_blocks_enc1688b 2f16891:1690subs w2,w2,#11691b.lt 2f1692ld1 {v4.4s},[x0],#161693eor v3.16b,v3.16b,v4.16b1694#ifndef __AARCH64EB__1695rev32 v3.16b,v3.16b1696#endif1697mov x10,x31698mov w11,#81699mov w12,v3.s[0]1700mov w13,v3.s[1]1701mov w14,v3.s[2]1702mov w15,v3.s[3]170310:1704ldp w7,w8,[x10],81705// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1706eor w6,w14,w151707eor w9,w7,w131708eor w6,w6,w91709mov v3.s[0],w61710// optimize sbox using AESE instruction1711tbl v0.16b, {v3.16b}, v26.16b1712ushr v2.16b, v0.16b, 41713and v0.16b, v0.16b, v31.16b1714tbl v0.16b, {v28.16b}, v0.16b1715tbl v2.16b, {v27.16b}, v2.16b1716eor v0.16b, v0.16b, v2.16b1717eor v1.16b, v1.16b, v1.16b1718aese v0.16b,v1.16b1719ushr v2.16b, v0.16b, 41720and v0.16b, v0.16b, v31.16b1721tbl v0.16b, {v30.16b}, v0.16b1722tbl v2.16b, {v29.16b}, v2.16b1723eor v0.16b, v0.16b, v2.16b17241725mov w7,v0.s[0]1726eor w6,w7,w7,ror #32-21727eor w6,w6,w7,ror #32-101728eor w6,w6,w7,ror #32-181729eor w6,w6,w7,ror #32-241730eor w12,w12,w61731// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1732eor w6,w14,w151733eor w9,w12,w81734eor w6,w6,w91735mov v3.s[0],w61736// optimize sbox using AESE instruction1737tbl v0.16b, {v3.16b}, v26.16b1738ushr v2.16b, v0.16b, 41739and v0.16b, v0.16b, v31.16b1740tbl v0.16b, {v28.16b}, v0.16b1741tbl v2.16b, {v27.16b}, v2.16b1742eor v0.16b, v0.16b, v2.16b1743eor v1.16b, v1.16b, v1.16b1744aese v0.16b,v1.16b1745ushr v2.16b, v0.16b, 41746and v0.16b, v0.16b, v31.16b1747tbl v0.16b, {v30.16b}, v0.16b1748tbl v2.16b, 
{v29.16b}, v2.16b1749eor v0.16b, v0.16b, v2.16b17501751mov w7,v0.s[0]1752eor w6,w7,w7,ror #32-21753eor w6,w6,w7,ror #32-101754eor w6,w6,w7,ror #32-181755eor w6,w6,w7,ror #32-241756ldp w7,w8,[x10],81757eor w13,w13,w61758// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)1759eor w6,w12,w131760eor w9,w7,w151761eor w6,w6,w91762mov v3.s[0],w61763// optimize sbox using AESE instruction1764tbl v0.16b, {v3.16b}, v26.16b1765ushr v2.16b, v0.16b, 41766and v0.16b, v0.16b, v31.16b1767tbl v0.16b, {v28.16b}, v0.16b1768tbl v2.16b, {v27.16b}, v2.16b1769eor v0.16b, v0.16b, v2.16b1770eor v1.16b, v1.16b, v1.16b1771aese v0.16b,v1.16b1772ushr v2.16b, v0.16b, 41773and v0.16b, v0.16b, v31.16b1774tbl v0.16b, {v30.16b}, v0.16b1775tbl v2.16b, {v29.16b}, v2.16b1776eor v0.16b, v0.16b, v2.16b17771778mov w7,v0.s[0]1779eor w6,w7,w7,ror #32-21780eor w6,w6,w7,ror #32-101781eor w6,w6,w7,ror #32-181782eor w6,w6,w7,ror #32-241783eor w14,w14,w61784// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)1785eor w6,w12,w131786eor w9,w14,w81787eor w6,w6,w91788mov v3.s[0],w61789// optimize sbox using AESE instruction1790tbl v0.16b, {v3.16b}, v26.16b1791ushr v2.16b, v0.16b, 41792and v0.16b, v0.16b, v31.16b1793tbl v0.16b, {v28.16b}, v0.16b1794tbl v2.16b, {v27.16b}, v2.16b1795eor v0.16b, v0.16b, v2.16b1796eor v1.16b, v1.16b, v1.16b1797aese v0.16b,v1.16b1798ushr v2.16b, v0.16b, 41799and v0.16b, v0.16b, v31.16b1800tbl v0.16b, {v30.16b}, v0.16b1801tbl v2.16b, {v29.16b}, v2.16b1802eor v0.16b, v0.16b, v2.16b18031804mov w7,v0.s[0]1805eor w6,w7,w7,ror #32-21806eor w6,w6,w7,ror #32-101807eor w6,w6,w7,ror #32-181808eor w6,w6,w7,ror #32-241809eor w15,w15,w61810subs w11,w11,#11811b.ne 10b1812mov v3.s[0],w151813mov v3.s[1],w141814mov v3.s[2],w131815mov v3.s[3],w121816#ifndef __AARCH64EB__1817rev32 v3.16b,v3.16b1818#endif1819st1 {v3.4s},[x1],#161820b 1b18212:1822// save back IV1823st1 {v3.4s},[x4]1824ret18251826.Ldec:1827// decryption mode starts1828AARCH64_SIGN_LINK_REGISTER1829stp d8,d9,[sp,#-80]!1830stp d10,d11,[sp,#16]1831stp d12,d13,[sp,#32]1832stp 
d14,d15,[sp,#48]1833stp x29,x30,[sp,#64]1834.Lcbc_8_blocks_dec:1835cmp w2,#81836b.lt 1f1837ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]1838add x10,x0,#641839ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]1840#ifndef __AARCH64EB__1841rev32 v4.16b,v4.16b1842#endif1843#ifndef __AARCH64EB__1844rev32 v5.16b,v5.16b1845#endif1846#ifndef __AARCH64EB__1847rev32 v6.16b,v6.16b1848#endif1849#ifndef __AARCH64EB__1850rev32 v7.16b,v7.16b1851#endif1852#ifndef __AARCH64EB__1853rev32 v8.16b,v8.16b1854#endif1855#ifndef __AARCH64EB__1856rev32 v9.16b,v9.16b1857#endif1858#ifndef __AARCH64EB__1859rev32 v10.16b,v10.16b1860#endif1861#ifndef __AARCH64EB__1862rev32 v11.16b,v11.16b1863#endif1864bl _vpsm4_ex_enc_8blks1865zip1 v8.4s,v0.4s,v1.4s1866zip2 v9.4s,v0.4s,v1.4s1867zip1 v10.4s,v2.4s,v3.4s1868zip2 v11.4s,v2.4s,v3.4s1869zip1 v0.2d,v8.2d,v10.2d1870zip2 v1.2d,v8.2d,v10.2d1871zip1 v2.2d,v9.2d,v11.2d1872zip2 v3.2d,v9.2d,v11.2d1873zip1 v8.4s,v4.4s,v5.4s1874zip2 v9.4s,v4.4s,v5.4s1875zip1 v10.4s,v6.4s,v7.4s1876zip2 v11.4s,v6.4s,v7.4s1877zip1 v4.2d,v8.2d,v10.2d1878zip2 v5.2d,v8.2d,v10.2d1879zip1 v6.2d,v9.2d,v11.2d1880zip2 v7.2d,v9.2d,v11.2d1881ld1 {v15.4s},[x4]1882ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#641883// note ivec1 and vtmpx[3] are reusing the same register1884// care needs to be taken to avoid conflict1885eor v0.16b,v0.16b,v15.16b1886ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#641887eor v1.16b,v1.16b,v8.16b1888eor v2.16b,v2.16b,v9.16b1889eor v3.16b,v3.16b,v10.16b1890// save back IV1891st1 {v15.4s}, [x4]1892eor v4.16b,v4.16b,v11.16b1893eor v5.16b,v5.16b,v12.16b1894eor v6.16b,v6.16b,v13.16b1895eor v7.16b,v7.16b,v14.16b1896st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#641897st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#641898subs w2,w2,#81899b.gt .Lcbc_8_blocks_dec1900b.eq 100f19011:1902ld1 {v15.4s},[x4]1903.Lcbc_4_blocks_dec:1904cmp w2,#41905b.lt 1f1906ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]1907#ifndef __AARCH64EB__1908rev32 v4.16b,v4.16b1909#endif1910#ifndef __AARCH64EB__1911rev32 v5.16b,v5.16b1912#endif1913#ifndef __AARCH64EB__1914rev32 
v6.16b,v6.16b1915#endif1916#ifndef __AARCH64EB__1917rev32 v7.16b,v7.16b1918#endif1919bl _vpsm4_ex_enc_4blks1920ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#641921zip1 v8.4s,v0.4s,v1.4s1922zip2 v9.4s,v0.4s,v1.4s1923zip1 v10.4s,v2.4s,v3.4s1924zip2 v11.4s,v2.4s,v3.4s1925zip1 v0.2d,v8.2d,v10.2d1926zip2 v1.2d,v8.2d,v10.2d1927zip1 v2.2d,v9.2d,v11.2d1928zip2 v3.2d,v9.2d,v11.2d1929eor v0.16b,v0.16b,v15.16b1930eor v1.16b,v1.16b,v4.16b1931orr v15.16b,v7.16b,v7.16b1932eor v2.16b,v2.16b,v5.16b1933eor v3.16b,v3.16b,v6.16b1934st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#641935subs w2,w2,#41936b.gt .Lcbc_4_blocks_dec1937// save back IV1938st1 {v7.4s}, [x4]1939b 100f19401: // last block1941subs w2,w2,#11942b.lt 100f1943b.gt 1f1944ld1 {v4.4s},[x0],#161945// save back IV1946st1 {v4.4s}, [x4]1947#ifndef __AARCH64EB__1948rev32 v8.16b,v4.16b1949#else1950mov v8.16b,v4.16b1951#endif1952mov x10,x31953mov w11,#81954mov w12,v8.s[0]1955mov w13,v8.s[1]1956mov w14,v8.s[2]1957mov w15,v8.s[3]195810:1959ldp w7,w8,[x10],81960// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)1961eor w6,w14,w151962eor w9,w7,w131963eor w6,w6,w91964mov v3.s[0],w61965// optimize sbox using AESE instruction1966tbl v0.16b, {v3.16b}, v26.16b1967ushr v2.16b, v0.16b, 41968and v0.16b, v0.16b, v31.16b1969tbl v0.16b, {v28.16b}, v0.16b1970tbl v2.16b, {v27.16b}, v2.16b1971eor v0.16b, v0.16b, v2.16b1972eor v1.16b, v1.16b, v1.16b1973aese v0.16b,v1.16b1974ushr v2.16b, v0.16b, 41975and v0.16b, v0.16b, v31.16b1976tbl v0.16b, {v30.16b}, v0.16b1977tbl v2.16b, {v29.16b}, v2.16b1978eor v0.16b, v0.16b, v2.16b19791980mov w7,v0.s[0]1981eor w6,w7,w7,ror #32-21982eor w6,w6,w7,ror #32-101983eor w6,w6,w7,ror #32-181984eor w6,w6,w7,ror #32-241985eor w12,w12,w61986// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)1987eor w6,w14,w151988eor w9,w12,w81989eor w6,w6,w91990mov v3.s[0],w61991// optimize sbox using AESE instruction1992tbl v0.16b, {v3.16b}, v26.16b1993ushr v2.16b, v0.16b, 41994and v0.16b, v0.16b, v31.16b1995tbl v0.16b, {v28.16b}, v0.16b1996tbl v2.16b, {v27.16b}, v2.16b1997eor v0.16b, 
v0.16b, v2.16b1998eor v1.16b, v1.16b, v1.16b1999aese v0.16b,v1.16b2000ushr v2.16b, v0.16b, 42001and v0.16b, v0.16b, v31.16b2002tbl v0.16b, {v30.16b}, v0.16b2003tbl v2.16b, {v29.16b}, v2.16b2004eor v0.16b, v0.16b, v2.16b20052006mov w7,v0.s[0]2007eor w6,w7,w7,ror #32-22008eor w6,w6,w7,ror #32-102009eor w6,w6,w7,ror #32-182010eor w6,w6,w7,ror #32-242011ldp w7,w8,[x10],82012eor w13,w13,w62013// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)2014eor w6,w12,w132015eor w9,w7,w152016eor w6,w6,w92017mov v3.s[0],w62018// optimize sbox using AESE instruction2019tbl v0.16b, {v3.16b}, v26.16b2020ushr v2.16b, v0.16b, 42021and v0.16b, v0.16b, v31.16b2022tbl v0.16b, {v28.16b}, v0.16b2023tbl v2.16b, {v27.16b}, v2.16b2024eor v0.16b, v0.16b, v2.16b2025eor v1.16b, v1.16b, v1.16b2026aese v0.16b,v1.16b2027ushr v2.16b, v0.16b, 42028and v0.16b, v0.16b, v31.16b2029tbl v0.16b, {v30.16b}, v0.16b2030tbl v2.16b, {v29.16b}, v2.16b2031eor v0.16b, v0.16b, v2.16b20322033mov w7,v0.s[0]2034eor w6,w7,w7,ror #32-22035eor w6,w6,w7,ror #32-102036eor w6,w6,w7,ror #32-182037eor w6,w6,w7,ror #32-242038eor w14,w14,w62039// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)2040eor w6,w12,w132041eor w9,w14,w82042eor w6,w6,w92043mov v3.s[0],w62044// optimize sbox using AESE instruction2045tbl v0.16b, {v3.16b}, v26.16b2046ushr v2.16b, v0.16b, 42047and v0.16b, v0.16b, v31.16b2048tbl v0.16b, {v28.16b}, v0.16b2049tbl v2.16b, {v27.16b}, v2.16b2050eor v0.16b, v0.16b, v2.16b2051eor v1.16b, v1.16b, v1.16b2052aese v0.16b,v1.16b2053ushr v2.16b, v0.16b, 42054and v0.16b, v0.16b, v31.16b2055tbl v0.16b, {v30.16b}, v0.16b2056tbl v2.16b, {v29.16b}, v2.16b2057eor v0.16b, v0.16b, v2.16b20582059mov w7,v0.s[0]2060eor w6,w7,w7,ror #32-22061eor w6,w6,w7,ror #32-102062eor w6,w6,w7,ror #32-182063eor w6,w6,w7,ror #32-242064eor w15,w15,w62065subs w11,w11,#12066b.ne 10b2067mov v8.s[0],w152068mov v8.s[1],w142069mov v8.s[2],w132070mov v8.s[3],w122071#ifndef __AARCH64EB__2072rev32 v8.16b,v8.16b2073#endif2074eor v8.16b,v8.16b,v15.16b2075st1 {v8.4s},[x1],#162076b 100f20771: // 
// last two blocks
//
// ---------------------------------------------------------------------
// Tail of vpsm4_ex_cbc_encrypt (decrypt path).  The function starts
// earlier in the file; the numeric label "1:" for this case sits on the
// preceding source line.  On arrival here w2 has already been
// decremented once by the "last block" test, and v15 holds the chaining
// value that was loaded from [x4].
// ---------------------------------------------------------------------
	// Lane 0 <- block 0, lane 1 <- block 1; ld4 de-interleaves the four
	// words of each block across v4..v7.
	ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
	add x10,x0,#16
	ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
	subs w2,w2,1
	b.gt 1f				// a third block remains
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_ex_enc_4blks
	// Reload the two ciphertext blocks in natural layout: v4 is the
	// chaining value for block 1, v5 becomes the next IV.
	ld1 {v4.4s,v5.4s},[x0],#32
	// 4x4 word transpose of v0..v3 (word-sliced -> one block per register).
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	eor v0.16b,v0.16b,v15.16b
	eor v1.16b,v1.16b,v4.16b
	st1 {v0.4s,v1.4s},[x1],#32
	// save back IV
	st1 {v5.4s}, [x4]
	b 100f
1:	// last 3 blocks
	// x10 was post-incremented to x0+32 above, i.e. it addresses block 2.
	ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_ex_enc_4blks
	ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	eor v0.16b,v0.16b,v15.16b
	eor v1.16b,v1.16b,v4.16b
	eor v2.16b,v2.16b,v5.16b
	st1 {v0.4s,v1.4s,v2.4s},[x1],#48
	// save back IV
	st1 {v6.4s}, [x4]
100:
	// Common epilogue of the decrypt path: undo the 80-byte frame.
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt

// ---------------------------------------------------------------------
// vpsm4_ex_ctr32_encrypt_blocks
//
// XORs the input with an SM4 keystream produced from a 16-byte counter
// block.  Only the last 32-bit word of the counter is advanced (by one
// per block, wrapping modulo 2^32); the other three words are constant.
//
// Register roles, as used by the code below:
//   x0  input pointer, 16 bytes consumed per block
//   x1  output pointer, 16 bytes produced per block
//   w2  number of 16-byte blocks
//   x3  round-key table; each block reads 32 words (8 iterations x 4)
//   x4  counter block; it is read once at entry and never written back
//
// v26..v31 are loaded with the six .Lsbox_magic vectors and stay live for
// the whole function.  A count of exactly 1 takes a frameless fast path.
// Every other count builds an 80-byte frame (d8-d15, x29, x30) and keeps
// the counter words in w12, w13, w14 and w5 across calls to
// _vpsm4_ex_enc_4blks / _vpsm4_ex_enc_8blks, which are defined elsewhere
// in this file.
// NOTE(review): those helpers are expected to preserve w5 and w12-w14 and
// to return word-sliced results already in memory byte order (no rev32
// follows the calls) -- confirm against their definitions.
// ---------------------------------------------------------------------
.globl vpsm4_ex_ctr32_encrypt_blocks
.type vpsm4_ex_ctr32_encrypt_blocks,%function
.align 5
vpsm4_ex_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1 {v3.4s},[x4]
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b		// counter words as big-endian values
#endif
	adrp x9, .Lsbox_magic
	ldr q26, [x9, #:lo12:.Lsbox_magic]
	ldr q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr q31, [x9, #:lo12:.Lsbox_magic+80]	// 0x0f in every byte (nibble mask)
	cmp w2,#1
	b.ne 1f
	// fast processing for one single block without
	// context saving overhead
	mov x10,x3			// x10 walks the round keys
	mov w11,#8			// 8 iterations of 4 rounds
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w15,v3.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	mov v3.s[0],w6			// only lane 0 is consumed below
	// optimize sbox using AESE instruction
	// tbl/v26 permutes the bytes, then the v27/v28 nibble tables are
	// applied to the high/low nibbles and combined.
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	// AESE with an all-zero round key: ShiftRows + SubBytes only.
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	// Second nibble-table pass (v29/v30) on the AESE output.
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	// w6 = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8		// next two round keys
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	// Assemble the keystream block with the four state words reversed.
	mov v3.s[0],w15
	mov v3.s[1],w14
	mov v3.s[2],w13
	mov v3.s[3],w12
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	ld1 {v4.4s},[x0]
	eor v4.16b,v4.16b,v3.16b
	st1 {v4.4s},[x1]
	ret
1:
	// Multi-block path: build the frame and move the counter words into
	// general registers (v3 is overwritten by the helpers' results).
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w5,v3.s[3]			// w5 = running 32-bit counter
.Lctr32_4_blocks_process:
	cmp w2,#4
	b.lt 1f
	// Word-sliced counters for four blocks: v4..v6 are the constant
	// words, v7 lane i is the counter value for block i.
	dup v4.4s,w12
	dup v5.4s,w13
	dup v6.4s,w14
	mov v7.s[0],w5
	add w5,w5,#1
	mov v7.s[1],w5
	add w5,w5,#1
	mov v7.s[2],w5
	add w5,w5,#1
	mov v7.s[3],w5
	add w5,w5,#1
	cmp w2,#8
	b.ge .Lctr32_8_blocks_process
	bl _vpsm4_ex_enc_4blks
	// ld4/st4 de-interleave and re-interleave the words, so the input
	// matches the word-sliced layout of v0..v3.
	ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs w2,w2,#4
	b.ne .Lctr32_4_blocks_process
	b 100f
.Lctr32_8_blocks_process:
	// Second group of four counters in v8..v11; v4..v7 were filled above.
	dup v8.4s,w12
	dup v9.4s,w13
	dup v10.4s,w14
	mov v11.s[0],w5
	add w5,w5,#1
	mov v11.s[1],w5
	add w5,w5,#1
	mov v11.s[2],w5
	add w5,w5,#1
	mov v11.s[3],w5
	add w5,w5,#1
	bl _vpsm4_ex_enc_8blks
	ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	eor v4.16b,v4.16b,v8.16b
	eor v5.16b,v5.16b,v9.16b
	eor v6.16b,v6.16b,v10.16b
	eor v7.16b,v7.16b,v11.16b
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs w2,w2,#8
	b.ne .Lctr32_4_blocks_process
	b 100f
1:	// last block processing
	subs w2,w2,#1
	b.lt 100f			// nothing left
	b.gt 1f				// two or three blocks left
	// Exactly one block left: rebuild the counter in v3, then run the
	// same scalar round loop as the single-block fast path.
	mov v3.s[0],w12
	mov v3.s[1],w13
	mov v3.s[2],w14
	mov v3.s[3],w5
	mov x10,x3
	mov w11,#8
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w15,v3.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	mov v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl v0.16b, {v3.16b}, v26.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v28.16b}, v0.16b
	tbl v2.16b, {v27.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b
	eor v1.16b, v1.16b, v1.16b
	aese v0.16b,v1.16b
	ushr v2.16b, v0.16b, 4
	and v0.16b, v0.16b, v31.16b
	tbl v0.16b, {v30.16b}, v0.16b
	tbl v2.16b, {v29.16b}, v2.16b
	eor v0.16b, v0.16b, v2.16b

	mov w7,v0.s[0]
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v3.s[0],w15
	mov v3.s[1],w14
	mov v3.s[2],w13
	mov v3.s[3],w12
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	ld1 {v4.4s},[x0]
	eor v4.16b,v4.16b,v3.16b
	st1 {v4.4s},[x1]
	b 100f
1:	// last 2 blocks processing
	// Only lanes 0 and 1 of v7 are refreshed; the remaining lanes keep
	// whatever they held and their results are never stored.
	dup v4.4s,w12
	dup v5.4s,w13
	dup v6.4s,w14
	mov v7.s[0],w5
	add w5,w5,#1
	mov v7.s[1],w5
	subs w2,w2,#1
	b.ne 1f
	bl _vpsm4_ex_enc_4blks
	ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b 100f
1:	// last 3 blocks processing
	add w5,w5,#1
	mov v7.s[2],w5
	bl _vpsm4_ex_enc_4blks
	ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	// Shared epilogue of the multi-block path.
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks

// ---------------------------------------------------------------------
// Start of vpsm4_ex_xts_encrypt_gb.  Only its prologue begins here: it
// pushes x15-x30 and d8-d15 as 16-byte pairs.  The function continues on
// the following source lines.
// ---------------------------------------------------------------------
.globl vpsm4_ex_xts_encrypt_gb
.type vpsm4_ex_xts_encrypt_gb,%function
.align 5
vpsm4_ex_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	stp x15, x16, [sp, #-0x10]!
	stp x17, x18, [sp, #-0x10]!
	stp x19, x20, [sp, #-0x10]!
	stp x21, x22, [sp, #-0x10]!
	stp x23, x24, [sp, #-0x10]!
	stp x25, x26, [sp, #-0x10]!
	stp x27, x28, [sp, #-0x10]!
	stp x29, x30, [sp, #-0x10]!
	stp d8, d9, [sp, #-0x10]!
	stp d10, d11, [sp, #-0x10]!
	// The operand list of the next instruction is completed by the text
	// that follows this range in the source.
	stp d12, d13, [sp,
#-0x10]!2553stp d14, d15, [sp, #-0x10]!2554mov x26,x32555mov x27,x42556mov w28,w62557ld1 {v16.4s}, [x5]2558mov x3,x272559adrp x9, .Lsbox_magic2560ldr q26, [x9, #:lo12:.Lsbox_magic]2561ldr q27, [x9, #:lo12:.Lsbox_magic+16]2562ldr q28, [x9, #:lo12:.Lsbox_magic+32]2563ldr q29, [x9, #:lo12:.Lsbox_magic+48]2564ldr q30, [x9, #:lo12:.Lsbox_magic+64]2565ldr q31, [x9, #:lo12:.Lsbox_magic+80]2566#ifndef __AARCH64EB__2567rev32 v16.16b,v16.16b2568#endif2569mov x10,x32570mov w11,#82571mov w12,v16.s[0]2572mov w13,v16.s[1]2573mov w14,v16.s[2]2574mov w15,v16.s[3]257510:2576ldp w7,w8,[x10],82577// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)2578eor w6,w14,w152579eor w9,w7,w132580eor w6,w6,w92581mov v3.s[0],w62582// optimize sbox using AESE instruction2583tbl v0.16b, {v3.16b}, v26.16b2584ushr v2.16b, v0.16b, 42585and v0.16b, v0.16b, v31.16b2586tbl v0.16b, {v28.16b}, v0.16b2587tbl v2.16b, {v27.16b}, v2.16b2588eor v0.16b, v0.16b, v2.16b2589eor v1.16b, v1.16b, v1.16b2590aese v0.16b,v1.16b2591ushr v2.16b, v0.16b, 42592and v0.16b, v0.16b, v31.16b2593tbl v0.16b, {v30.16b}, v0.16b2594tbl v2.16b, {v29.16b}, v2.16b2595eor v0.16b, v0.16b, v2.16b25962597mov w7,v0.s[0]2598eor w6,w7,w7,ror #32-22599eor w6,w6,w7,ror #32-102600eor w6,w6,w7,ror #32-182601eor w6,w6,w7,ror #32-242602eor w12,w12,w62603// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)2604eor w6,w14,w152605eor w9,w12,w82606eor w6,w6,w92607mov v3.s[0],w62608// optimize sbox using AESE instruction2609tbl v0.16b, {v3.16b}, v26.16b2610ushr v2.16b, v0.16b, 42611and v0.16b, v0.16b, v31.16b2612tbl v0.16b, {v28.16b}, v0.16b2613tbl v2.16b, {v27.16b}, v2.16b2614eor v0.16b, v0.16b, v2.16b2615eor v1.16b, v1.16b, v1.16b2616aese v0.16b,v1.16b2617ushr v2.16b, v0.16b, 42618and v0.16b, v0.16b, v31.16b2619tbl v0.16b, {v30.16b}, v0.16b2620tbl v2.16b, {v29.16b}, v2.16b2621eor v0.16b, v0.16b, v2.16b26222623mov w7,v0.s[0]2624eor w6,w7,w7,ror #32-22625eor w6,w6,w7,ror #32-102626eor w6,w6,w7,ror #32-182627eor w6,w6,w7,ror #32-242628ldp w7,w8,[x10],82629eor w13,w13,w62630// B2 ^= SBOX(B0 ^ 
B1 ^ B3 ^ RK2)2631eor w6,w12,w132632eor w9,w7,w152633eor w6,w6,w92634mov v3.s[0],w62635// optimize sbox using AESE instruction2636tbl v0.16b, {v3.16b}, v26.16b2637ushr v2.16b, v0.16b, 42638and v0.16b, v0.16b, v31.16b2639tbl v0.16b, {v28.16b}, v0.16b2640tbl v2.16b, {v27.16b}, v2.16b2641eor v0.16b, v0.16b, v2.16b2642eor v1.16b, v1.16b, v1.16b2643aese v0.16b,v1.16b2644ushr v2.16b, v0.16b, 42645and v0.16b, v0.16b, v31.16b2646tbl v0.16b, {v30.16b}, v0.16b2647tbl v2.16b, {v29.16b}, v2.16b2648eor v0.16b, v0.16b, v2.16b26492650mov w7,v0.s[0]2651eor w6,w7,w7,ror #32-22652eor w6,w6,w7,ror #32-102653eor w6,w6,w7,ror #32-182654eor w6,w6,w7,ror #32-242655eor w14,w14,w62656// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)2657eor w6,w12,w132658eor w9,w14,w82659eor w6,w6,w92660mov v3.s[0],w62661// optimize sbox using AESE instruction2662tbl v0.16b, {v3.16b}, v26.16b2663ushr v2.16b, v0.16b, 42664and v0.16b, v0.16b, v31.16b2665tbl v0.16b, {v28.16b}, v0.16b2666tbl v2.16b, {v27.16b}, v2.16b2667eor v0.16b, v0.16b, v2.16b2668eor v1.16b, v1.16b, v1.16b2669aese v0.16b,v1.16b2670ushr v2.16b, v0.16b, 42671and v0.16b, v0.16b, v31.16b2672tbl v0.16b, {v30.16b}, v0.16b2673tbl v2.16b, {v29.16b}, v2.16b2674eor v0.16b, v0.16b, v2.16b26752676mov w7,v0.s[0]2677eor w6,w7,w7,ror #32-22678eor w6,w6,w7,ror #32-102679eor w6,w6,w7,ror #32-182680eor w6,w6,w7,ror #32-242681eor w15,w15,w62682subs w11,w11,#12683b.ne 10b2684mov v16.s[0],w152685mov v16.s[1],w142686mov v16.s[2],w132687mov v16.s[3],w122688#ifndef __AARCH64EB__2689rev32 v16.16b,v16.16b2690#endif2691mov x3,x262692and x29,x2,#0x0F2693// convert length into blocks2694lsr x2,x2,42695cmp x2,#12696b.lt .return_gb26972698cmp x29,02699// If the encryption/decryption Length is N times of 16,2700// the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb2701b.eq .xts_encrypt_blocks_gb27022703// If the encryption/decryption length is not N times of 16,2704// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb2705// the other 
blocks are encrypted/decrypted in .xts_encrypt_blocks_gb2706subs x2,x2,#12707b.eq .only_2blks_tweak_gb2708.xts_encrypt_blocks_gb:2709rbit v16.16b,v16.16b2710#ifdef __AARCH64EB__2711rev32 v16.16b,v16.16b2712#endif2713mov x12,v16.d[0]2714mov x13,v16.d[1]2715mov w7,0x872716extr x9,x13,x13,#322717extr x15,x13,x12,#632718and w8,w7,w9,asr#312719eor x14,x8,x12,lsl#12720mov w7,0x872721extr x9,x15,x15,#322722extr x17,x15,x14,#632723and w8,w7,w9,asr#312724eor x16,x8,x14,lsl#12725mov w7,0x872726extr x9,x17,x17,#322727extr x19,x17,x16,#632728and w8,w7,w9,asr#312729eor x18,x8,x16,lsl#12730mov w7,0x872731extr x9,x19,x19,#322732extr x21,x19,x18,#632733and w8,w7,w9,asr#312734eor x20,x8,x18,lsl#12735mov w7,0x872736extr x9,x21,x21,#322737extr x23,x21,x20,#632738and w8,w7,w9,asr#312739eor x22,x8,x20,lsl#12740mov w7,0x872741extr x9,x23,x23,#322742extr x25,x23,x22,#632743and w8,w7,w9,asr#312744eor x24,x8,x22,lsl#12745mov w7,0x872746extr x9,x25,x25,#322747extr x27,x25,x24,#632748and w8,w7,w9,asr#312749eor x26,x8,x24,lsl#12750.Lxts_8_blocks_process_gb:2751cmp x2,#82752mov v16.d[0],x122753mov v16.d[1],x132754#ifdef __AARCH64EB__2755rev32 v16.16b,v16.16b2756#endif2757mov w7,0x872758extr x9,x27,x27,#322759extr x13,x27,x26,#632760and w8,w7,w9,asr#312761eor x12,x8,x26,lsl#12762mov v17.d[0],x142763mov v17.d[1],x152764#ifdef __AARCH64EB__2765rev32 v17.16b,v17.16b2766#endif2767mov w7,0x872768extr x9,x13,x13,#322769extr x15,x13,x12,#632770and w8,w7,w9,asr#312771eor x14,x8,x12,lsl#12772mov v18.d[0],x162773mov v18.d[1],x172774#ifdef __AARCH64EB__2775rev32 v18.16b,v18.16b2776#endif2777mov w7,0x872778extr x9,x15,x15,#322779extr x17,x15,x14,#632780and w8,w7,w9,asr#312781eor x16,x8,x14,lsl#12782mov v19.d[0],x182783mov v19.d[1],x192784#ifdef __AARCH64EB__2785rev32 v19.16b,v19.16b2786#endif2787mov w7,0x872788extr x9,x17,x17,#322789extr x19,x17,x16,#632790and w8,w7,w9,asr#312791eor x18,x8,x16,lsl#12792mov v20.d[0],x202793mov v20.d[1],x212794#ifdef __AARCH64EB__2795rev32 v20.16b,v20.16b2796#endif2797mov 
w7,0x872798extr x9,x19,x19,#322799extr x21,x19,x18,#632800and w8,w7,w9,asr#312801eor x20,x8,x18,lsl#12802mov v21.d[0],x222803mov v21.d[1],x232804#ifdef __AARCH64EB__2805rev32 v21.16b,v21.16b2806#endif2807mov w7,0x872808extr x9,x21,x21,#322809extr x23,x21,x20,#632810and w8,w7,w9,asr#312811eor x22,x8,x20,lsl#12812mov v22.d[0],x242813mov v22.d[1],x252814#ifdef __AARCH64EB__2815rev32 v22.16b,v22.16b2816#endif2817mov w7,0x872818extr x9,x23,x23,#322819extr x25,x23,x22,#632820and w8,w7,w9,asr#312821eor x24,x8,x22,lsl#12822mov v23.d[0],x262823mov v23.d[1],x272824#ifdef __AARCH64EB__2825rev32 v23.16b,v23.16b2826#endif2827mov w7,0x872828extr x9,x25,x25,#322829extr x27,x25,x24,#632830and w8,w7,w9,asr#312831eor x26,x8,x24,lsl#12832b.lt .Lxts_4_blocks_process_gb2833ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#642834rbit v16.16b,v16.16b2835rbit v17.16b,v17.16b2836rbit v18.16b,v18.16b2837rbit v19.16b,v19.16b2838eor v4.16b, v4.16b, v16.16b2839eor v5.16b, v5.16b, v17.16b2840eor v6.16b, v6.16b, v18.16b2841eor v7.16b, v7.16b, v19.16b2842ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#642843rbit v20.16b,v20.16b2844rbit v21.16b,v21.16b2845rbit v22.16b,v22.16b2846rbit v23.16b,v23.16b2847eor v8.16b, v8.16b, v20.16b2848eor v9.16b, v9.16b, v21.16b2849eor v10.16b, v10.16b, v22.16b2850eor v11.16b, v11.16b, v23.16b2851#ifndef __AARCH64EB__2852rev32 v4.16b,v4.16b2853#endif2854#ifndef __AARCH64EB__2855rev32 v5.16b,v5.16b2856#endif2857#ifndef __AARCH64EB__2858rev32 v6.16b,v6.16b2859#endif2860#ifndef __AARCH64EB__2861rev32 v7.16b,v7.16b2862#endif2863#ifndef __AARCH64EB__2864rev32 v8.16b,v8.16b2865#endif2866#ifndef __AARCH64EB__2867rev32 v9.16b,v9.16b2868#endif2869#ifndef __AARCH64EB__2870rev32 v10.16b,v10.16b2871#endif2872#ifndef __AARCH64EB__2873rev32 v11.16b,v11.16b2874#endif2875zip1 v0.4s,v4.4s,v5.4s2876zip2 v1.4s,v4.4s,v5.4s2877zip1 v2.4s,v6.4s,v7.4s2878zip2 v3.4s,v6.4s,v7.4s2879zip1 v4.2d,v0.2d,v2.2d2880zip2 v5.2d,v0.2d,v2.2d2881zip1 v6.2d,v1.2d,v3.2d2882zip2 v7.2d,v1.2d,v3.2d2883zip1 v0.4s,v8.4s,v9.4s2884zip2 
v1.4s,v8.4s,v9.4s2885zip1 v2.4s,v10.4s,v11.4s2886zip2 v3.4s,v10.4s,v11.4s2887zip1 v8.2d,v0.2d,v2.2d2888zip2 v9.2d,v0.2d,v2.2d2889zip1 v10.2d,v1.2d,v3.2d2890zip2 v11.2d,v1.2d,v3.2d2891bl _vpsm4_ex_enc_8blks2892zip1 v8.4s,v0.4s,v1.4s2893zip2 v9.4s,v0.4s,v1.4s2894zip1 v10.4s,v2.4s,v3.4s2895zip2 v11.4s,v2.4s,v3.4s2896zip1 v0.2d,v8.2d,v10.2d2897zip2 v1.2d,v8.2d,v10.2d2898zip1 v2.2d,v9.2d,v11.2d2899zip2 v3.2d,v9.2d,v11.2d2900zip1 v8.4s,v4.4s,v5.4s2901zip2 v9.4s,v4.4s,v5.4s2902zip1 v10.4s,v6.4s,v7.4s2903zip2 v11.4s,v6.4s,v7.4s2904zip1 v4.2d,v8.2d,v10.2d2905zip2 v5.2d,v8.2d,v10.2d2906zip1 v6.2d,v9.2d,v11.2d2907zip2 v7.2d,v9.2d,v11.2d2908eor v0.16b, v0.16b, v16.16b2909eor v1.16b, v1.16b, v17.16b2910eor v2.16b, v2.16b, v18.16b2911eor v3.16b, v3.16b, v19.16b2912eor v4.16b, v4.16b, v20.16b2913eor v5.16b, v5.16b, v21.16b2914eor v6.16b, v6.16b, v22.16b2915eor v7.16b, v7.16b, v23.16b29162917// save the last tweak2918mov v25.16b,v23.16b2919st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#642920st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#642921subs x2,x2,#82922b.gt .Lxts_8_blocks_process_gb2923b 100f2924.Lxts_4_blocks_process_gb:2925cmp x2,#42926b.lt 1f2927ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#642928rbit v16.16b,v16.16b2929rbit v17.16b,v17.16b2930rbit v18.16b,v18.16b2931rbit v19.16b,v19.16b2932eor v4.16b, v4.16b, v16.16b2933eor v5.16b, v5.16b, v17.16b2934eor v6.16b, v6.16b, v18.16b2935eor v7.16b, v7.16b, v19.16b2936#ifndef __AARCH64EB__2937rev32 v4.16b,v4.16b2938#endif2939#ifndef __AARCH64EB__2940rev32 v5.16b,v5.16b2941#endif2942#ifndef __AARCH64EB__2943rev32 v6.16b,v6.16b2944#endif2945#ifndef __AARCH64EB__2946rev32 v7.16b,v7.16b2947#endif2948zip1 v0.4s,v4.4s,v5.4s2949zip2 v1.4s,v4.4s,v5.4s2950zip1 v2.4s,v6.4s,v7.4s2951zip2 v3.4s,v6.4s,v7.4s2952zip1 v4.2d,v0.2d,v2.2d2953zip2 v5.2d,v0.2d,v2.2d2954zip1 v6.2d,v1.2d,v3.2d2955zip2 v7.2d,v1.2d,v3.2d2956bl _vpsm4_ex_enc_4blks2957zip1 v4.4s,v0.4s,v1.4s2958zip2 v5.4s,v0.4s,v1.4s2959zip1 v6.4s,v2.4s,v3.4s2960zip2 v7.4s,v2.4s,v3.4s2961zip1 v0.2d,v4.2d,v6.2d2962zip2 
v1.2d,v4.2d,v6.2d2963zip1 v2.2d,v5.2d,v7.2d2964zip2 v3.2d,v5.2d,v7.2d2965eor v0.16b, v0.16b, v16.16b2966eor v1.16b, v1.16b, v17.16b2967eor v2.16b, v2.16b, v18.16b2968eor v3.16b, v3.16b, v19.16b2969st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#642970sub x2,x2,#42971mov v16.16b,v20.16b2972mov v17.16b,v21.16b2973mov v18.16b,v22.16b2974// save the last tweak2975mov v25.16b,v19.16b29761:2977// process last block2978cmp x2,#12979b.lt 100f2980b.gt 1f2981ld1 {v4.4s},[x0],#162982rbit v16.16b,v16.16b2983eor v4.16b, v4.16b, v16.16b2984#ifndef __AARCH64EB__2985rev32 v4.16b,v4.16b2986#endif2987mov x10,x32988mov w11,#82989mov w12,v4.s[0]2990mov w13,v4.s[1]2991mov w14,v4.s[2]2992mov w15,v4.s[3]299310:2994ldp w7,w8,[x10],82995// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)2996eor w6,w14,w152997eor w9,w7,w132998eor w6,w6,w92999mov v3.s[0],w63000// optimize sbox using AESE instruction3001tbl v0.16b, {v3.16b}, v26.16b3002ushr v2.16b, v0.16b, 43003and v0.16b, v0.16b, v31.16b3004tbl v0.16b, {v28.16b}, v0.16b3005tbl v2.16b, {v27.16b}, v2.16b3006eor v0.16b, v0.16b, v2.16b3007eor v1.16b, v1.16b, v1.16b3008aese v0.16b,v1.16b3009ushr v2.16b, v0.16b, 43010and v0.16b, v0.16b, v31.16b3011tbl v0.16b, {v30.16b}, v0.16b3012tbl v2.16b, {v29.16b}, v2.16b3013eor v0.16b, v0.16b, v2.16b30143015mov w7,v0.s[0]3016eor w6,w7,w7,ror #32-23017eor w6,w6,w7,ror #32-103018eor w6,w6,w7,ror #32-183019eor w6,w6,w7,ror #32-243020eor w12,w12,w63021// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)3022eor w6,w14,w153023eor w9,w12,w83024eor w6,w6,w93025mov v3.s[0],w63026// optimize sbox using AESE instruction3027tbl v0.16b, {v3.16b}, v26.16b3028ushr v2.16b, v0.16b, 43029and v0.16b, v0.16b, v31.16b3030tbl v0.16b, {v28.16b}, v0.16b3031tbl v2.16b, {v27.16b}, v2.16b3032eor v0.16b, v0.16b, v2.16b3033eor v1.16b, v1.16b, v1.16b3034aese v0.16b,v1.16b3035ushr v2.16b, v0.16b, 43036and v0.16b, v0.16b, v31.16b3037tbl v0.16b, {v30.16b}, v0.16b3038tbl v2.16b, {v29.16b}, v2.16b3039eor v0.16b, v0.16b, v2.16b30403041mov w7,v0.s[0]3042eor w6,w7,w7,ror #32-23043eor 
w6,w6,w7,ror #32-103044eor w6,w6,w7,ror #32-183045eor w6,w6,w7,ror #32-243046ldp w7,w8,[x10],83047eor w13,w13,w63048// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)3049eor w6,w12,w133050eor w9,w7,w153051eor w6,w6,w93052mov v3.s[0],w63053// optimize sbox using AESE instruction3054tbl v0.16b, {v3.16b}, v26.16b3055ushr v2.16b, v0.16b, 43056and v0.16b, v0.16b, v31.16b3057tbl v0.16b, {v28.16b}, v0.16b3058tbl v2.16b, {v27.16b}, v2.16b3059eor v0.16b, v0.16b, v2.16b3060eor v1.16b, v1.16b, v1.16b3061aese v0.16b,v1.16b3062ushr v2.16b, v0.16b, 43063and v0.16b, v0.16b, v31.16b3064tbl v0.16b, {v30.16b}, v0.16b3065tbl v2.16b, {v29.16b}, v2.16b3066eor v0.16b, v0.16b, v2.16b30673068mov w7,v0.s[0]3069eor w6,w7,w7,ror #32-23070eor w6,w6,w7,ror #32-103071eor w6,w6,w7,ror #32-183072eor w6,w6,w7,ror #32-243073eor w14,w14,w63074// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)3075eor w6,w12,w133076eor w9,w14,w83077eor w6,w6,w93078mov v3.s[0],w63079// optimize sbox using AESE instruction3080tbl v0.16b, {v3.16b}, v26.16b3081ushr v2.16b, v0.16b, 43082and v0.16b, v0.16b, v31.16b3083tbl v0.16b, {v28.16b}, v0.16b3084tbl v2.16b, {v27.16b}, v2.16b3085eor v0.16b, v0.16b, v2.16b3086eor v1.16b, v1.16b, v1.16b3087aese v0.16b,v1.16b3088ushr v2.16b, v0.16b, 43089and v0.16b, v0.16b, v31.16b3090tbl v0.16b, {v30.16b}, v0.16b3091tbl v2.16b, {v29.16b}, v2.16b3092eor v0.16b, v0.16b, v2.16b30933094mov w7,v0.s[0]3095eor w6,w7,w7,ror #32-23096eor w6,w6,w7,ror #32-103097eor w6,w6,w7,ror #32-183098eor w6,w6,w7,ror #32-243099eor w15,w15,w63100subs w11,w11,#13101b.ne 10b3102mov v4.s[0],w153103mov v4.s[1],w143104mov v4.s[2],w133105mov v4.s[3],w123106#ifndef __AARCH64EB__3107rev32 v4.16b,v4.16b3108#endif3109eor v4.16b, v4.16b, v16.16b3110st1 {v4.4s},[x1],#163111// save the last tweak3112mov v25.16b,v16.16b3113b 100f31141: // process last 2 blocks3115cmp x2,#23116b.gt 1f3117ld1 {v4.4s,v5.4s},[x0],#323118rbit v16.16b,v16.16b3119rbit v17.16b,v17.16b3120eor v4.16b, v4.16b, v16.16b3121eor v5.16b, v5.16b, v17.16b3122#ifndef __AARCH64EB__3123rev32 
v4.16b,v4.16b3124#endif3125#ifndef __AARCH64EB__3126rev32 v5.16b,v5.16b3127#endif3128zip1 v0.4s,v4.4s,v5.4s3129zip2 v1.4s,v4.4s,v5.4s3130zip1 v2.4s,v6.4s,v7.4s3131zip2 v3.4s,v6.4s,v7.4s3132zip1 v4.2d,v0.2d,v2.2d3133zip2 v5.2d,v0.2d,v2.2d3134zip1 v6.2d,v1.2d,v3.2d3135zip2 v7.2d,v1.2d,v3.2d3136bl _vpsm4_ex_enc_4blks3137zip1 v4.4s,v0.4s,v1.4s3138zip2 v5.4s,v0.4s,v1.4s3139zip1 v6.4s,v2.4s,v3.4s3140zip2 v7.4s,v2.4s,v3.4s3141zip1 v0.2d,v4.2d,v6.2d3142zip2 v1.2d,v4.2d,v6.2d3143zip1 v2.2d,v5.2d,v7.2d3144zip2 v3.2d,v5.2d,v7.2d3145eor v0.16b, v0.16b, v16.16b3146eor v1.16b, v1.16b, v17.16b3147st1 {v0.4s,v1.4s},[x1],#323148// save the last tweak3149mov v25.16b,v17.16b3150b 100f31511: // process last 3 blocks3152ld1 {v4.4s,v5.4s,v6.4s},[x0],#483153rbit v16.16b,v16.16b3154rbit v17.16b,v17.16b3155rbit v18.16b,v18.16b3156eor v4.16b, v4.16b, v16.16b3157eor v5.16b, v5.16b, v17.16b3158eor v6.16b, v6.16b, v18.16b3159#ifndef __AARCH64EB__3160rev32 v4.16b,v4.16b3161#endif3162#ifndef __AARCH64EB__3163rev32 v5.16b,v5.16b3164#endif3165#ifndef __AARCH64EB__3166rev32 v6.16b,v6.16b3167#endif3168zip1 v0.4s,v4.4s,v5.4s3169zip2 v1.4s,v4.4s,v5.4s3170zip1 v2.4s,v6.4s,v7.4s3171zip2 v3.4s,v6.4s,v7.4s3172zip1 v4.2d,v0.2d,v2.2d3173zip2 v5.2d,v0.2d,v2.2d3174zip1 v6.2d,v1.2d,v3.2d3175zip2 v7.2d,v1.2d,v3.2d3176bl _vpsm4_ex_enc_4blks3177zip1 v4.4s,v0.4s,v1.4s3178zip2 v5.4s,v0.4s,v1.4s3179zip1 v6.4s,v2.4s,v3.4s3180zip2 v7.4s,v2.4s,v3.4s3181zip1 v0.2d,v4.2d,v6.2d3182zip2 v1.2d,v4.2d,v6.2d3183zip1 v2.2d,v5.2d,v7.2d3184zip2 v3.2d,v5.2d,v7.2d3185eor v0.16b, v0.16b, v16.16b3186eor v1.16b, v1.16b, v17.16b3187eor v2.16b, v2.16b, v18.16b3188st1 {v0.4s,v1.4s,v2.4s},[x1],#483189// save the last tweak3190mov v25.16b,v18.16b3191100:3192cmp x29,03193b.eq .return_gb31943195// This branch calculates the last two tweaks,3196// while the encryption/decryption length is larger than 323197.last_2blks_tweak_gb:3198#ifdef __AARCH64EB__3199rev32 v25.16b,v25.16b3200#endif3201rbit v2.16b,v25.16b3202adrp x9, .Lxts_magic3203ldr q0, 
[x9, #:lo12:.Lxts_magic]3204shl v17.16b, v2.16b, #13205ext v1.16b, v2.16b, v2.16b,#153206ushr v1.16b, v1.16b, #73207mul v1.16b, v1.16b, v0.16b3208eor v17.16b, v17.16b, v1.16b3209rbit v17.16b,v17.16b3210rbit v2.16b,v17.16b3211adrp x9, .Lxts_magic3212ldr q0, [x9, #:lo12:.Lxts_magic]3213shl v18.16b, v2.16b, #13214ext v1.16b, v2.16b, v2.16b,#153215ushr v1.16b, v1.16b, #73216mul v1.16b, v1.16b, v0.16b3217eor v18.16b, v18.16b, v1.16b3218rbit v18.16b,v18.16b3219b .check_dec_gb322032213222// This branch calculates the last two tweaks,3223// while the encryption/decryption length is equal to 32, who only need two tweaks3224.only_2blks_tweak_gb:3225mov v17.16b,v16.16b3226#ifdef __AARCH64EB__3227rev32 v17.16b,v17.16b3228#endif3229rbit v2.16b,v17.16b3230adrp x9, .Lxts_magic3231ldr q0, [x9, #:lo12:.Lxts_magic]3232shl v18.16b, v2.16b, #13233ext v1.16b, v2.16b, v2.16b,#153234ushr v1.16b, v1.16b, #73235mul v1.16b, v1.16b, v0.16b3236eor v18.16b, v18.16b, v1.16b3237rbit v18.16b,v18.16b3238b .check_dec_gb323932403241// Determine whether encryption or decryption is required.3242// The last two tweaks need to be swapped for decryption.3243.check_dec_gb:3244// encryption:1 decryption:03245cmp w28,13246b.eq .process_last_2blks_gb3247mov v0.16B,v17.16b3248mov v17.16B,v18.16b3249mov v18.16B,v0.16b32503251.process_last_2blks_gb:3252#ifdef __AARCH64EB__3253rev32 v17.16b,v17.16b3254#endif3255#ifdef __AARCH64EB__3256rev32 v18.16b,v18.16b3257#endif3258ld1 {v4.4s},[x0],#163259eor v4.16b, v4.16b, v17.16b3260#ifndef __AARCH64EB__3261rev32 v4.16b,v4.16b3262#endif3263mov x10,x33264mov w11,#83265mov w12,v4.s[0]3266mov w13,v4.s[1]3267mov w14,v4.s[2]3268mov w15,v4.s[3]326910:3270ldp w7,w8,[x10],83271// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3272eor w6,w14,w153273eor w9,w7,w133274eor w6,w6,w93275mov v3.s[0],w63276// optimize sbox using AESE instruction3277tbl v0.16b, {v3.16b}, v26.16b3278ushr v2.16b, v0.16b, 43279and v0.16b, v0.16b, v31.16b3280tbl v0.16b, {v28.16b}, v0.16b3281tbl v2.16b, {v27.16b}, v2.16b3282eor 
v0.16b, v0.16b, v2.16b3283eor v1.16b, v1.16b, v1.16b3284aese v0.16b,v1.16b3285ushr v2.16b, v0.16b, 43286and v0.16b, v0.16b, v31.16b3287tbl v0.16b, {v30.16b}, v0.16b3288tbl v2.16b, {v29.16b}, v2.16b3289eor v0.16b, v0.16b, v2.16b32903291mov w7,v0.s[0]3292eor w6,w7,w7,ror #32-23293eor w6,w6,w7,ror #32-103294eor w6,w6,w7,ror #32-183295eor w6,w6,w7,ror #32-243296eor w12,w12,w63297// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)3298eor w6,w14,w153299eor w9,w12,w83300eor w6,w6,w93301mov v3.s[0],w63302// optimize sbox using AESE instruction3303tbl v0.16b, {v3.16b}, v26.16b3304ushr v2.16b, v0.16b, 43305and v0.16b, v0.16b, v31.16b3306tbl v0.16b, {v28.16b}, v0.16b3307tbl v2.16b, {v27.16b}, v2.16b3308eor v0.16b, v0.16b, v2.16b3309eor v1.16b, v1.16b, v1.16b3310aese v0.16b,v1.16b3311ushr v2.16b, v0.16b, 43312and v0.16b, v0.16b, v31.16b3313tbl v0.16b, {v30.16b}, v0.16b3314tbl v2.16b, {v29.16b}, v2.16b3315eor v0.16b, v0.16b, v2.16b33163317mov w7,v0.s[0]3318eor w6,w7,w7,ror #32-23319eor w6,w6,w7,ror #32-103320eor w6,w6,w7,ror #32-183321eor w6,w6,w7,ror #32-243322ldp w7,w8,[x10],83323eor w13,w13,w63324// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)3325eor w6,w12,w133326eor w9,w7,w153327eor w6,w6,w93328mov v3.s[0],w63329// optimize sbox using AESE instruction3330tbl v0.16b, {v3.16b}, v26.16b3331ushr v2.16b, v0.16b, 43332and v0.16b, v0.16b, v31.16b3333tbl v0.16b, {v28.16b}, v0.16b3334tbl v2.16b, {v27.16b}, v2.16b3335eor v0.16b, v0.16b, v2.16b3336eor v1.16b, v1.16b, v1.16b3337aese v0.16b,v1.16b3338ushr v2.16b, v0.16b, 43339and v0.16b, v0.16b, v31.16b3340tbl v0.16b, {v30.16b}, v0.16b3341tbl v2.16b, {v29.16b}, v2.16b3342eor v0.16b, v0.16b, v2.16b33433344mov w7,v0.s[0]3345eor w6,w7,w7,ror #32-23346eor w6,w6,w7,ror #32-103347eor w6,w6,w7,ror #32-183348eor w6,w6,w7,ror #32-243349eor w14,w14,w63350// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)3351eor w6,w12,w133352eor w9,w14,w83353eor w6,w6,w93354mov v3.s[0],w63355// optimize sbox using AESE instruction3356tbl v0.16b, {v3.16b}, v26.16b3357ushr v2.16b, v0.16b, 43358and v0.16b, 
v0.16b, v31.16b3359tbl v0.16b, {v28.16b}, v0.16b3360tbl v2.16b, {v27.16b}, v2.16b3361eor v0.16b, v0.16b, v2.16b3362eor v1.16b, v1.16b, v1.16b3363aese v0.16b,v1.16b3364ushr v2.16b, v0.16b, 43365and v0.16b, v0.16b, v31.16b3366tbl v0.16b, {v30.16b}, v0.16b3367tbl v2.16b, {v29.16b}, v2.16b3368eor v0.16b, v0.16b, v2.16b33693370mov w7,v0.s[0]3371eor w6,w7,w7,ror #32-23372eor w6,w6,w7,ror #32-103373eor w6,w6,w7,ror #32-183374eor w6,w6,w7,ror #32-243375eor w15,w15,w63376subs w11,w11,#13377b.ne 10b3378mov v4.s[0],w153379mov v4.s[1],w143380mov v4.s[2],w133381mov v4.s[3],w123382#ifndef __AARCH64EB__3383rev32 v4.16b,v4.16b3384#endif3385eor v4.16b, v4.16b, v17.16b3386st1 {v4.4s},[x1],#1633873388sub x26,x1,163389.loop_gb:3390subs x29,x29,13391ldrb w7,[x26,x29]3392ldrb w8,[x0,x29]3393strb w8,[x26,x29]3394strb w7,[x1,x29]3395b.gt .loop_gb3396ld1 {v4.4s}, [x26]3397eor v4.16b, v4.16b, v18.16b3398#ifndef __AARCH64EB__3399rev32 v4.16b,v4.16b3400#endif3401mov x10,x33402mov w11,#83403mov w12,v4.s[0]3404mov w13,v4.s[1]3405mov w14,v4.s[2]3406mov w15,v4.s[3]340710:3408ldp w7,w8,[x10],83409// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3410eor w6,w14,w153411eor w9,w7,w133412eor w6,w6,w93413mov v3.s[0],w63414// optimize sbox using AESE instruction3415tbl v0.16b, {v3.16b}, v26.16b3416ushr v2.16b, v0.16b, 43417and v0.16b, v0.16b, v31.16b3418tbl v0.16b, {v28.16b}, v0.16b3419tbl v2.16b, {v27.16b}, v2.16b3420eor v0.16b, v0.16b, v2.16b3421eor v1.16b, v1.16b, v1.16b3422aese v0.16b,v1.16b3423ushr v2.16b, v0.16b, 43424and v0.16b, v0.16b, v31.16b3425tbl v0.16b, {v30.16b}, v0.16b3426tbl v2.16b, {v29.16b}, v2.16b3427eor v0.16b, v0.16b, v2.16b34283429mov w7,v0.s[0]3430eor w6,w7,w7,ror #32-23431eor w6,w6,w7,ror #32-103432eor w6,w6,w7,ror #32-183433eor w6,w6,w7,ror #32-243434eor w12,w12,w63435// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)3436eor w6,w14,w153437eor w9,w12,w83438eor w6,w6,w93439mov v3.s[0],w63440// optimize sbox using AESE instruction3441tbl v0.16b, {v3.16b}, v26.16b3442ushr v2.16b, v0.16b, 43443and v0.16b, v0.16b, 
v31.16b3444tbl v0.16b, {v28.16b}, v0.16b3445tbl v2.16b, {v27.16b}, v2.16b3446eor v0.16b, v0.16b, v2.16b3447eor v1.16b, v1.16b, v1.16b3448aese v0.16b,v1.16b3449ushr v2.16b, v0.16b, 43450and v0.16b, v0.16b, v31.16b3451tbl v0.16b, {v30.16b}, v0.16b3452tbl v2.16b, {v29.16b}, v2.16b3453eor v0.16b, v0.16b, v2.16b34543455mov w7,v0.s[0]3456eor w6,w7,w7,ror #32-23457eor w6,w6,w7,ror #32-103458eor w6,w6,w7,ror #32-183459eor w6,w6,w7,ror #32-243460ldp w7,w8,[x10],83461eor w13,w13,w63462// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)3463eor w6,w12,w133464eor w9,w7,w153465eor w6,w6,w93466mov v3.s[0],w63467// optimize sbox using AESE instruction3468tbl v0.16b, {v3.16b}, v26.16b3469ushr v2.16b, v0.16b, 43470and v0.16b, v0.16b, v31.16b3471tbl v0.16b, {v28.16b}, v0.16b3472tbl v2.16b, {v27.16b}, v2.16b3473eor v0.16b, v0.16b, v2.16b3474eor v1.16b, v1.16b, v1.16b3475aese v0.16b,v1.16b3476ushr v2.16b, v0.16b, 43477and v0.16b, v0.16b, v31.16b3478tbl v0.16b, {v30.16b}, v0.16b3479tbl v2.16b, {v29.16b}, v2.16b3480eor v0.16b, v0.16b, v2.16b34813482mov w7,v0.s[0]3483eor w6,w7,w7,ror #32-23484eor w6,w6,w7,ror #32-103485eor w6,w6,w7,ror #32-183486eor w6,w6,w7,ror #32-243487eor w14,w14,w63488// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)3489eor w6,w12,w133490eor w9,w14,w83491eor w6,w6,w93492mov v3.s[0],w63493// optimize sbox using AESE instruction3494tbl v0.16b, {v3.16b}, v26.16b3495ushr v2.16b, v0.16b, 43496and v0.16b, v0.16b, v31.16b3497tbl v0.16b, {v28.16b}, v0.16b3498tbl v2.16b, {v27.16b}, v2.16b3499eor v0.16b, v0.16b, v2.16b3500eor v1.16b, v1.16b, v1.16b3501aese v0.16b,v1.16b3502ushr v2.16b, v0.16b, 43503and v0.16b, v0.16b, v31.16b3504tbl v0.16b, {v30.16b}, v0.16b3505tbl v2.16b, {v29.16b}, v2.16b3506eor v0.16b, v0.16b, v2.16b35073508mov w7,v0.s[0]3509eor w6,w7,w7,ror #32-23510eor w6,w6,w7,ror #32-103511eor w6,w6,w7,ror #32-183512eor w6,w6,w7,ror #32-243513eor w15,w15,w63514subs w11,w11,#13515b.ne 10b3516mov v4.s[0],w153517mov v4.s[1],w143518mov v4.s[2],w133519mov v4.s[3],w123520#ifndef __AARCH64EB__3521rev32 
v4.16b,v4.16b3522#endif3523eor v4.16b, v4.16b, v18.16b3524st1 {v4.4s}, [x26]3525.return_gb:3526ldp d14, d15, [sp], #0x103527ldp d12, d13, [sp], #0x103528ldp d10, d11, [sp], #0x103529ldp d8, d9, [sp], #0x103530ldp x29, x30, [sp], #0x103531ldp x27, x28, [sp], #0x103532ldp x25, x26, [sp], #0x103533ldp x23, x24, [sp], #0x103534ldp x21, x22, [sp], #0x103535ldp x19, x20, [sp], #0x103536ldp x17, x18, [sp], #0x103537ldp x15, x16, [sp], #0x103538AARCH64_VALIDATE_LINK_REGISTER3539ret3540.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb3541.globl vpsm4_ex_xts_encrypt3542.type vpsm4_ex_xts_encrypt,%function3543.align 53544vpsm4_ex_xts_encrypt:3545AARCH64_SIGN_LINK_REGISTER3546stp x15, x16, [sp, #-0x10]!3547stp x17, x18, [sp, #-0x10]!3548stp x19, x20, [sp, #-0x10]!3549stp x21, x22, [sp, #-0x10]!3550stp x23, x24, [sp, #-0x10]!3551stp x25, x26, [sp, #-0x10]!3552stp x27, x28, [sp, #-0x10]!3553stp x29, x30, [sp, #-0x10]!3554stp d8, d9, [sp, #-0x10]!3555stp d10, d11, [sp, #-0x10]!3556stp d12, d13, [sp, #-0x10]!3557stp d14, d15, [sp, #-0x10]!3558mov x26,x33559mov x27,x43560mov w28,w63561ld1 {v16.4s}, [x5]3562mov x3,x273563adrp x9, .Lsbox_magic3564ldr q26, [x9, #:lo12:.Lsbox_magic]3565ldr q27, [x9, #:lo12:.Lsbox_magic+16]3566ldr q28, [x9, #:lo12:.Lsbox_magic+32]3567ldr q29, [x9, #:lo12:.Lsbox_magic+48]3568ldr q30, [x9, #:lo12:.Lsbox_magic+64]3569ldr q31, [x9, #:lo12:.Lsbox_magic+80]3570#ifndef __AARCH64EB__3571rev32 v16.16b,v16.16b3572#endif3573mov x10,x33574mov w11,#83575mov w12,v16.s[0]3576mov w13,v16.s[1]3577mov w14,v16.s[2]3578mov w15,v16.s[3]357910:3580ldp w7,w8,[x10],83581// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3582eor w6,w14,w153583eor w9,w7,w133584eor w6,w6,w93585mov v3.s[0],w63586// optimize sbox using AESE instruction3587tbl v0.16b, {v3.16b}, v26.16b3588ushr v2.16b, v0.16b, 43589and v0.16b, v0.16b, v31.16b3590tbl v0.16b, {v28.16b}, v0.16b3591tbl v2.16b, {v27.16b}, v2.16b3592eor v0.16b, v0.16b, v2.16b3593eor v1.16b, v1.16b, v1.16b3594aese v0.16b,v1.16b3595ushr v2.16b, v0.16b, 
43596and v0.16b, v0.16b, v31.16b3597tbl v0.16b, {v30.16b}, v0.16b3598tbl v2.16b, {v29.16b}, v2.16b3599eor v0.16b, v0.16b, v2.16b36003601mov w7,v0.s[0]3602eor w6,w7,w7,ror #32-23603eor w6,w6,w7,ror #32-103604eor w6,w6,w7,ror #32-183605eor w6,w6,w7,ror #32-243606eor w12,w12,w63607// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)3608eor w6,w14,w153609eor w9,w12,w83610eor w6,w6,w93611mov v3.s[0],w63612// optimize sbox using AESE instruction3613tbl v0.16b, {v3.16b}, v26.16b3614ushr v2.16b, v0.16b, 43615and v0.16b, v0.16b, v31.16b3616tbl v0.16b, {v28.16b}, v0.16b3617tbl v2.16b, {v27.16b}, v2.16b3618eor v0.16b, v0.16b, v2.16b3619eor v1.16b, v1.16b, v1.16b3620aese v0.16b,v1.16b3621ushr v2.16b, v0.16b, 43622and v0.16b, v0.16b, v31.16b3623tbl v0.16b, {v30.16b}, v0.16b3624tbl v2.16b, {v29.16b}, v2.16b3625eor v0.16b, v0.16b, v2.16b36263627mov w7,v0.s[0]3628eor w6,w7,w7,ror #32-23629eor w6,w6,w7,ror #32-103630eor w6,w6,w7,ror #32-183631eor w6,w6,w7,ror #32-243632ldp w7,w8,[x10],83633eor w13,w13,w63634// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)3635eor w6,w12,w133636eor w9,w7,w153637eor w6,w6,w93638mov v3.s[0],w63639// optimize sbox using AESE instruction3640tbl v0.16b, {v3.16b}, v26.16b3641ushr v2.16b, v0.16b, 43642and v0.16b, v0.16b, v31.16b3643tbl v0.16b, {v28.16b}, v0.16b3644tbl v2.16b, {v27.16b}, v2.16b3645eor v0.16b, v0.16b, v2.16b3646eor v1.16b, v1.16b, v1.16b3647aese v0.16b,v1.16b3648ushr v2.16b, v0.16b, 43649and v0.16b, v0.16b, v31.16b3650tbl v0.16b, {v30.16b}, v0.16b3651tbl v2.16b, {v29.16b}, v2.16b3652eor v0.16b, v0.16b, v2.16b36533654mov w7,v0.s[0]3655eor w6,w7,w7,ror #32-23656eor w6,w6,w7,ror #32-103657eor w6,w6,w7,ror #32-183658eor w6,w6,w7,ror #32-243659eor w14,w14,w63660// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)3661eor w6,w12,w133662eor w9,w14,w83663eor w6,w6,w93664mov v3.s[0],w63665// optimize sbox using AESE instruction3666tbl v0.16b, {v3.16b}, v26.16b3667ushr v2.16b, v0.16b, 43668and v0.16b, v0.16b, v31.16b3669tbl v0.16b, {v28.16b}, v0.16b3670tbl v2.16b, {v27.16b}, v2.16b3671eor v0.16b, v0.16b, 
v2.16b3672eor v1.16b, v1.16b, v1.16b3673aese v0.16b,v1.16b3674ushr v2.16b, v0.16b, 43675and v0.16b, v0.16b, v31.16b3676tbl v0.16b, {v30.16b}, v0.16b3677tbl v2.16b, {v29.16b}, v2.16b3678eor v0.16b, v0.16b, v2.16b36793680mov w7,v0.s[0]3681eor w6,w7,w7,ror #32-23682eor w6,w6,w7,ror #32-103683eor w6,w6,w7,ror #32-183684eor w6,w6,w7,ror #32-243685eor w15,w15,w63686subs w11,w11,#13687b.ne 10b3688mov v16.s[0],w153689mov v16.s[1],w143690mov v16.s[2],w133691mov v16.s[3],w123692#ifndef __AARCH64EB__3693rev32 v16.16b,v16.16b3694#endif3695mov x3,x263696and x29,x2,#0x0F3697// convert length into blocks3698lsr x2,x2,43699cmp x2,#13700b.lt .return37013702cmp x29,03703// If the encryption/decryption Length is N times of 16,3704// the all blocks are encrypted/decrypted in .xts_encrypt_blocks3705b.eq .xts_encrypt_blocks37063707// If the encryption/decryption length is not N times of 16,3708// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak3709// the other blocks are encrypted/decrypted in .xts_encrypt_blocks3710subs x2,x2,#13711b.eq .only_2blks_tweak3712.xts_encrypt_blocks:3713#ifdef __AARCH64EB__3714rev32 v16.16b,v16.16b3715#endif3716mov x12,v16.d[0]3717mov x13,v16.d[1]3718mov w7,0x873719extr x9,x13,x13,#323720extr x15,x13,x12,#633721and w8,w7,w9,asr#313722eor x14,x8,x12,lsl#13723mov w7,0x873724extr x9,x15,x15,#323725extr x17,x15,x14,#633726and w8,w7,w9,asr#313727eor x16,x8,x14,lsl#13728mov w7,0x873729extr x9,x17,x17,#323730extr x19,x17,x16,#633731and w8,w7,w9,asr#313732eor x18,x8,x16,lsl#13733mov w7,0x873734extr x9,x19,x19,#323735extr x21,x19,x18,#633736and w8,w7,w9,asr#313737eor x20,x8,x18,lsl#13738mov w7,0x873739extr x9,x21,x21,#323740extr x23,x21,x20,#633741and w8,w7,w9,asr#313742eor x22,x8,x20,lsl#13743mov w7,0x873744extr x9,x23,x23,#323745extr x25,x23,x22,#633746and w8,w7,w9,asr#313747eor x24,x8,x22,lsl#13748mov w7,0x873749extr x9,x25,x25,#323750extr x27,x25,x24,#633751and w8,w7,w9,asr#313752eor 
x26,x8,x24,lsl#13753.Lxts_8_blocks_process:3754cmp x2,#83755mov v16.d[0],x123756mov v16.d[1],x133757#ifdef __AARCH64EB__3758rev32 v16.16b,v16.16b3759#endif3760mov w7,0x873761extr x9,x27,x27,#323762extr x13,x27,x26,#633763and w8,w7,w9,asr#313764eor x12,x8,x26,lsl#13765mov v17.d[0],x143766mov v17.d[1],x153767#ifdef __AARCH64EB__3768rev32 v17.16b,v17.16b3769#endif3770mov w7,0x873771extr x9,x13,x13,#323772extr x15,x13,x12,#633773and w8,w7,w9,asr#313774eor x14,x8,x12,lsl#13775mov v18.d[0],x163776mov v18.d[1],x173777#ifdef __AARCH64EB__3778rev32 v18.16b,v18.16b3779#endif3780mov w7,0x873781extr x9,x15,x15,#323782extr x17,x15,x14,#633783and w8,w7,w9,asr#313784eor x16,x8,x14,lsl#13785mov v19.d[0],x183786mov v19.d[1],x193787#ifdef __AARCH64EB__3788rev32 v19.16b,v19.16b3789#endif3790mov w7,0x873791extr x9,x17,x17,#323792extr x19,x17,x16,#633793and w8,w7,w9,asr#313794eor x18,x8,x16,lsl#13795mov v20.d[0],x203796mov v20.d[1],x213797#ifdef __AARCH64EB__3798rev32 v20.16b,v20.16b3799#endif3800mov w7,0x873801extr x9,x19,x19,#323802extr x21,x19,x18,#633803and w8,w7,w9,asr#313804eor x20,x8,x18,lsl#13805mov v21.d[0],x223806mov v21.d[1],x233807#ifdef __AARCH64EB__3808rev32 v21.16b,v21.16b3809#endif3810mov w7,0x873811extr x9,x21,x21,#323812extr x23,x21,x20,#633813and w8,w7,w9,asr#313814eor x22,x8,x20,lsl#13815mov v22.d[0],x243816mov v22.d[1],x253817#ifdef __AARCH64EB__3818rev32 v22.16b,v22.16b3819#endif3820mov w7,0x873821extr x9,x23,x23,#323822extr x25,x23,x22,#633823and w8,w7,w9,asr#313824eor x24,x8,x22,lsl#13825mov v23.d[0],x263826mov v23.d[1],x273827#ifdef __AARCH64EB__3828rev32 v23.16b,v23.16b3829#endif3830mov w7,0x873831extr x9,x25,x25,#323832extr x27,x25,x24,#633833and w8,w7,w9,asr#313834eor x26,x8,x24,lsl#13835b.lt .Lxts_4_blocks_process3836ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#643837eor v4.16b, v4.16b, v16.16b3838eor v5.16b, v5.16b, v17.16b3839eor v6.16b, v6.16b, v18.16b3840eor v7.16b, v7.16b, v19.16b3841ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#643842eor v8.16b, v8.16b, v20.16b3843eor 
v9.16b, v9.16b, v21.16b3844eor v10.16b, v10.16b, v22.16b3845eor v11.16b, v11.16b, v23.16b3846#ifndef __AARCH64EB__3847rev32 v4.16b,v4.16b3848#endif3849#ifndef __AARCH64EB__3850rev32 v5.16b,v5.16b3851#endif3852#ifndef __AARCH64EB__3853rev32 v6.16b,v6.16b3854#endif3855#ifndef __AARCH64EB__3856rev32 v7.16b,v7.16b3857#endif3858#ifndef __AARCH64EB__3859rev32 v8.16b,v8.16b3860#endif3861#ifndef __AARCH64EB__3862rev32 v9.16b,v9.16b3863#endif3864#ifndef __AARCH64EB__3865rev32 v10.16b,v10.16b3866#endif3867#ifndef __AARCH64EB__3868rev32 v11.16b,v11.16b3869#endif3870zip1 v0.4s,v4.4s,v5.4s3871zip2 v1.4s,v4.4s,v5.4s3872zip1 v2.4s,v6.4s,v7.4s3873zip2 v3.4s,v6.4s,v7.4s3874zip1 v4.2d,v0.2d,v2.2d3875zip2 v5.2d,v0.2d,v2.2d3876zip1 v6.2d,v1.2d,v3.2d3877zip2 v7.2d,v1.2d,v3.2d3878zip1 v0.4s,v8.4s,v9.4s3879zip2 v1.4s,v8.4s,v9.4s3880zip1 v2.4s,v10.4s,v11.4s3881zip2 v3.4s,v10.4s,v11.4s3882zip1 v8.2d,v0.2d,v2.2d3883zip2 v9.2d,v0.2d,v2.2d3884zip1 v10.2d,v1.2d,v3.2d3885zip2 v11.2d,v1.2d,v3.2d3886bl _vpsm4_ex_enc_8blks3887zip1 v8.4s,v0.4s,v1.4s3888zip2 v9.4s,v0.4s,v1.4s3889zip1 v10.4s,v2.4s,v3.4s3890zip2 v11.4s,v2.4s,v3.4s3891zip1 v0.2d,v8.2d,v10.2d3892zip2 v1.2d,v8.2d,v10.2d3893zip1 v2.2d,v9.2d,v11.2d3894zip2 v3.2d,v9.2d,v11.2d3895zip1 v8.4s,v4.4s,v5.4s3896zip2 v9.4s,v4.4s,v5.4s3897zip1 v10.4s,v6.4s,v7.4s3898zip2 v11.4s,v6.4s,v7.4s3899zip1 v4.2d,v8.2d,v10.2d3900zip2 v5.2d,v8.2d,v10.2d3901zip1 v6.2d,v9.2d,v11.2d3902zip2 v7.2d,v9.2d,v11.2d3903eor v0.16b, v0.16b, v16.16b3904eor v1.16b, v1.16b, v17.16b3905eor v2.16b, v2.16b, v18.16b3906eor v3.16b, v3.16b, v19.16b3907eor v4.16b, v4.16b, v20.16b3908eor v5.16b, v5.16b, v21.16b3909eor v6.16b, v6.16b, v22.16b3910eor v7.16b, v7.16b, v23.16b39113912// save the last tweak3913mov v25.16b,v23.16b3914st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#643915st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#643916subs x2,x2,#83917b.gt .Lxts_8_blocks_process3918b 100f3919.Lxts_4_blocks_process:3920cmp x2,#43921b.lt 1f3922ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#643923eor v4.16b, v4.16b, 
v16.16b3924eor v5.16b, v5.16b, v17.16b3925eor v6.16b, v6.16b, v18.16b3926eor v7.16b, v7.16b, v19.16b3927#ifndef __AARCH64EB__3928rev32 v4.16b,v4.16b3929#endif3930#ifndef __AARCH64EB__3931rev32 v5.16b,v5.16b3932#endif3933#ifndef __AARCH64EB__3934rev32 v6.16b,v6.16b3935#endif3936#ifndef __AARCH64EB__3937rev32 v7.16b,v7.16b3938#endif3939zip1 v0.4s,v4.4s,v5.4s3940zip2 v1.4s,v4.4s,v5.4s3941zip1 v2.4s,v6.4s,v7.4s3942zip2 v3.4s,v6.4s,v7.4s3943zip1 v4.2d,v0.2d,v2.2d3944zip2 v5.2d,v0.2d,v2.2d3945zip1 v6.2d,v1.2d,v3.2d3946zip2 v7.2d,v1.2d,v3.2d3947bl _vpsm4_ex_enc_4blks3948zip1 v4.4s,v0.4s,v1.4s3949zip2 v5.4s,v0.4s,v1.4s3950zip1 v6.4s,v2.4s,v3.4s3951zip2 v7.4s,v2.4s,v3.4s3952zip1 v0.2d,v4.2d,v6.2d3953zip2 v1.2d,v4.2d,v6.2d3954zip1 v2.2d,v5.2d,v7.2d3955zip2 v3.2d,v5.2d,v7.2d3956eor v0.16b, v0.16b, v16.16b3957eor v1.16b, v1.16b, v17.16b3958eor v2.16b, v2.16b, v18.16b3959eor v3.16b, v3.16b, v19.16b3960st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#643961sub x2,x2,#43962mov v16.16b,v20.16b3963mov v17.16b,v21.16b3964mov v18.16b,v22.16b3965// save the last tweak3966mov v25.16b,v19.16b39671:3968// process last block3969cmp x2,#13970b.lt 100f3971b.gt 1f3972ld1 {v4.4s},[x0],#163973eor v4.16b, v4.16b, v16.16b3974#ifndef __AARCH64EB__3975rev32 v4.16b,v4.16b3976#endif3977mov x10,x33978mov w11,#83979mov w12,v4.s[0]3980mov w13,v4.s[1]3981mov w14,v4.s[2]3982mov w15,v4.s[3]398310:3984ldp w7,w8,[x10],83985// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)3986eor w6,w14,w153987eor w9,w7,w133988eor w6,w6,w93989mov v3.s[0],w63990// optimize sbox using AESE instruction3991tbl v0.16b, {v3.16b}, v26.16b3992ushr v2.16b, v0.16b, 43993and v0.16b, v0.16b, v31.16b3994tbl v0.16b, {v28.16b}, v0.16b3995tbl v2.16b, {v27.16b}, v2.16b3996eor v0.16b, v0.16b, v2.16b3997eor v1.16b, v1.16b, v1.16b3998aese v0.16b,v1.16b3999ushr v2.16b, v0.16b, 44000and v0.16b, v0.16b, v31.16b4001tbl v0.16b, {v30.16b}, v0.16b4002tbl v2.16b, {v29.16b}, v2.16b4003eor v0.16b, v0.16b, v2.16b40044005mov w7,v0.s[0]4006eor w6,w7,w7,ror #32-24007eor w6,w6,w7,ror 
#32-104008eor w6,w6,w7,ror #32-184009eor w6,w6,w7,ror #32-244010eor w12,w12,w64011// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4012eor w6,w14,w154013eor w9,w12,w84014eor w6,w6,w94015mov v3.s[0],w64016// optimize sbox using AESE instruction4017tbl v0.16b, {v3.16b}, v26.16b4018ushr v2.16b, v0.16b, 44019and v0.16b, v0.16b, v31.16b4020tbl v0.16b, {v28.16b}, v0.16b4021tbl v2.16b, {v27.16b}, v2.16b4022eor v0.16b, v0.16b, v2.16b4023eor v1.16b, v1.16b, v1.16b4024aese v0.16b,v1.16b4025ushr v2.16b, v0.16b, 44026and v0.16b, v0.16b, v31.16b4027tbl v0.16b, {v30.16b}, v0.16b4028tbl v2.16b, {v29.16b}, v2.16b4029eor v0.16b, v0.16b, v2.16b40304031mov w7,v0.s[0]4032eor w6,w7,w7,ror #32-24033eor w6,w6,w7,ror #32-104034eor w6,w6,w7,ror #32-184035eor w6,w6,w7,ror #32-244036ldp w7,w8,[x10],84037eor w13,w13,w64038// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4039eor w6,w12,w134040eor w9,w7,w154041eor w6,w6,w94042mov v3.s[0],w64043// optimize sbox using AESE instruction4044tbl v0.16b, {v3.16b}, v26.16b4045ushr v2.16b, v0.16b, 44046and v0.16b, v0.16b, v31.16b4047tbl v0.16b, {v28.16b}, v0.16b4048tbl v2.16b, {v27.16b}, v2.16b4049eor v0.16b, v0.16b, v2.16b4050eor v1.16b, v1.16b, v1.16b4051aese v0.16b,v1.16b4052ushr v2.16b, v0.16b, 44053and v0.16b, v0.16b, v31.16b4054tbl v0.16b, {v30.16b}, v0.16b4055tbl v2.16b, {v29.16b}, v2.16b4056eor v0.16b, v0.16b, v2.16b40574058mov w7,v0.s[0]4059eor w6,w7,w7,ror #32-24060eor w6,w6,w7,ror #32-104061eor w6,w6,w7,ror #32-184062eor w6,w6,w7,ror #32-244063eor w14,w14,w64064// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4065eor w6,w12,w134066eor w9,w14,w84067eor w6,w6,w94068mov v3.s[0],w64069// optimize sbox using AESE instruction4070tbl v0.16b, {v3.16b}, v26.16b4071ushr v2.16b, v0.16b, 44072and v0.16b, v0.16b, v31.16b4073tbl v0.16b, {v28.16b}, v0.16b4074tbl v2.16b, {v27.16b}, v2.16b4075eor v0.16b, v0.16b, v2.16b4076eor v1.16b, v1.16b, v1.16b4077aese v0.16b,v1.16b4078ushr v2.16b, v0.16b, 44079and v0.16b, v0.16b, v31.16b4080tbl v0.16b, {v30.16b}, v0.16b4081tbl v2.16b, {v29.16b}, v2.16b4082eor v0.16b, 
v0.16b, v2.16b40834084mov w7,v0.s[0]4085eor w6,w7,w7,ror #32-24086eor w6,w6,w7,ror #32-104087eor w6,w6,w7,ror #32-184088eor w6,w6,w7,ror #32-244089eor w15,w15,w64090subs w11,w11,#14091b.ne 10b4092mov v4.s[0],w154093mov v4.s[1],w144094mov v4.s[2],w134095mov v4.s[3],w124096#ifndef __AARCH64EB__4097rev32 v4.16b,v4.16b4098#endif4099eor v4.16b, v4.16b, v16.16b4100st1 {v4.4s},[x1],#164101// save the last tweak4102mov v25.16b,v16.16b4103b 100f41041: // process last 2 blocks4105cmp x2,#24106b.gt 1f4107ld1 {v4.4s,v5.4s},[x0],#324108eor v4.16b, v4.16b, v16.16b4109eor v5.16b, v5.16b, v17.16b4110#ifndef __AARCH64EB__4111rev32 v4.16b,v4.16b4112#endif4113#ifndef __AARCH64EB__4114rev32 v5.16b,v5.16b4115#endif4116zip1 v0.4s,v4.4s,v5.4s4117zip2 v1.4s,v4.4s,v5.4s4118zip1 v2.4s,v6.4s,v7.4s4119zip2 v3.4s,v6.4s,v7.4s4120zip1 v4.2d,v0.2d,v2.2d4121zip2 v5.2d,v0.2d,v2.2d4122zip1 v6.2d,v1.2d,v3.2d4123zip2 v7.2d,v1.2d,v3.2d4124bl _vpsm4_ex_enc_4blks4125zip1 v4.4s,v0.4s,v1.4s4126zip2 v5.4s,v0.4s,v1.4s4127zip1 v6.4s,v2.4s,v3.4s4128zip2 v7.4s,v2.4s,v3.4s4129zip1 v0.2d,v4.2d,v6.2d4130zip2 v1.2d,v4.2d,v6.2d4131zip1 v2.2d,v5.2d,v7.2d4132zip2 v3.2d,v5.2d,v7.2d4133eor v0.16b, v0.16b, v16.16b4134eor v1.16b, v1.16b, v17.16b4135st1 {v0.4s,v1.4s},[x1],#324136// save the last tweak4137mov v25.16b,v17.16b4138b 100f41391: // process last 3 blocks4140ld1 {v4.4s,v5.4s,v6.4s},[x0],#484141eor v4.16b, v4.16b, v16.16b4142eor v5.16b, v5.16b, v17.16b4143eor v6.16b, v6.16b, v18.16b4144#ifndef __AARCH64EB__4145rev32 v4.16b,v4.16b4146#endif4147#ifndef __AARCH64EB__4148rev32 v5.16b,v5.16b4149#endif4150#ifndef __AARCH64EB__4151rev32 v6.16b,v6.16b4152#endif4153zip1 v0.4s,v4.4s,v5.4s4154zip2 v1.4s,v4.4s,v5.4s4155zip1 v2.4s,v6.4s,v7.4s4156zip2 v3.4s,v6.4s,v7.4s4157zip1 v4.2d,v0.2d,v2.2d4158zip2 v5.2d,v0.2d,v2.2d4159zip1 v6.2d,v1.2d,v3.2d4160zip2 v7.2d,v1.2d,v3.2d4161bl _vpsm4_ex_enc_4blks4162zip1 v4.4s,v0.4s,v1.4s4163zip2 v5.4s,v0.4s,v1.4s4164zip1 v6.4s,v2.4s,v3.4s4165zip2 v7.4s,v2.4s,v3.4s4166zip1 
v0.2d,v4.2d,v6.2d4167zip2 v1.2d,v4.2d,v6.2d4168zip1 v2.2d,v5.2d,v7.2d4169zip2 v3.2d,v5.2d,v7.2d4170eor v0.16b, v0.16b, v16.16b4171eor v1.16b, v1.16b, v17.16b4172eor v2.16b, v2.16b, v18.16b4173st1 {v0.4s,v1.4s,v2.4s},[x1],#484174// save the last tweak4175mov v25.16b,v18.16b4176100:4177cmp x29,04178b.eq .return41794180// This branch calculates the last two tweaks,4181// while the encryption/decryption length is larger than 324182.last_2blks_tweak:4183#ifdef __AARCH64EB__4184rev32 v25.16b,v25.16b4185#endif4186mov v2.16b,v25.16b4187adrp x9, .Lxts_magic4188ldr q0, [x9, #:lo12:.Lxts_magic]4189shl v17.16b, v2.16b, #14190ext v1.16b, v2.16b, v2.16b,#154191ushr v1.16b, v1.16b, #74192mul v1.16b, v1.16b, v0.16b4193eor v17.16b, v17.16b, v1.16b4194mov v2.16b,v17.16b4195adrp x9, .Lxts_magic4196ldr q0, [x9, #:lo12:.Lxts_magic]4197shl v18.16b, v2.16b, #14198ext v1.16b, v2.16b, v2.16b,#154199ushr v1.16b, v1.16b, #74200mul v1.16b, v1.16b, v0.16b4201eor v18.16b, v18.16b, v1.16b4202b .check_dec420342044205// This branch calculates the last two tweaks,4206// while the encryption/decryption length is equal to 32, who only need two tweaks4207.only_2blks_tweak:4208mov v17.16b,v16.16b4209#ifdef __AARCH64EB__4210rev32 v17.16b,v17.16b4211#endif4212mov v2.16b,v17.16b4213adrp x9, .Lxts_magic4214ldr q0, [x9, #:lo12:.Lxts_magic]4215shl v18.16b, v2.16b, #14216ext v1.16b, v2.16b, v2.16b,#154217ushr v1.16b, v1.16b, #74218mul v1.16b, v1.16b, v0.16b4219eor v18.16b, v18.16b, v1.16b4220b .check_dec422142224223// Determine whether encryption or decryption is required.4224// The last two tweaks need to be swapped for decryption.4225.check_dec:4226// encryption:1 decryption:04227cmp w28,14228b.eq .process_last_2blks4229mov v0.16B,v17.16b4230mov v17.16B,v18.16b4231mov v18.16B,v0.16b42324233.process_last_2blks:4234#ifdef __AARCH64EB__4235rev32 v17.16b,v17.16b4236#endif4237#ifdef __AARCH64EB__4238rev32 v18.16b,v18.16b4239#endif4240ld1 {v4.4s},[x0],#164241eor v4.16b, v4.16b, v17.16b4242#ifndef 
__AARCH64EB__4243rev32 v4.16b,v4.16b4244#endif4245mov x10,x34246mov w11,#84247mov w12,v4.s[0]4248mov w13,v4.s[1]4249mov w14,v4.s[2]4250mov w15,v4.s[3]425110:4252ldp w7,w8,[x10],84253// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)4254eor w6,w14,w154255eor w9,w7,w134256eor w6,w6,w94257mov v3.s[0],w64258// optimize sbox using AESE instruction4259tbl v0.16b, {v3.16b}, v26.16b4260ushr v2.16b, v0.16b, 44261and v0.16b, v0.16b, v31.16b4262tbl v0.16b, {v28.16b}, v0.16b4263tbl v2.16b, {v27.16b}, v2.16b4264eor v0.16b, v0.16b, v2.16b4265eor v1.16b, v1.16b, v1.16b4266aese v0.16b,v1.16b4267ushr v2.16b, v0.16b, 44268and v0.16b, v0.16b, v31.16b4269tbl v0.16b, {v30.16b}, v0.16b4270tbl v2.16b, {v29.16b}, v2.16b4271eor v0.16b, v0.16b, v2.16b42724273mov w7,v0.s[0]4274eor w6,w7,w7,ror #32-24275eor w6,w6,w7,ror #32-104276eor w6,w6,w7,ror #32-184277eor w6,w6,w7,ror #32-244278eor w12,w12,w64279// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4280eor w6,w14,w154281eor w9,w12,w84282eor w6,w6,w94283mov v3.s[0],w64284// optimize sbox using AESE instruction4285tbl v0.16b, {v3.16b}, v26.16b4286ushr v2.16b, v0.16b, 44287and v0.16b, v0.16b, v31.16b4288tbl v0.16b, {v28.16b}, v0.16b4289tbl v2.16b, {v27.16b}, v2.16b4290eor v0.16b, v0.16b, v2.16b4291eor v1.16b, v1.16b, v1.16b4292aese v0.16b,v1.16b4293ushr v2.16b, v0.16b, 44294and v0.16b, v0.16b, v31.16b4295tbl v0.16b, {v30.16b}, v0.16b4296tbl v2.16b, {v29.16b}, v2.16b4297eor v0.16b, v0.16b, v2.16b42984299mov w7,v0.s[0]4300eor w6,w7,w7,ror #32-24301eor w6,w6,w7,ror #32-104302eor w6,w6,w7,ror #32-184303eor w6,w6,w7,ror #32-244304ldp w7,w8,[x10],84305eor w13,w13,w64306// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4307eor w6,w12,w134308eor w9,w7,w154309eor w6,w6,w94310mov v3.s[0],w64311// optimize sbox using AESE instruction4312tbl v0.16b, {v3.16b}, v26.16b4313ushr v2.16b, v0.16b, 44314and v0.16b, v0.16b, v31.16b4315tbl v0.16b, {v28.16b}, v0.16b4316tbl v2.16b, {v27.16b}, v2.16b4317eor v0.16b, v0.16b, v2.16b4318eor v1.16b, v1.16b, v1.16b4319aese v0.16b,v1.16b4320ushr v2.16b, v0.16b, 44321and 
v0.16b, v0.16b, v31.16b4322tbl v0.16b, {v30.16b}, v0.16b4323tbl v2.16b, {v29.16b}, v2.16b4324eor v0.16b, v0.16b, v2.16b43254326mov w7,v0.s[0]4327eor w6,w7,w7,ror #32-24328eor w6,w6,w7,ror #32-104329eor w6,w6,w7,ror #32-184330eor w6,w6,w7,ror #32-244331eor w14,w14,w64332// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4333eor w6,w12,w134334eor w9,w14,w84335eor w6,w6,w94336mov v3.s[0],w64337// optimize sbox using AESE instruction4338tbl v0.16b, {v3.16b}, v26.16b4339ushr v2.16b, v0.16b, 44340and v0.16b, v0.16b, v31.16b4341tbl v0.16b, {v28.16b}, v0.16b4342tbl v2.16b, {v27.16b}, v2.16b4343eor v0.16b, v0.16b, v2.16b4344eor v1.16b, v1.16b, v1.16b4345aese v0.16b,v1.16b4346ushr v2.16b, v0.16b, 44347and v0.16b, v0.16b, v31.16b4348tbl v0.16b, {v30.16b}, v0.16b4349tbl v2.16b, {v29.16b}, v2.16b4350eor v0.16b, v0.16b, v2.16b43514352mov w7,v0.s[0]4353eor w6,w7,w7,ror #32-24354eor w6,w6,w7,ror #32-104355eor w6,w6,w7,ror #32-184356eor w6,w6,w7,ror #32-244357eor w15,w15,w64358subs w11,w11,#14359b.ne 10b4360mov v4.s[0],w154361mov v4.s[1],w144362mov v4.s[2],w134363mov v4.s[3],w124364#ifndef __AARCH64EB__4365rev32 v4.16b,v4.16b4366#endif4367eor v4.16b, v4.16b, v17.16b4368st1 {v4.4s},[x1],#1643694370sub x26,x1,164371.loop:4372subs x29,x29,14373ldrb w7,[x26,x29]4374ldrb w8,[x0,x29]4375strb w8,[x26,x29]4376strb w7,[x1,x29]4377b.gt .loop4378ld1 {v4.4s}, [x26]4379eor v4.16b, v4.16b, v18.16b4380#ifndef __AARCH64EB__4381rev32 v4.16b,v4.16b4382#endif4383mov x10,x34384mov w11,#84385mov w12,v4.s[0]4386mov w13,v4.s[1]4387mov w14,v4.s[2]4388mov w15,v4.s[3]438910:4390ldp w7,w8,[x10],84391// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)4392eor w6,w14,w154393eor w9,w7,w134394eor w6,w6,w94395mov v3.s[0],w64396// optimize sbox using AESE instruction4397tbl v0.16b, {v3.16b}, v26.16b4398ushr v2.16b, v0.16b, 44399and v0.16b, v0.16b, v31.16b4400tbl v0.16b, {v28.16b}, v0.16b4401tbl v2.16b, {v27.16b}, v2.16b4402eor v0.16b, v0.16b, v2.16b4403eor v1.16b, v1.16b, v1.16b4404aese v0.16b,v1.16b4405ushr v2.16b, v0.16b, 44406and v0.16b, v0.16b, 
v31.16b4407tbl v0.16b, {v30.16b}, v0.16b4408tbl v2.16b, {v29.16b}, v2.16b4409eor v0.16b, v0.16b, v2.16b44104411mov w7,v0.s[0]4412eor w6,w7,w7,ror #32-24413eor w6,w6,w7,ror #32-104414eor w6,w6,w7,ror #32-184415eor w6,w6,w7,ror #32-244416eor w12,w12,w64417// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)4418eor w6,w14,w154419eor w9,w12,w84420eor w6,w6,w94421mov v3.s[0],w64422// optimize sbox using AESE instruction4423tbl v0.16b, {v3.16b}, v26.16b4424ushr v2.16b, v0.16b, 44425and v0.16b, v0.16b, v31.16b4426tbl v0.16b, {v28.16b}, v0.16b4427tbl v2.16b, {v27.16b}, v2.16b4428eor v0.16b, v0.16b, v2.16b4429eor v1.16b, v1.16b, v1.16b4430aese v0.16b,v1.16b4431ushr v2.16b, v0.16b, 44432and v0.16b, v0.16b, v31.16b4433tbl v0.16b, {v30.16b}, v0.16b4434tbl v2.16b, {v29.16b}, v2.16b4435eor v0.16b, v0.16b, v2.16b44364437mov w7,v0.s[0]4438eor w6,w7,w7,ror #32-24439eor w6,w6,w7,ror #32-104440eor w6,w6,w7,ror #32-184441eor w6,w6,w7,ror #32-244442ldp w7,w8,[x10],84443eor w13,w13,w64444// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)4445eor w6,w12,w134446eor w9,w7,w154447eor w6,w6,w94448mov v3.s[0],w64449// optimize sbox using AESE instruction4450tbl v0.16b, {v3.16b}, v26.16b4451ushr v2.16b, v0.16b, 44452and v0.16b, v0.16b, v31.16b4453tbl v0.16b, {v28.16b}, v0.16b4454tbl v2.16b, {v27.16b}, v2.16b4455eor v0.16b, v0.16b, v2.16b4456eor v1.16b, v1.16b, v1.16b4457aese v0.16b,v1.16b4458ushr v2.16b, v0.16b, 44459and v0.16b, v0.16b, v31.16b4460tbl v0.16b, {v30.16b}, v0.16b4461tbl v2.16b, {v29.16b}, v2.16b4462eor v0.16b, v0.16b, v2.16b44634464mov w7,v0.s[0]4465eor w6,w7,w7,ror #32-24466eor w6,w6,w7,ror #32-104467eor w6,w6,w7,ror #32-184468eor w6,w6,w7,ror #32-244469eor w14,w14,w64470// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)4471eor w6,w12,w134472eor w9,w14,w84473eor w6,w6,w94474mov v3.s[0],w64475// optimize sbox using AESE instruction4476tbl v0.16b, {v3.16b}, v26.16b4477ushr v2.16b, v0.16b, 44478and v0.16b, v0.16b, v31.16b4479tbl v0.16b, {v28.16b}, v0.16b4480tbl v2.16b, {v27.16b}, v2.16b4481eor v0.16b, v0.16b, v2.16b4482eor v1.16b, 
v1.16b, v1.16b4483aese v0.16b,v1.16b4484ushr v2.16b, v0.16b, 44485and v0.16b, v0.16b, v31.16b4486tbl v0.16b, {v30.16b}, v0.16b4487tbl v2.16b, {v29.16b}, v2.16b4488eor v0.16b, v0.16b, v2.16b44894490mov w7,v0.s[0]4491eor w6,w7,w7,ror #32-24492eor w6,w6,w7,ror #32-104493eor w6,w6,w7,ror #32-184494eor w6,w6,w7,ror #32-244495eor w15,w15,w64496subs w11,w11,#14497b.ne 10b4498mov v4.s[0],w154499mov v4.s[1],w144500mov v4.s[2],w134501mov v4.s[3],w124502#ifndef __AARCH64EB__4503rev32 v4.16b,v4.16b4504#endif4505eor v4.16b, v4.16b, v18.16b4506st1 {v4.4s}, [x26]4507.return:4508ldp d14, d15, [sp], #0x104509ldp d12, d13, [sp], #0x104510ldp d10, d11, [sp], #0x104511ldp d8, d9, [sp], #0x104512ldp x29, x30, [sp], #0x104513ldp x27, x28, [sp], #0x104514ldp x25, x26, [sp], #0x104515ldp x23, x24, [sp], #0x104516ldp x21, x22, [sp], #0x104517ldp x19, x20, [sp], #0x104518ldp x17, x18, [sp], #0x104519ldp x15, x16, [sp], #0x104520AARCH64_VALIDATE_LINK_REGISTER4521ret4522.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt452345244525