Path: blob/main/sys/crypto/openssl/aarch64/bsaes-armv8.S
39507 views
/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, and it is presented as pure assembly.


#include "crypto/arm_arch.h"

.text

.type _bsaes_decrypt8,%function
.align 4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        ldr     q8, [x9], #16           // round 0 key
        adrp    x11, .LM0ISR
        add     x11, x11, #:lo12:.LM0ISR
        movi    v9.16b, #0x55
        ldr     q10, [x11], #16         // .LM0ISR (round 0 InvShiftRows permutation)
        movi    v16.16b, #0x33
        movi    v17.16b, #0x0f
        sub     x10, x10, #1
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v10.16b
        tbl     v1.16b, {v1.16b}, v10.16b
        tbl     v2.16b, {v2.16b}, v10.16b
        tbl     v4.16b, {v4.16b}, v10.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v10.16b
        tbl     v5.16b, {v5.16b}, v10.16b
        tbl     v6.16b, {v6.16b}, v10.16b
        ushr    v8.2d, v0.2d, #1
        tbl     v7.16b, {v7.16b}, v10.16b
        ushr    v10.2d, v4.2d, #1
        ushr    v18.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        ushr    v19.2d, v6.2d, #1
        eor     v10.16b, v10.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v10.16b, v10.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v9.16b, v19.16b, v9.16b
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #1
        eor     v3.16b, v3.16b, v18.16b
        shl     v18.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v9.2d, #1
        eor     v7.16b, v7.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v18.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v18.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v10.16b, v10.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Ldec_sbox
.align 4
.Ldec_loop:
        // Per-round: XOR in the 128-byte bit-sliced round key, then apply
        // the InvShiftRows byte permutation held in v28 (.LISR / .LISRM0).
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
        eor     v1.16b, v1.16b, v4.16b
        eor     v3.16b, v3.16b, v4.16b
        subs    x10, x10, #1
        eor     v4.16b, v4.16b, v7.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v1.16b, v6.16b
        eor     v6.16b, v6.16b, v4.16b
        eor     v2.16b, v2.16b, v5.16b
        eor     v0.16b, v0.16b, v1.16b
        eor     v7.16b, v7.16b, v6.16b
        eor     v8.16b, v6.16b, v2.16b
        and     v9.16b, v4.16b, v6.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v16.16b, v7.16b, v4.16b
        eor     v17.16b, v4.16b, v0.16b
        and     v18.16b, v0.16b, v2.16b
        eor     v19.16b, v7.16b, v4.16b
        eor     v1.16b, v1.16b, v3.16b
        eor     v20.16b, v3.16b, v0.16b
        eor     v21.16b, v5.16b, v2.16b
        eor     v22.16b, v3.16b, v7.16b
        and     v8.16b, v17.16b, v8.16b
        orr     v17.16b, v3.16b, v5.16b
        eor     v23.16b, v1.16b, v6.16b
        eor     v24.16b, v20.16b, v16.16b
        eor     v25.16b, v1.16b, v5.16b
        orr     v26.16b, v20.16b, v21.16b
        and     v20.16b, v20.16b, v21.16b
        and     v27.16b, v7.16b, v1.16b
        eor     v21.16b, v21.16b, v23.16b
        orr     v28.16b, v16.16b, v23.16b
        orr     v29.16b, v22.16b, v25.16b
        eor     v26.16b, v26.16b, v8.16b
        and     v16.16b, v16.16b, v23.16b
        and     v22.16b, v22.16b, v25.16b
        and     v21.16b, v24.16b, v21.16b
        eor     v8.16b, v28.16b, v8.16b
        eor     v23.16b, v5.16b, v2.16b
        eor     v24.16b, v1.16b, v6.16b
        eor     v16.16b, v16.16b, v22.16b
        eor     v22.16b, v3.16b, v0.16b
        eor     v25.16b, v29.16b, v21.16b
        eor     v21.16b, v26.16b, v21.16b
        eor     v8.16b, v8.16b, v20.16b
        eor     v26.16b, v23.16b, v24.16b
        eor     v16.16b, v16.16b, v20.16b
        eor     v28.16b, v22.16b, v19.16b
        eor     v20.16b, v25.16b, v20.16b
        eor     v9.16b, v21.16b, v9.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v21.16b, v16.16b, v17.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v20.16b, v27.16b
        eor     v20.16b, v3.16b, v7.16b
        eor     v25.16b, v9.16b, v8.16b
        eor     v27.16b, v0.16b, v4.16b
        and     v29.16b, v9.16b, v17.16b
        eor     v30.16b, v8.16b, v29.16b
        eor     v31.16b, v21.16b, v29.16b
        eor     v29.16b, v21.16b, v29.16b
        bsl     v30.16b, v17.16b, v21.16b
        bsl     v31.16b, v9.16b, v8.16b
        bsl     v16.16b, v30.16b, v29.16b
        bsl     v21.16b, v29.16b, v30.16b
        eor     v8.16b, v31.16b, v30.16b
        and     v1.16b, v1.16b, v31.16b
        and     v9.16b, v16.16b, v31.16b
        and     v6.16b, v6.16b, v30.16b
        eor     v16.16b, v17.16b, v21.16b
        and     v4.16b, v4.16b, v30.16b
        eor     v17.16b, v8.16b, v30.16b
        and     v21.16b, v24.16b, v8.16b
        eor     v9.16b, v9.16b, v25.16b
        and     v19.16b, v19.16b, v8.16b
        eor     v24.16b, v30.16b, v16.16b
        eor     v25.16b, v30.16b, v16.16b
        and     v7.16b, v7.16b, v17.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v29.16b, v9.16b, v16.16b
        eor     v30.16b, v31.16b, v9.16b
        and     v0.16b, v24.16b, v0.16b
        and     v9.16b, v18.16b, v9.16b
        and     v2.16b, v25.16b, v2.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v18.16b, v29.16b, v16.16b
        and     v5.16b, v30.16b, v5.16b
        eor     v24.16b, v8.16b, v29.16b
        and     v25.16b, v26.16b, v29.16b
        and     v26.16b, v28.16b, v29.16b
        eor     v8.16b, v8.16b, v29.16b
        eor     v17.16b, v17.16b, v18.16b
        eor     v5.16b, v1.16b, v5.16b
        and     v23.16b, v24.16b, v23.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v19.16b, v19.16b, v26.16b
        eor     v0.16b, v4.16b, v0.16b
        and     v3.16b, v17.16b, v3.16b
        eor     v1.16b, v9.16b, v1.16b
        eor     v9.16b, v25.16b, v23.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v2.16b, v6.16b, v2.16b
        and     v6.16b, v8.16b, v22.16b
        eor     v3.16b, v7.16b, v3.16b
        and     v8.16b, v20.16b, v18.16b
        eor     v10.16b, v10.16b, v9.16b
        eor     v0.16b, v0.16b, v19.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v1.16b, v2.16b, v21.16b
        eor     v3.16b, v3.16b, v19.16b
        and     v16.16b, v27.16b, v16.16b
        eor     v17.16b, v26.16b, v6.16b
        eor     v6.16b, v8.16b, v7.16b
        eor     v7.16b, v1.16b, v9.16b
        eor     v1.16b, v5.16b, v3.16b
        eor     v2.16b, v10.16b, v3.16b
        eor     v4.16b, v16.16b, v4.16b
        eor     v8.16b, v6.16b, v17.16b
        eor     v5.16b, v9.16b, v3.16b
        eor     v9.16b, v0.16b, v1.16b
        eor     v6.16b, v7.16b, v1.16b
        eor     v0.16b, v4.16b, v17.16b
        eor     v4.16b, v8.16b, v7.16b
        eor     v7.16b, v9.16b, v2.16b
        eor     v8.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        eor     v4.16b, v7.16b, v0.16b
        eor     v7.16b, v8.16b, v3.16b
        bcc     .Ldec_done
        ext     v8.16b, v0.16b, v0.16b, #8
        ext     v9.16b, v1.16b, v1.16b, #8
        ldr     q28, [x11]              // load from .LISR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #8
        ext     v16.16b, v3.16b, v3.16b, #8
        ext     v17.16b, v5.16b, v5.16b, #8
        ext     v18.16b, v4.16b, v4.16b, #8
        eor     v8.16b, v8.16b, v0.16b
        eor     v9.16b, v9.16b, v1.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v17.16b, v17.16b, v5.16b
        ext     v19.16b, v2.16b, v2.16b, #8
        ext     v20.16b, v7.16b, v7.16b, #8
        eor     v18.16b, v18.16b, v4.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v8.16b, v2.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v19.16b, v2.16b
        eor     v9.16b, v20.16b, v7.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v6.16b, v6.16b, v17.16b
        eor     v8.16b, v8.16b, v16.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v4.16b, v4.16b, v16.16b
        eor     v2.16b, v3.16b, v2.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v3.16b, v5.16b, v9.16b
        eor     v5.16b, v8.16b, v17.16b
        eor     v7.16b, v7.16b, v17.16b
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v6.16b, v6.16b, #12
        ext     v10.16b, v4.16b, v4.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v5.16b, v5.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v3.16b, v3.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v17.16b, v17.16b, v4.16b
        eor     v10.16b, v10.16b, v6.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v3.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v18.16b, v18.16b, v5.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v21.16b, v5.16b, v5.16b, #8
        ext     v5.16b, v7.16b, v7.16b, #8
        eor     v7.16b, v20.16b, v2.16b
        ext     v4.16b, v4.16b, v4.16b, #8
        ext     v20.16b, v3.16b, v3.16b, #8
        eor     v17.16b, v17.16b, v3.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v3.16b, v10.16b, v3.16b
        ext     v10.16b, v6.16b, v6.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v18.16b
        eor     v3.16b, v3.16b, v4.16b
        eor     v7.16b, v20.16b, v7.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v4.16b, v21.16b, v17.16b
        eor     v2.16b, v10.16b, v9.16b
        bne     .Ldec_loop
        ldr     q28, [x11, #16]!        // load from .LISRM0 on last round (x10 == 0)
        b       .Ldec_loop
.align 4
.Ldec_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]               // final round key
        ushr    v16.2d, v2.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v6.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v3.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v4.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v4.16b, v4.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v6.16b, v6.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v3.16b, v3.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v2.2d, #2
        eor     v8.16b, v8.16b, v4.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v4.16b, v4.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v8.2d, v4.2d, #4
        ushr    v9.2d, v6.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v3.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v2.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v4.16b, v4.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8

.section .rodata
.type _bsaes_consts,%object
.align 6
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02

.align 6
.size _bsaes_consts,.-_bsaes_consts

.previous

.type _bsaes_encrypt8,%function
.align 4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
        ldr     q8, [x9], #16           // round 0 key
        adrp    x11, .LM0SR
        add     x11, x11, #:lo12:.LM0SR
        ldr     q9, [x11], #16          // .LM0SR (round 0 ShiftRows permutation)
_bsaes_encrypt8_alt:
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        sub     x10, x10, #1
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v9.16b
        tbl     v1.16b, {v1.16b}, v9.16b
        tbl     v2.16b, {v2.16b}, v9.16b
        tbl     v4.16b, {v4.16b}, v9.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v9.16b
        tbl     v5.16b, {v5.16b}, v9.16b
        tbl     v6.16b, {v6.16b}, v9.16b
        ushr    v8.2d, v0.2d, #1
        movi    v10.16b, #0x55
        tbl     v7.16b, {v7.16b}, v9.16b
        ushr    v9.2d, v4.2d, #1
        movi    v16.16b, #0x33
        ushr    v17.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        movi    v18.16b, #0x0f
        ushr    v19.2d, v6.2d, #1
        eor     v9.16b, v9.16b, v5.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v10.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v9.16b, v9.16b, v10.16b
        and     v17.16b, v17.16b, v10.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v10.16b, v19.16b, v10.16b
        eor     v5.16b, v5.16b, v9.16b
        shl     v9.2d, v9.2d, #1
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v10.2d, #1
        eor     v7.16b, v7.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v17.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v17.16b, v17.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v17.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v18.16b
        and     v9.16b, v9.16b, v18.16b
        and     v10.16b, v10.16b, v18.16b
        and     v16.16b, v16.16b, v18.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Lenc_sbox
.align 4
.Lenc_loop:
        // Per-round: XOR in the 128-byte bit-sliced round key, then apply
        // the ShiftRows byte permutation held in v28 (.LSR / .LSRM0).
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
        eor     v5.16b, v5.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        subs    x10, x10, #1
        eor     v2.16b, v2.16b, v1.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v8.16b, v3.16b, v7.16b
        eor     v6.16b, v6.16b, v2.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v8.16b, v8.16b, v4.16b
        eor     v3.16b, v6.16b, v3.16b
        eor     v4.16b, v4.16b, v5.16b
        eor     v6.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v8.16b, v1.16b
        eor     v8.16b, v7.16b, v4.16b
        eor     v9.16b, v3.16b, v0.16b
        eor     v10.16b, v7.16b, v6.16b
        eor     v16.16b, v5.16b, v3.16b
        eor     v17.16b, v6.16b, v2.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v19.16b, v2.16b, v4.16b
        eor     v20.16b, v1.16b, v0.16b
        orr     v21.16b, v8.16b, v9.16b
        orr     v22.16b, v10.16b, v16.16b
        eor     v23.16b, v8.16b, v17.16b
        eor     v24.16b, v9.16b, v18.16b
        and     v19.16b, v19.16b, v20.16b
        orr     v20.16b, v17.16b, v18.16b
        and     v8.16b, v8.16b, v9.16b
        and     v9.16b, v17.16b, v18.16b
        and     v17.16b, v23.16b, v24.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v16.16b, v21.16b, v19.16b
        eor     v18.16b, v20.16b, v19.16b
        and     v19.16b, v2.16b, v1.16b
        and     v20.16b, v6.16b, v5.16b
        eor     v21.16b, v22.16b, v17.16b
        eor     v9.16b, v9.16b, v10.16b
        eor     v10.16b, v16.16b, v17.16b
        eor     v16.16b, v18.16b, v8.16b
        and     v17.16b, v4.16b, v0.16b
        orr     v18.16b, v7.16b, v3.16b
        eor     v21.16b, v21.16b, v8.16b
        eor     v8.16b, v9.16b, v8.16b
        eor     v9.16b, v10.16b, v19.16b
        eor     v10.16b, v3.16b, v0.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v5.16b, v1.16b
        eor     v19.16b, v21.16b, v20.16b
        eor     v20.16b, v8.16b, v18.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v7.16b, v4.16b
        eor     v21.16b, v9.16b, v16.16b
        eor     v22.16b, v6.16b, v2.16b
        and     v23.16b, v9.16b, v19.16b
        eor     v24.16b, v10.16b, v17.16b
        eor     v25.16b, v0.16b, v1.16b
        eor     v26.16b, v7.16b, v6.16b
        eor     v27.16b, v18.16b, v22.16b
        eor     v28.16b, v3.16b, v5.16b
        eor     v29.16b, v16.16b, v23.16b
        eor     v30.16b, v20.16b, v23.16b
        eor     v23.16b, v20.16b, v23.16b
        eor     v31.16b, v4.16b, v2.16b
        bsl     v29.16b, v19.16b, v20.16b
        bsl     v30.16b, v9.16b, v16.16b
        bsl     v8.16b, v29.16b, v23.16b
        bsl     v20.16b, v23.16b, v29.16b
        eor     v9.16b, v30.16b, v29.16b
        and     v5.16b, v5.16b, v30.16b
        and     v8.16b, v8.16b, v30.16b
        and     v1.16b, v1.16b, v29.16b
        eor     v16.16b, v19.16b, v20.16b
        and     v2.16b, v2.16b, v29.16b
        eor     v19.16b, v9.16b, v29.16b
        and     v17.16b, v17.16b, v9.16b
        eor     v8.16b, v8.16b, v21.16b
        and     v20.16b, v22.16b, v9.16b
        eor     v21.16b, v29.16b, v16.16b
        eor     v22.16b, v29.16b, v16.16b
        and     v23.16b, v25.16b, v16.16b
        and     v6.16b, v6.16b, v19.16b
        eor     v25.16b, v8.16b, v16.16b
        eor     v29.16b, v30.16b, v8.16b
        and     v4.16b, v21.16b, v4.16b
        and     v8.16b, v28.16b, v8.16b
        and     v0.16b, v22.16b, v0.16b
        eor     v21.16b, v23.16b, v1.16b
        eor     v22.16b, v9.16b, v25.16b
        eor     v9.16b, v9.16b, v25.16b
        eor     v23.16b, v25.16b, v16.16b
        and     v3.16b, v29.16b, v3.16b
        and     v24.16b, v24.16b, v25.16b
        and     v25.16b, v27.16b, v25.16b
        and     v10.16b, v22.16b, v10.16b
        and     v9.16b, v9.16b, v18.16b
        eor     v18.16b, v19.16b, v23.16b
        and     v19.16b, v26.16b, v23.16b
        eor     v3.16b, v5.16b, v3.16b
        eor     v17.16b, v17.16b, v24.16b
        eor     v10.16b, v24.16b, v10.16b
        and     v16.16b, v31.16b, v16.16b
        eor     v20.16b, v20.16b, v25.16b
        eor     v9.16b, v25.16b, v9.16b
        eor     v4.16b, v2.16b, v4.16b
        and     v7.16b, v18.16b, v7.16b
        eor     v18.16b, v19.16b, v6.16b
        eor     v5.16b, v8.16b, v5.16b
        eor     v0.16b, v1.16b, v0.16b
        eor     v1.16b, v21.16b, v10.16b
        eor     v8.16b, v3.16b, v17.16b
        eor     v2.16b, v16.16b, v2.16b
        eor     v3.16b, v6.16b, v7.16b
        eor     v6.16b, v18.16b, v9.16b
        eor     v4.16b, v4.16b, v20.16b
        eor     v10.16b, v5.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v9.16b, v2.16b, v9.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v7.16b, v6.16b, v1.16b
        eor     v5.16b, v8.16b, v4.16b
        eor     v6.16b, v10.16b, v1.16b
        eor     v2.16b, v4.16b, v0.16b
        eor     v4.16b, v3.16b, v10.16b
        eor     v9.16b, v9.16b, v7.16b
        eor     v3.16b, v0.16b, v5.16b
        eor     v0.16b, v1.16b, v4.16b
        eor     v1.16b, v4.16b, v8.16b
        eor     v4.16b, v9.16b, v5.16b
        eor     v6.16b, v6.16b, v3.16b
        bcc     .Lenc_done
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v4.16b, v4.16b, #12
        ldr     q28, [x11]              // load from .LSR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v3.16b, v3.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v6.16b, v6.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v5.16b, v5.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v5.16b, v5.16b, v20.16b
        eor     v17.16b, v17.16b, v6.16b
        eor     v10.16b, v10.16b, v4.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v5.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v3.16b, v3.16b, v3.16b, #8
        ext     v7.16b, v7.16b, v7.16b, #8
        eor     v20.16b, v20.16b, v2.16b
        ext     v6.16b, v6.16b, v6.16b, #8
        ext     v21.16b, v5.16b, v5.16b, #8
        eor     v17.16b, v17.16b, v5.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v10.16b, v10.16b, v5.16b
        ext     v22.16b, v4.16b, v4.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v7.16b, v18.16b
        eor     v4.16b, v3.16b, v17.16b
        eor     v3.16b, v6.16b, v10.16b
        eor     v7.16b, v21.16b, v20.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v2.16b, v22.16b, v9.16b
        bne     .Lenc_loop
        ldr     q28, [x11, #16]!        // load from .LSRM0 on last round (x10 == 0)
        b       .Lenc_loop
.align 4
.Lenc_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]               // final round key
        ushr    v16.2d, v3.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v4.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v2.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v6.16b, v6.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v3.16b, v3.16b, v9.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v3.2d, #2
        eor     v8.16b, v8.16b, v6.16b
        eor     v9.16b, v9.16b, v4.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v2.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v6.16b, v6.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v4.16b, v4.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        ushr    v8.2d, v6.2d, #4
        ushr    v9.2d, v4.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v2.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v2.16b, v2.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v6.16b, v6.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8

.type _bsaes_key_convert,%function
.align 4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adrp    x11, .LM0_littleendian
        add     x11, x11, #:lo12:.LM0_littleendian
#else
        adrp    x11, .LM0_bigendian
        add     x11, x11, #:lo12:.LM0_bigendian
#endif
        ldr     q0, [x9], #16           // load round 0 key
        ldr     q1, [x11]               // .LM0
        ldr     q15, [x9], #16          // load round 1 key

        movi    v7.16b, #0x63           // compose .L63
        movi    v16.16b, #0x01          // bit masks
        movi    v17.16b, #0x02
        movi    v18.16b, #0x04
        movi    v19.16b, #0x08
        movi    v20.16b, #0x10
        movi    v21.16b, #0x20
        movi    v22.16b, #0x40
        movi    v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32   v0.16b, v0.16b
#endif
        sub     x10, x10, #1
        str     q0, [x17], #16          // save round 0 key

.align 4
.Lkey_loop:
        tbl     v0.16b, {v15.16b}, v1.16b
        ldr     q15, [x9], #16          // load next round key

        eor     v0.16b, v0.16b, v7.16b
        cmtst   v24.16b, v0.16b, v16.16b
        cmtst   v25.16b, v0.16b, v17.16b
        cmtst   v26.16b, v0.16b, v18.16b
        cmtst   v27.16b, v0.16b, v19.16b
        cmtst   v28.16b, v0.16b, v20.16b
        cmtst   v29.16b, v0.16b, v21.16b
        cmtst   v30.16b, v0.16b, v22.16b
        cmtst   v31.16b, v0.16b, v23.16b
        sub     x10, x10, #1
        st1     {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key
        st1     {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64
        cbnz    x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
        rev32   v15.16b, v15.16b
        adrp    x11, .LM0_bigendian
        add     x11, x11, #:lo12:.LM0_bigendian
#endif
        ret
.size _bsaes_key_convert,.-_bsaes_key_convert

.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        cmp     x2, #128                // short inputs fall back to the non-bit-sliced path
        bhs     .Lcbc_do_bsaes
        b       AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp     x29, x30, [sp, #-48]!
        stp     d8, d9, [sp, #16]
        stp     d10, d15, [sp, #32]
        lsr     x2, x2, #4              // len in 16 byte blocks

        ldr     w15, [x3, #240]         // get # of rounds
        mov     x14, sp                 // remember caller's sp for epilogue

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7   // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                  // pass key
        mov     x10, x15                // pass # of rounds
        mov     sp, x17                 // sp is sp
        bl      _bsaes_key_convert
        ldr     q6, [sp]
        str     q15, [x17]              // save last round key
        eor     v6.16b, v6.16b, v7.16b  // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        ldr     q15, [x4]               // load IV
        b       .Lcbc_dec_loop

.align 4
.Lcbc_dec_loop:
        subs    x2, x2, #0x8
        bmi     .Lcbc_dec_loop_finish

        ldr     q0, [x0], #16           // load input
        mov     x9, sp                  // pass the key
        ldr     q1, [x0], #16
        mov     x10, x15
        ldr     q2, [x0], #16
        ldr     q3, [x0], #16
        ldr     q4, [x0], #16
        ldr     q5, [x0], #16
        ldr     q6, [x0], #16
        ldr     q7, [x0], #-7*16       // rewind x0: input is re-read below for CBC chaining

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16          // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        eor     v1.16b, v1.16b, v16.16b
        str     q0, [x1], #16           // write output
        ldr     q0, [x0], #16
        str     q1, [x1], #16
        ldr     q1, [x0], #16
        eor     v1.16b, v4.16b, v1.16b
        ldr     q4, [x0], #16
        eor     v2.16b, v2.16b, v4.16b
        eor     v0.16b, v6.16b, v0.16b
        ldr     q4, [x0], #16
        str     q0, [x1], #16
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v4.16b
        ldr     q1, [x0], #16
        str     q2, [x1], #16
        ldr     q2, [x0], #16
        ldr     q15, [x0], #16          // last ciphertext quadword becomes next IV
        str     q0, [x1], #16
        eor     v0.16b, v5.16b, v2.16b
        eor     v1.16b, v3.16b, v1.16b
        str     q1, [x1], #16
        str     q0, [x1], #16

        b       .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds    x2, x2, #8
        beq     .Lcbc_dec_done

        ldr     q0, [x0], #16           // load input
        cmp     x2, #2
        blo     .Lcbc_dec_one
        ldr     q1, [x0], #16
        mov     x9, sp                  // pass the key
        mov     x10, x15
        beq     .Lcbc_dec_two
        ldr     q2, [x0], #16
        cmp     x2, #4
        blo     .Lcbc_dec_three
        ldr     q3, [x0], #16
        beq     .Lcbc_dec_four
        ldr     q4, [x0], #16
        cmp     x2, #6
        blo     .Lcbc_dec_five
        ldr     q5, [x0], #16
        beq     .Lcbc_dec_six
        ldr     q6, [x0], #-6*16       // seven-block tail: rewind for the re-read below

        bl      _bsaes_decrypt8

        ldr     q5, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        ldr     q10, [x0], #16
        str     q0, [x1], #16           // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v5.16b
        ldr     q5, [x0], #16
        eor     v6.16b, v6.16b, v8.16b
        ldr     q15, [x0]               // last ciphertext quadword becomes next IV
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q6, [x1], #16
        eor     v1.16b, v3.16b, v5.16b
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
        sub     x0, x0, #0x60
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16           // NOTE(review): SOURCE view is truncated mid-line here
reload input1125eor v0.16b, v0.16b, v15.16b // ^= IV1126ldr q5, [x0], #161127ldr q8, [x0], #161128ldr q9, [x0], #161129str q0, [x1], #16 // write output1130ldr q0, [x0], #161131eor v1.16b, v1.16b, v3.16b1132ldr q15, [x0]1133eor v3.16b, v6.16b, v5.16b1134eor v4.16b, v4.16b, v8.16b1135eor v2.16b, v2.16b, v9.16b1136str q1, [x1], #161137eor v0.16b, v7.16b, v0.16b1138str q3, [x1], #161139str q4, [x1], #161140str q2, [x1], #161141str q0, [x1]1142b .Lcbc_dec_done1143.align 41144.Lcbc_dec_five:1145sub x0, x0, #0x501146bl _bsaes_decrypt81147ldr q3, [x0], #16 // reload input1148eor v0.16b, v0.16b, v15.16b // ^= IV1149ldr q5, [x0], #161150ldr q7, [x0], #161151ldr q8, [x0], #161152str q0, [x1], #16 // write output1153ldr q15, [x0]1154eor v0.16b, v1.16b, v3.16b1155eor v1.16b, v6.16b, v5.16b1156eor v3.16b, v4.16b, v7.16b1157str q0, [x1], #161158eor v0.16b, v2.16b, v8.16b1159str q1, [x1], #161160str q3, [x1], #161161str q0, [x1]1162b .Lcbc_dec_done1163.align 41164.Lcbc_dec_four:1165sub x0, x0, #0x401166bl _bsaes_decrypt81167ldr q2, [x0], #16 // reload input1168eor v0.16b, v0.16b, v15.16b // ^= IV1169ldr q3, [x0], #161170ldr q5, [x0], #161171str q0, [x1], #16 // write output1172ldr q15, [x0]1173eor v0.16b, v1.16b, v2.16b1174eor v1.16b, v6.16b, v3.16b1175eor v2.16b, v4.16b, v5.16b1176str q0, [x1], #161177str q1, [x1], #161178str q2, [x1]1179b .Lcbc_dec_done1180.align 41181.Lcbc_dec_three:1182sub x0, x0, #0x301183bl _bsaes_decrypt81184ldr q2, [x0], #16 // reload input1185eor v0.16b, v0.16b, v15.16b // ^= IV1186ldr q3, [x0], #161187ldr q15, [x0]1188str q0, [x1], #16 // write output1189eor v0.16b, v1.16b, v2.16b1190eor v1.16b, v6.16b, v3.16b1191str q0, [x1], #161192str q1, [x1]1193b .Lcbc_dec_done1194.align 41195.Lcbc_dec_two:1196sub x0, x0, #0x201197bl _bsaes_decrypt81198ldr q2, [x0], #16 // reload input1199eor v0.16b, v0.16b, v15.16b // ^= IV1200ldr q15, [x0]1201str q0, [x1], #16 // write output1202eor v0.16b, v1.16b, v2.16b1203str q0, [x1]1204b .Lcbc_dec_done1205.align 
41206.Lcbc_dec_one:1207sub x0, x0, #0x101208stp x1, x4, [sp, #-32]!1209str x14, [sp, #16]1210mov v8.16b, v15.16b1211mov v15.16b, v0.16b1212mov x2, x31213bl AES_decrypt1214ldr x14, [sp, #16]1215ldp x1, x4, [sp], #321216ldr q0, [x1] // load result1217eor v0.16b, v0.16b, v8.16b // ^= IV1218str q0, [x1] // write output12191220.align 41221.Lcbc_dec_done:1222movi v0.16b, #01223movi v1.16b, #01224.Lcbc_dec_bzero: // wipe key schedule [if any]1225stp q0, q1, [sp], #321226cmp sp, x141227bne .Lcbc_dec_bzero1228str q15, [x4] // return IV1229ldp d8, d9, [sp, #16]1230ldp d10, d15, [sp, #32]1231ldp x29, x30, [sp], #481232ret1233.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt12341235.globl ossl_bsaes_ctr32_encrypt_blocks1236.type ossl_bsaes_ctr32_encrypt_blocks,%function1237.align 41238// On entry:1239// x0 -> input text (whole 16-byte blocks)1240// x1 -> output text (whole 16-byte blocks)1241// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)1242// x3 -> key1243// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block1244// On exit:1245// Output text filled in1246// No output registers, usual AAPCS64 register preservation1247ossl_bsaes_ctr32_encrypt_blocks:1248AARCH64_VALID_CALL_TARGET1249cmp x2, #8 // use plain AES for1250blo .Lctr_enc_short // small sizes12511252stp x29, x30, [sp, #-80]!1253stp d8, d9, [sp, #16]1254stp d10, d11, [sp, #32]1255stp d12, d13, [sp, #48]1256stp d14, d15, [sp, #64]12571258ldr w15, [x3, #240] // get # of rounds1259mov x14, sp12601261// allocate the key schedule on the stack1262add x17, sp, #961263sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes12641265// populate the key schedule1266mov x9, x3 // pass key1267mov x10, x15 // pass # of rounds1268mov sp, x17 // sp is sp1269bl _bsaes_key_convert1270eor v7.16b, v7.16b, v15.16b // fix up last round key1271str q7, [x17] // save last round key12721273ldr q0, [x4] // load counter1274add x13, x11, 
#.LREVM0SR-.LM0_bigendian1275ldr q4, [sp] // load round0 key12761277movi v8.4s, #1 // compose 1<<961278movi v9.16b, #01279rev32 v15.16b, v0.16b1280rev32 v0.16b, v0.16b1281ext v11.16b, v9.16b, v8.16b, #41282rev32 v4.16b, v4.16b1283add v12.4s, v11.4s, v11.4s // compose 2<<961284str q4, [sp] // save adjusted round0 key1285add v13.4s, v11.4s, v12.4s // compose 3<<961286add v14.4s, v12.4s, v12.4s // compose 4<<961287b .Lctr_enc_loop12881289.align 41290.Lctr_enc_loop:1291// Intermix prologue from _bsaes_encrypt8 to use the opportunity1292// to flip byte order in 32-bit counter12931294add v1.4s, v15.4s, v11.4s // +11295add x9, sp, #0x10 // pass next round key1296add v2.4s, v15.4s, v12.4s // +21297ldr q9, [x13] // .LREVM0SR1298ldr q8, [sp] // load round0 key1299add v3.4s, v15.4s, v13.4s // +31300mov x10, x15 // pass rounds1301sub x11, x13, #.LREVM0SR-.LSR // pass constants1302add v6.4s, v2.4s, v14.4s1303add v4.4s, v15.4s, v14.4s // +41304add v7.4s, v3.4s, v14.4s1305add v15.4s, v4.4s, v14.4s // next counter1306add v5.4s, v1.4s, v14.4s13071308bl _bsaes_encrypt8_alt13091310subs x2, x2, #81311blo .Lctr_enc_loop_done13121313ldr q16, [x0], #161314ldr q17, [x0], #161315eor v1.16b, v1.16b, v17.16b1316ldr q17, [x0], #161317eor v0.16b, v0.16b, v16.16b1318eor v4.16b, v4.16b, v17.16b1319str q0, [x1], #161320ldr q16, [x0], #161321str q1, [x1], #161322mov v0.16b, v15.16b1323str q4, [x1], #161324ldr q1, [x0], #161325eor v4.16b, v6.16b, v16.16b1326eor v1.16b, v3.16b, v1.16b1327ldr q3, [x0], #161328eor v3.16b, v7.16b, v3.16b1329ldr q6, [x0], #161330eor v2.16b, v2.16b, v6.16b1331ldr q6, [x0], #161332eor v5.16b, v5.16b, v6.16b1333str q4, [x1], #161334str q1, [x1], #161335str q3, [x1], #161336str q2, [x1], #161337str q5, [x1], #1613381339bne .Lctr_enc_loop1340b .Lctr_enc_done13411342.align 41343.Lctr_enc_loop_done:1344add x2, x2, #81345ldr q16, [x0], #16 // load input1346eor v0.16b, v0.16b, v16.16b1347str q0, [x1], #16 // write output1348cmp x2, #21349blo .Lctr_enc_done1350ldr q17, [x0], 
#161351eor v1.16b, v1.16b, v17.16b1352str q1, [x1], #161353beq .Lctr_enc_done1354ldr q18, [x0], #161355eor v4.16b, v4.16b, v18.16b1356str q4, [x1], #161357cmp x2, #41358blo .Lctr_enc_done1359ldr q19, [x0], #161360eor v6.16b, v6.16b, v19.16b1361str q6, [x1], #161362beq .Lctr_enc_done1363ldr q20, [x0], #161364eor v3.16b, v3.16b, v20.16b1365str q3, [x1], #161366cmp x2, #61367blo .Lctr_enc_done1368ldr q21, [x0], #161369eor v7.16b, v7.16b, v21.16b1370str q7, [x1], #161371beq .Lctr_enc_done1372ldr q22, [x0]1373eor v2.16b, v2.16b, v22.16b1374str q2, [x1], #1613751376.Lctr_enc_done:1377movi v0.16b, #01378movi v1.16b, #01379.Lctr_enc_bzero: // wipe key schedule [if any]1380stp q0, q1, [sp], #321381cmp sp, x141382bne .Lctr_enc_bzero13831384ldp d8, d9, [sp, #16]1385ldp d10, d11, [sp, #32]1386ldp d12, d13, [sp, #48]1387ldp d14, d15, [sp, #64]1388ldp x29, x30, [sp], #801389ret13901391.Lctr_enc_short:1392stp x29, x30, [sp, #-96]!1393stp x19, x20, [sp, #16]1394stp x21, x22, [sp, #32]1395str x23, [sp, #48]13961397mov x19, x0 // copy arguments1398mov x20, x11399mov x21, x21400mov x22, x31401ldr w23, [x4, #12] // load counter .LSW1402ldr q1, [x4] // load whole counter value1403#ifdef __AARCH64EL__1404rev w23, w231405#endif1406str q1, [sp, #80] // copy counter value14071408.Lctr_enc_short_loop:1409add x0, sp, #80 // input counter value1410add x1, sp, #64 // output on the stack1411mov x2, x22 // key14121413bl AES_encrypt14141415ldr q0, [x19], #16 // load input1416ldr q1, [sp, #64] // load encrypted counter1417add x23, x23, #11418#ifdef __AARCH64EL__1419rev w0, w231420str w0, [sp, #80+12] // next counter value1421#else1422str w23, [sp, #80+12] // next counter value1423#endif1424eor v0.16b, v0.16b, v1.16b1425str q0, [x20], #16 // store output1426subs x21, x21, #11427bne .Lctr_enc_short_loop14281429movi v0.16b, #01430movi v1.16b, #01431stp q0, q1, [sp, #64]14321433ldr x23, [sp, #48]1434ldp x21, x22, [sp, #32]1435ldp x19, x20, [sp, #16]1436ldp x29, x30, [sp], #961437ret1438.size 
ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks14391440.globl ossl_bsaes_xts_encrypt1441.type ossl_bsaes_xts_encrypt,%function1442.align 41443// On entry:1444// x0 -> input plaintext1445// x1 -> output ciphertext1446// x2 -> length of text in bytes (must be at least 16)1447// x3 -> key1 (used to encrypt the XORed plaintext blocks)1448// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)1449// x5 -> 16-byte initial vector (typically, sector number)1450// On exit:1451// Output ciphertext filled in1452// No output registers, usual AAPCS64 register preservation1453ossl_bsaes_xts_encrypt:1454AARCH64_VALID_CALL_TARGET1455// Stack layout:1456// sp ->1457// nrounds*128-96 bytes: key schedule1458// x19 ->1459// 16 bytes: frame record1460// 4*16 bytes: tweak storage across _bsaes_encrypt81461// 6*8 bytes: storage for 5 callee-saved general-purpose registers1462// 8*8 bytes: storage for 8 callee-saved SIMD registers1463stp x29, x30, [sp, #-192]!1464stp x19, x20, [sp, #80]1465stp x21, x22, [sp, #96]1466str x23, [sp, #112]1467stp d8, d9, [sp, #128]1468stp d10, d11, [sp, #144]1469stp d12, d13, [sp, #160]1470stp d14, d15, [sp, #176]14711472mov x19, sp1473mov x20, x01474mov x21, x11475mov x22, x21476mov x23, x314771478// generate initial tweak1479sub sp, sp, #161480mov x0, x5 // iv[]1481mov x1, sp1482mov x2, x4 // key21483bl AES_encrypt1484ldr q11, [sp], #1614851486ldr w1, [x23, #240] // get # of rounds1487// allocate the key schedule on the stack1488add x17, sp, #961489sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes14901491// populate the key schedule1492mov x9, x23 // pass key1493mov x10, x1 // pass # of rounds1494mov sp, x171495bl _bsaes_key_convert1496eor v15.16b, v15.16b, v7.16b // fix up last round key1497str q15, [x17] // save last round key14981499subs x22, x22, #0x801500blo .Lxts_enc_short1501b .Lxts_enc_loop15021503.align 41504.Lxts_enc_loop:1505ldr q8, .Lxts_magic1506mov x10, x1 // pass rounds1507add x2, 
x19, #161508ldr q0, [x20], #161509sshr v1.2d, v11.2d, #631510mov x9, sp // pass key schedule1511ldr q6, .Lxts_magic+161512add v2.2d, v11.2d, v11.2d1513cmtst v3.2d, v11.2d, v6.2d1514and v1.16b, v1.16b, v8.16b1515ext v1.16b, v1.16b, v1.16b, #81516and v3.16b, v3.16b, v8.16b1517ldr q4, [x20], #161518eor v12.16b, v2.16b, v1.16b1519eor v1.16b, v4.16b, v12.16b1520eor v0.16b, v0.16b, v11.16b1521cmtst v2.2d, v12.2d, v6.2d1522add v4.2d, v12.2d, v12.2d1523add x0, x19, #161524ext v3.16b, v3.16b, v3.16b, #81525and v2.16b, v2.16b, v8.16b1526eor v13.16b, v4.16b, v3.16b1527ldr q3, [x20], #161528ext v4.16b, v2.16b, v2.16b, #81529eor v2.16b, v3.16b, v13.16b1530ldr q3, [x20], #161531add v5.2d, v13.2d, v13.2d1532cmtst v7.2d, v13.2d, v6.2d1533and v7.16b, v7.16b, v8.16b1534ldr q9, [x20], #161535ext v7.16b, v7.16b, v7.16b, #81536ldr q10, [x20], #161537eor v14.16b, v5.16b, v4.16b1538ldr q16, [x20], #161539add v4.2d, v14.2d, v14.2d1540eor v3.16b, v3.16b, v14.16b1541eor v15.16b, v4.16b, v7.16b1542add v5.2d, v15.2d, v15.2d1543ldr q7, [x20], #161544cmtst v4.2d, v14.2d, v6.2d1545and v17.16b, v4.16b, v8.16b1546cmtst v18.2d, v15.2d, v6.2d1547eor v4.16b, v9.16b, v15.16b1548ext v9.16b, v17.16b, v17.16b, #81549eor v9.16b, v5.16b, v9.16b1550add v17.2d, v9.2d, v9.2d1551and v18.16b, v18.16b, v8.16b1552eor v5.16b, v10.16b, v9.16b1553str q9, [x2], #161554ext v10.16b, v18.16b, v18.16b, #81555cmtst v9.2d, v9.2d, v6.2d1556and v9.16b, v9.16b, v8.16b1557eor v10.16b, v17.16b, v10.16b1558cmtst v17.2d, v10.2d, v6.2d1559eor v6.16b, v16.16b, v10.16b1560str q10, [x2], #161561ext v9.16b, v9.16b, v9.16b, #81562add v10.2d, v10.2d, v10.2d1563eor v9.16b, v10.16b, v9.16b1564str q9, [x2], #161565eor v7.16b, v7.16b, v9.16b1566add v9.2d, v9.2d, v9.2d1567and v8.16b, v17.16b, v8.16b1568ext v8.16b, v8.16b, v8.16b, #81569eor v8.16b, v9.16b, v8.16b1570str q8, [x2] // next round tweak15711572bl _bsaes_encrypt815731574ldr q8, [x0], #161575eor v0.16b, v0.16b, v11.16b1576eor v1.16b, v1.16b, v12.16b1577ldr q9, [x0], #161578eor 
v4.16b, v4.16b, v13.16b1579eor v6.16b, v6.16b, v14.16b1580ldr q10, [x0], #161581eor v3.16b, v3.16b, v15.16b1582subs x22, x22, #0x801583str q0, [x21], #161584ldr q11, [x0] // next round tweak1585str q1, [x21], #161586eor v0.16b, v7.16b, v8.16b1587eor v1.16b, v2.16b, v9.16b1588str q4, [x21], #161589eor v2.16b, v5.16b, v10.16b1590str q6, [x21], #161591str q3, [x21], #161592str q0, [x21], #161593str q1, [x21], #161594str q2, [x21], #161595bpl .Lxts_enc_loop15961597.Lxts_enc_short:1598adds x22, x22, #0x701599bmi .Lxts_enc_done16001601ldr q8, .Lxts_magic1602sshr v1.2d, v11.2d, #631603add v2.2d, v11.2d, v11.2d1604ldr q9, .Lxts_magic+161605subs x22, x22, #0x101606ldr q0, [x20], #161607and v1.16b, v1.16b, v8.16b1608cmtst v3.2d, v11.2d, v9.2d1609ext v1.16b, v1.16b, v1.16b, #81610and v3.16b, v3.16b, v8.16b1611eor v12.16b, v2.16b, v1.16b1612ext v1.16b, v3.16b, v3.16b, #81613add v2.2d, v12.2d, v12.2d1614cmtst v3.2d, v12.2d, v9.2d1615eor v13.16b, v2.16b, v1.16b1616and v22.16b, v3.16b, v8.16b1617bmi .Lxts_enc_116181619ext v2.16b, v22.16b, v22.16b, #81620add v3.2d, v13.2d, v13.2d1621ldr q1, [x20], #161622cmtst v4.2d, v13.2d, v9.2d1623subs x22, x22, #0x101624eor v14.16b, v3.16b, v2.16b1625and v23.16b, v4.16b, v8.16b1626bmi .Lxts_enc_216271628ext v3.16b, v23.16b, v23.16b, #81629add v4.2d, v14.2d, v14.2d1630ldr q2, [x20], #161631cmtst v5.2d, v14.2d, v9.2d1632eor v0.16b, v0.16b, v11.16b1633subs x22, x22, #0x101634eor v15.16b, v4.16b, v3.16b1635and v24.16b, v5.16b, v8.16b1636bmi .Lxts_enc_316371638ext v4.16b, v24.16b, v24.16b, #81639add v5.2d, v15.2d, v15.2d1640ldr q3, [x20], #161641cmtst v6.2d, v15.2d, v9.2d1642eor v1.16b, v1.16b, v12.16b1643subs x22, x22, #0x101644eor v16.16b, v5.16b, v4.16b1645and v25.16b, v6.16b, v8.16b1646bmi .Lxts_enc_416471648ext v5.16b, v25.16b, v25.16b, #81649add v6.2d, v16.2d, v16.2d1650add x0, x19, #161651cmtst v7.2d, v16.2d, v9.2d1652ldr q4, [x20], #161653eor v2.16b, v2.16b, v13.16b1654str q16, [x0], #161655subs x22, x22, #0x101656eor v17.16b, v6.16b, 
v5.16b1657and v26.16b, v7.16b, v8.16b1658bmi .Lxts_enc_516591660ext v7.16b, v26.16b, v26.16b, #81661add v18.2d, v17.2d, v17.2d1662ldr q5, [x20], #161663eor v3.16b, v3.16b, v14.16b1664str q17, [x0], #161665subs x22, x22, #0x101666eor v18.16b, v18.16b, v7.16b1667bmi .Lxts_enc_616681669ldr q6, [x20], #161670eor v4.16b, v4.16b, v15.16b1671eor v5.16b, v5.16b, v16.16b1672str q18, [x0] // next round tweak1673mov x9, sp // pass key schedule1674mov x10, x11675add x0, x19, #161676sub x22, x22, #0x101677eor v6.16b, v6.16b, v17.16b16781679bl _bsaes_encrypt816801681ldr q16, [x0], #161682eor v0.16b, v0.16b, v11.16b1683eor v1.16b, v1.16b, v12.16b1684ldr q17, [x0], #161685eor v4.16b, v4.16b, v13.16b1686eor v6.16b, v6.16b, v14.16b1687eor v3.16b, v3.16b, v15.16b1688ldr q11, [x0] // next round tweak1689str q0, [x21], #161690str q1, [x21], #161691eor v0.16b, v7.16b, v16.16b1692eor v1.16b, v2.16b, v17.16b1693str q4, [x21], #161694str q6, [x21], #161695str q3, [x21], #161696str q0, [x21], #161697str q1, [x21], #161698b .Lxts_enc_done16991700.align 41701.Lxts_enc_6:1702eor v4.16b, v4.16b, v15.16b1703eor v5.16b, v5.16b, v16.16b1704mov x9, sp // pass key schedule1705mov x10, x1 // pass rounds1706add x0, x19, #1617071708bl _bsaes_encrypt817091710ldr q16, [x0], #161711eor v0.16b, v0.16b, v11.16b1712eor v1.16b, v1.16b, v12.16b1713eor v4.16b, v4.16b, v13.16b1714eor v6.16b, v6.16b, v14.16b1715ldr q11, [x0] // next round tweak1716eor v3.16b, v3.16b, v15.16b1717str q0, [x21], #161718str q1, [x21], #161719eor v0.16b, v7.16b, v16.16b1720str q4, [x21], #161721str q6, [x21], #161722str q3, [x21], #161723str q0, [x21], #161724b .Lxts_enc_done17251726.align 41727.Lxts_enc_5:1728eor v3.16b, v3.16b, v14.16b1729eor v4.16b, v4.16b, v15.16b1730mov x9, sp // pass key schedule1731mov x10, x1 // pass rounds1732add x0, x19, #1617331734bl _bsaes_encrypt817351736eor v0.16b, v0.16b, v11.16b1737eor v1.16b, v1.16b, v12.16b1738ldr q11, [x0] // next round tweak1739eor v4.16b, v4.16b, v13.16b1740eor v6.16b, v6.16b, 
v14.16b1741eor v3.16b, v3.16b, v15.16b1742str q0, [x21], #161743str q1, [x21], #161744str q4, [x21], #161745str q6, [x21], #161746str q3, [x21], #161747b .Lxts_enc_done17481749.align 41750.Lxts_enc_4:1751eor v2.16b, v2.16b, v13.16b1752eor v3.16b, v3.16b, v14.16b1753mov x9, sp // pass key schedule1754mov x10, x1 // pass rounds1755add x0, x19, #1617561757bl _bsaes_encrypt817581759eor v0.16b, v0.16b, v11.16b1760eor v1.16b, v1.16b, v12.16b1761eor v4.16b, v4.16b, v13.16b1762eor v6.16b, v6.16b, v14.16b1763mov v11.16b, v15.16b // next round tweak1764str q0, [x21], #161765str q1, [x21], #161766str q4, [x21], #161767str q6, [x21], #161768b .Lxts_enc_done17691770.align 41771.Lxts_enc_3:1772eor v1.16b, v1.16b, v12.16b1773eor v2.16b, v2.16b, v13.16b1774mov x9, sp // pass key schedule1775mov x10, x1 // pass rounds1776add x0, x19, #1617771778bl _bsaes_encrypt817791780eor v0.16b, v0.16b, v11.16b1781eor v1.16b, v1.16b, v12.16b1782eor v4.16b, v4.16b, v13.16b1783mov v11.16b, v14.16b // next round tweak1784str q0, [x21], #161785str q1, [x21], #161786str q4, [x21], #161787b .Lxts_enc_done17881789.align 41790.Lxts_enc_2:1791eor v0.16b, v0.16b, v11.16b1792eor v1.16b, v1.16b, v12.16b1793mov x9, sp // pass key schedule1794mov x10, x1 // pass rounds1795add x0, x19, #1617961797bl _bsaes_encrypt817981799eor v0.16b, v0.16b, v11.16b1800eor v1.16b, v1.16b, v12.16b1801mov v11.16b, v13.16b // next round tweak1802str q0, [x21], #161803str q1, [x21], #161804b .Lxts_enc_done18051806.align 41807.Lxts_enc_1:1808eor v0.16b, v0.16b, v11.16b1809sub x0, sp, #161810sub x1, sp, #161811mov x2, x231812mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers1813mov v14.d[0], v12.d[1]1814str q0, [sp, #-16]!18151816bl AES_encrypt18171818ldr q0, [sp], #161819trn1 v13.2d, v11.2d, v13.2d1820trn1 v11.2d, v12.2d, v14.2d // next round tweak1821eor v0.16b, v0.16b, v13.16b1822str q0, [x21], #1618231824.Lxts_enc_done:1825adds x22, x22, #0x101826beq .Lxts_enc_ret18271828sub x6, 
x21, #0x101829// Penultimate plaintext block produces final ciphertext part-block1830// plus remaining part of final plaintext block. Move ciphertext part1831// to final position and reuse penultimate ciphertext block buffer to1832// construct final plaintext block1833.Lxts_enc_steal:1834ldrb w0, [x20], #11835ldrb w1, [x21, #-0x10]1836strb w0, [x21, #-0x10]1837strb w1, [x21], #118381839subs x22, x22, #11840bhi .Lxts_enc_steal18411842// Finally encrypt the penultimate ciphertext block using the1843// last tweak1844ldr q0, [x6]1845eor v0.16b, v0.16b, v11.16b1846str q0, [sp, #-16]!1847mov x0, sp1848mov x1, sp1849mov x2, x231850mov x21, x61851mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers18521853bl AES_encrypt18541855trn1 v11.2d, v11.2d, v13.2d1856ldr q0, [sp], #161857eor v0.16b, v0.16b, v11.16b1858str q0, [x21]18591860.Lxts_enc_ret:18611862movi v0.16b, #01863movi v1.16b, #01864.Lxts_enc_bzero: // wipe key schedule1865stp q0, q1, [sp], #321866cmp sp, x191867bne .Lxts_enc_bzero18681869ldp x19, x20, [sp, #80]1870ldp x21, x22, [sp, #96]1871ldr x23, [sp, #112]1872ldp d8, d9, [sp, #128]1873ldp d10, d11, [sp, #144]1874ldp d12, d13, [sp, #160]1875ldp d14, d15, [sp, #176]1876ldp x29, x30, [sp], #1921877ret1878.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt18791880// The assembler doesn't seem capable of de-duplicating these when expressed1881// using `ldr qd,=` syntax, so assign a symbolic address1882.align 51883.Lxts_magic:1884.quad 1, 0x87, 0x4000000000000000, 0x400000000000000018851886.globl ossl_bsaes_xts_decrypt1887.type ossl_bsaes_xts_decrypt,%function1888.align 41889// On entry:1890// x0 -> input ciphertext1891// x1 -> output plaintext1892// x2 -> length of text in bytes (must be at least 16)1893// x3 -> key1 (used to decrypt the XORed ciphertext blocks)1894// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)1895// x5 -> 16-byte initial vector (typically, sector number)1896// On 
exit:1897// Output plaintext filled in1898// No output registers, usual AAPCS64 register preservation1899ossl_bsaes_xts_decrypt:1900AARCH64_VALID_CALL_TARGET1901// Stack layout:1902// sp ->1903// nrounds*128-96 bytes: key schedule1904// x19 ->1905// 16 bytes: frame record1906// 4*16 bytes: tweak storage across _bsaes_decrypt81907// 6*8 bytes: storage for 5 callee-saved general-purpose registers1908// 8*8 bytes: storage for 8 callee-saved SIMD registers1909stp x29, x30, [sp, #-192]!1910stp x19, x20, [sp, #80]1911stp x21, x22, [sp, #96]1912str x23, [sp, #112]1913stp d8, d9, [sp, #128]1914stp d10, d11, [sp, #144]1915stp d12, d13, [sp, #160]1916stp d14, d15, [sp, #176]19171918mov x19, sp1919mov x20, x01920mov x21, x11921mov x22, x21922mov x23, x319231924// generate initial tweak1925sub sp, sp, #161926mov x0, x5 // iv[]1927mov x1, sp1928mov x2, x4 // key21929bl AES_encrypt1930ldr q11, [sp], #1619311932ldr w1, [x23, #240] // get # of rounds1933// allocate the key schedule on the stack1934add x17, sp, #961935sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes19361937// populate the key schedule1938mov x9, x23 // pass key1939mov x10, x1 // pass # of rounds1940mov sp, x171941bl _bsaes_key_convert1942ldr q6, [sp]1943str q15, [x17] // save last round key1944eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)1945str q6, [sp]19461947sub x30, x22, #0x101948tst x22, #0xf // if not multiple of 161949csel x22, x30, x22, ne // subtract another 16 bytes1950subs x22, x22, #0x8019511952blo .Lxts_dec_short1953b .Lxts_dec_loop19541955.align 41956.Lxts_dec_loop:1957ldr q8, .Lxts_magic1958mov x10, x1 // pass rounds1959add x2, x19, #161960ldr q0, [x20], #161961sshr v1.2d, v11.2d, #631962mov x9, sp // pass key schedule1963ldr q6, .Lxts_magic+161964add v2.2d, v11.2d, v11.2d1965cmtst v3.2d, v11.2d, v6.2d1966and v1.16b, v1.16b, v8.16b1967ext v1.16b, v1.16b, v1.16b, #81968and v3.16b, v3.16b, v8.16b1969ldr q4, [x20], #161970eor v12.16b, v2.16b, 
v1.16b1971eor v1.16b, v4.16b, v12.16b1972eor v0.16b, v0.16b, v11.16b1973cmtst v2.2d, v12.2d, v6.2d1974add v4.2d, v12.2d, v12.2d1975add x0, x19, #161976ext v3.16b, v3.16b, v3.16b, #81977and v2.16b, v2.16b, v8.16b1978eor v13.16b, v4.16b, v3.16b1979ldr q3, [x20], #161980ext v4.16b, v2.16b, v2.16b, #81981eor v2.16b, v3.16b, v13.16b1982ldr q3, [x20], #161983add v5.2d, v13.2d, v13.2d1984cmtst v7.2d, v13.2d, v6.2d1985and v7.16b, v7.16b, v8.16b1986ldr q9, [x20], #161987ext v7.16b, v7.16b, v7.16b, #81988ldr q10, [x20], #161989eor v14.16b, v5.16b, v4.16b1990ldr q16, [x20], #161991add v4.2d, v14.2d, v14.2d1992eor v3.16b, v3.16b, v14.16b1993eor v15.16b, v4.16b, v7.16b1994add v5.2d, v15.2d, v15.2d1995ldr q7, [x20], #161996cmtst v4.2d, v14.2d, v6.2d1997and v17.16b, v4.16b, v8.16b1998cmtst v18.2d, v15.2d, v6.2d1999eor v4.16b, v9.16b, v15.16b2000ext v9.16b, v17.16b, v17.16b, #82001eor v9.16b, v5.16b, v9.16b2002add v17.2d, v9.2d, v9.2d2003and v18.16b, v18.16b, v8.16b2004eor v5.16b, v10.16b, v9.16b2005str q9, [x2], #162006ext v10.16b, v18.16b, v18.16b, #82007cmtst v9.2d, v9.2d, v6.2d2008and v9.16b, v9.16b, v8.16b2009eor v10.16b, v17.16b, v10.16b2010cmtst v17.2d, v10.2d, v6.2d2011eor v6.16b, v16.16b, v10.16b2012str q10, [x2], #162013ext v9.16b, v9.16b, v9.16b, #82014add v10.2d, v10.2d, v10.2d2015eor v9.16b, v10.16b, v9.16b2016str q9, [x2], #162017eor v7.16b, v7.16b, v9.16b2018add v9.2d, v9.2d, v9.2d2019and v8.16b, v17.16b, v8.16b2020ext v8.16b, v8.16b, v8.16b, #82021eor v8.16b, v9.16b, v8.16b2022str q8, [x2] // next round tweak20232024bl _bsaes_decrypt820252026eor v6.16b, v6.16b, v13.16b2027eor v0.16b, v0.16b, v11.16b2028ldr q8, [x0], #162029eor v7.16b, v7.16b, v8.16b2030str q0, [x21], #162031eor v0.16b, v1.16b, v12.16b2032ldr q1, [x0], #162033eor v1.16b, v3.16b, v1.16b2034subs x22, x22, #0x802035eor v2.16b, v2.16b, v15.16b2036eor v3.16b, v4.16b, v14.16b2037ldr q4, [x0], #162038str q0, [x21], #162039ldr q11, [x0] // next round tweak2040eor v0.16b, v5.16b, v4.16b2041str q6, [x21], 
#162042str q3, [x21], #162043str q2, [x21], #162044str q7, [x21], #162045str q1, [x21], #162046str q0, [x21], #162047bpl .Lxts_dec_loop20482049.Lxts_dec_short:2050adds x22, x22, #0x702051bmi .Lxts_dec_done20522053ldr q8, .Lxts_magic2054sshr v1.2d, v11.2d, #632055add v2.2d, v11.2d, v11.2d2056ldr q9, .Lxts_magic+162057subs x22, x22, #0x102058ldr q0, [x20], #162059and v1.16b, v1.16b, v8.16b2060cmtst v3.2d, v11.2d, v9.2d2061ext v1.16b, v1.16b, v1.16b, #82062and v3.16b, v3.16b, v8.16b2063eor v12.16b, v2.16b, v1.16b2064ext v1.16b, v3.16b, v3.16b, #82065add v2.2d, v12.2d, v12.2d2066cmtst v3.2d, v12.2d, v9.2d2067eor v13.16b, v2.16b, v1.16b2068and v22.16b, v3.16b, v8.16b2069bmi .Lxts_dec_120702071ext v2.16b, v22.16b, v22.16b, #82072add v3.2d, v13.2d, v13.2d2073ldr q1, [x20], #162074cmtst v4.2d, v13.2d, v9.2d2075subs x22, x22, #0x102076eor v14.16b, v3.16b, v2.16b2077and v23.16b, v4.16b, v8.16b2078bmi .Lxts_dec_220792080ext v3.16b, v23.16b, v23.16b, #82081add v4.2d, v14.2d, v14.2d2082ldr q2, [x20], #162083cmtst v5.2d, v14.2d, v9.2d2084eor v0.16b, v0.16b, v11.16b2085subs x22, x22, #0x102086eor v15.16b, v4.16b, v3.16b2087and v24.16b, v5.16b, v8.16b2088bmi .Lxts_dec_320892090ext v4.16b, v24.16b, v24.16b, #82091add v5.2d, v15.2d, v15.2d2092ldr q3, [x20], #162093cmtst v6.2d, v15.2d, v9.2d2094eor v1.16b, v1.16b, v12.16b2095subs x22, x22, #0x102096eor v16.16b, v5.16b, v4.16b2097and v25.16b, v6.16b, v8.16b2098bmi .Lxts_dec_420992100ext v5.16b, v25.16b, v25.16b, #82101add v6.2d, v16.2d, v16.2d2102add x0, x19, #162103cmtst v7.2d, v16.2d, v9.2d2104ldr q4, [x20], #162105eor v2.16b, v2.16b, v13.16b2106str q16, [x0], #162107subs x22, x22, #0x102108eor v17.16b, v6.16b, v5.16b2109and v26.16b, v7.16b, v8.16b2110bmi .Lxts_dec_521112112ext v7.16b, v26.16b, v26.16b, #82113add v18.2d, v17.2d, v17.2d2114ldr q5, [x20], #162115eor v3.16b, v3.16b, v14.16b2116str q17, [x0], #162117subs x22, x22, #0x102118eor v18.16b, v18.16b, v7.16b2119bmi .Lxts_dec_621202121ldr q6, [x20], #162122eor v4.16b, v4.16b, 
v15.16b2123eor v5.16b, v5.16b, v16.16b2124str q18, [x0] // next round tweak2125mov x9, sp // pass key schedule2126mov x10, x12127add x0, x19, #162128sub x22, x22, #0x102129eor v6.16b, v6.16b, v17.16b21302131bl _bsaes_decrypt821322133ldr q16, [x0], #162134eor v0.16b, v0.16b, v11.16b2135eor v1.16b, v1.16b, v12.16b2136ldr q17, [x0], #162137eor v6.16b, v6.16b, v13.16b2138eor v4.16b, v4.16b, v14.16b2139eor v2.16b, v2.16b, v15.16b2140ldr q11, [x0] // next round tweak2141str q0, [x21], #162142str q1, [x21], #162143eor v0.16b, v7.16b, v16.16b2144eor v1.16b, v3.16b, v17.16b2145str q6, [x21], #162146str q4, [x21], #162147str q2, [x21], #162148str q0, [x21], #162149str q1, [x21], #162150b .Lxts_dec_done21512152.align 42153.Lxts_dec_6:2154eor v4.16b, v4.16b, v15.16b2155eor v5.16b, v5.16b, v16.16b2156mov x9, sp // pass key schedule2157mov x10, x1 // pass rounds2158add x0, x19, #1621592160bl _bsaes_decrypt821612162ldr q16, [x0], #162163eor v0.16b, v0.16b, v11.16b2164eor v1.16b, v1.16b, v12.16b2165eor v6.16b, v6.16b, v13.16b2166eor v4.16b, v4.16b, v14.16b2167ldr q11, [x0] // next round tweak2168eor v2.16b, v2.16b, v15.16b2169str q0, [x21], #162170str q1, [x21], #162171eor v0.16b, v7.16b, v16.16b2172str q6, [x21], #162173str q4, [x21], #162174str q2, [x21], #162175str q0, [x21], #162176b .Lxts_dec_done21772178.align 42179.Lxts_dec_5:2180eor v3.16b, v3.16b, v14.16b2181eor v4.16b, v4.16b, v15.16b2182mov x9, sp // pass key schedule2183mov x10, x1 // pass rounds2184add x0, x19, #1621852186bl _bsaes_decrypt821872188eor v0.16b, v0.16b, v11.16b2189eor v1.16b, v1.16b, v12.16b2190ldr q11, [x0] // next round tweak2191eor v6.16b, v6.16b, v13.16b2192eor v4.16b, v4.16b, v14.16b2193eor v2.16b, v2.16b, v15.16b2194str q0, [x21], #162195str q1, [x21], #162196str q6, [x21], #162197str q4, [x21], #162198str q2, [x21], #162199b .Lxts_dec_done22002201.align 42202.Lxts_dec_4:2203eor v2.16b, v2.16b, v13.16b2204eor v3.16b, v3.16b, v14.16b2205mov x9, sp // pass key schedule2206mov x10, x1 // pass 
rounds2207add x0, x19, #1622082209bl _bsaes_decrypt822102211eor v0.16b, v0.16b, v11.16b2212eor v1.16b, v1.16b, v12.16b2213eor v6.16b, v6.16b, v13.16b2214eor v4.16b, v4.16b, v14.16b2215mov v11.16b, v15.16b // next round tweak2216str q0, [x21], #162217str q1, [x21], #162218str q6, [x21], #162219str q4, [x21], #162220b .Lxts_dec_done22212222.align 42223.Lxts_dec_3:2224eor v1.16b, v1.16b, v12.16b2225eor v2.16b, v2.16b, v13.16b2226mov x9, sp // pass key schedule2227mov x10, x1 // pass rounds2228add x0, x19, #1622292230bl _bsaes_decrypt822312232eor v0.16b, v0.16b, v11.16b2233eor v1.16b, v1.16b, v12.16b2234eor v6.16b, v6.16b, v13.16b2235mov v11.16b, v14.16b // next round tweak2236str q0, [x21], #162237str q1, [x21], #162238str q6, [x21], #162239b .Lxts_dec_done22402241.align 42242.Lxts_dec_2:2243eor v0.16b, v0.16b, v11.16b2244eor v1.16b, v1.16b, v12.16b2245mov x9, sp // pass key schedule2246mov x10, x1 // pass rounds2247add x0, x19, #1622482249bl _bsaes_decrypt822502251eor v0.16b, v0.16b, v11.16b2252eor v1.16b, v1.16b, v12.16b2253mov v11.16b, v13.16b // next round tweak2254str q0, [x21], #162255str q1, [x21], #162256b .Lxts_dec_done22572258.align 42259.Lxts_dec_1:2260eor v0.16b, v0.16b, v11.16b2261sub x0, sp, #162262sub x1, sp, #162263mov x2, x232264mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers2265mov v14.d[0], v12.d[1]2266str q0, [sp, #-16]!22672268bl AES_decrypt22692270ldr q0, [sp], #162271trn1 v13.2d, v11.2d, v13.2d2272trn1 v11.2d, v12.2d, v14.2d // next round tweak2273eor v0.16b, v0.16b, v13.16b2274str q0, [x21], #1622752276.Lxts_dec_done:2277adds x22, x22, #0x102278beq .Lxts_dec_ret22792280// calculate one round of extra tweak for the stolen ciphertext2281ldr q8, .Lxts_magic2282sshr v6.2d, v11.2d, #632283and v6.16b, v6.16b, v8.16b2284add v12.2d, v11.2d, v11.2d2285ext v6.16b, v6.16b, v6.16b, #82286eor v12.16b, v12.16b, v6.16b22872288// perform the final decryption with the last tweak value2289ldr q0, [x20], 
#162290eor v0.16b, v0.16b, v12.16b2291str q0, [sp, #-16]!2292mov x0, sp2293mov x1, sp2294mov x2, x232295mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers2296mov v14.d[0], v12.d[1]22972298bl AES_decrypt22992300trn1 v12.2d, v12.2d, v14.2d2301trn1 v11.2d, v11.2d, v13.2d2302ldr q0, [sp], #162303eor v0.16b, v0.16b, v12.16b2304str q0, [x21]23052306mov x6, x212307// Penultimate ciphertext block produces final plaintext part-block2308// plus remaining part of final ciphertext block. Move plaintext part2309// to final position and reuse penultimate plaintext block buffer to2310// construct final ciphertext block2311.Lxts_dec_steal:2312ldrb w1, [x21]2313ldrb w0, [x20], #12314strb w1, [x21, #0x10]2315strb w0, [x21], #123162317subs x22, x22, #12318bhi .Lxts_dec_steal23192320// Finally decrypt the penultimate plaintext block using the2321// penultimate tweak2322ldr q0, [x6]2323eor v0.16b, v0.16b, v11.16b2324str q0, [sp, #-16]!2325mov x0, sp2326mov x1, sp2327mov x2, x232328mov x21, x623292330bl AES_decrypt23312332trn1 v11.2d, v11.2d, v13.2d2333ldr q0, [sp], #162334eor v0.16b, v0.16b, v11.16b2335str q0, [x21]23362337.Lxts_dec_ret:23382339movi v0.16b, #02340movi v1.16b, #02341.Lxts_dec_bzero: // wipe key schedule2342stp q0, q1, [sp], #322343cmp sp, x192344bne .Lxts_dec_bzero23452346ldp x19, x20, [sp, #80]2347ldp x21, x22, [sp, #96]2348ldr x23, [sp, #112]2349ldp d8, d9, [sp, #128]2350ldp d10, d11, [sp, #144]2351ldp d12, d13, [sp, #160]2352ldp d14, d15, [sp, #176]2353ldp x29, x30, [sp], #1922354ret2355.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt235623572358