/* Path: blob/main/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S */
/* 39507 views */
/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch armv8-a+crypto
.text

/*
 * aes_gcm_enc_128_kernel
 *
 * Encrypts with AES-128 in counter mode and folds the produced ciphertext
 * into a GHASH accumulator, four 16-byte blocks per main-loop iteration.
 * This text is generated; lasting changes belong in aes-gcm-armv8_64.pl.
 *
 * Registers on entry, as the code below reads them:
 *   x0  input pointer (plaintext is loaded from here; x0 advances)
 *   x1  input length in bits (byte length is x1 >> 3)
 *   x2  output pointer (ciphertext is stored here; x2 advances)
 *   x3  GHASH data: 16-byte running tag at [x3] (read, then written back);
 *       the values the comments call h1/h2/h3/h4 at #32/#64/#80/#112
 *   x4  16-byte counter block; the 32-bit word at #12 is rewritten on exit
 *   x5  round keys rk0..rk9 read sequentially, rk10 read from #160
 *
 * Returns (x0): the byte length x1 >> 3, or 0 when x1 == 0.
 *
 * Frame: 112 bytes holding x19-x24 and d8-d15, all restored before ret.
 * The x1 == 0 early exit touches neither the stack nor any buffer.
 *
 * Long-lived register roles:
 *   x10        low 64 bits of the counter block
 *   x11        bits 64..95 of the counter block (upper half cleared)
 *   w12        32-bit block counter in host order (rev of the stored word)
 *   x13/x14    rk10 low/high, XORed into the plaintext in general registers
 *   x15        byte length (return value)
 *   x16        saved counter-block pointer (x4 is reused as end_input_ptr)
 *   v0-v3      AES state for four counter blocks
 *   v4-v7      ciphertext blocks / GHASH inputs
 *   v11        GHASH accumulator (low), v9 high, v10 mid
 *   v12-v15    h1..h4, v16 = h2k|h1k, v17 = h4k|h3k
 *   v18-v27    rk0..rk9
 *
 * NOTE: the tail reuses v18, v20, v21 and v22 as scratch; by then all AES
 * rounds that need those round keys have already been issued.
 */
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz x1, .L128_enc_ret	// zero-length input: return 0 without touching anything
	// Prologue: save the callee-saved registers this kernel uses.
	stp x19, x20, [sp, #-112]!
	mov x16, x4	// keep counter-block pointer; x4 becomes end_input_ptr
	mov x8, x5	// keep round-key pointer; x5 becomes a byte count / loop bound
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	// Setup: load counter, rk10, GHASH state and hash keys, interleaved
	// with AES rounds 0..9 for counter blocks 0..3.
	ldp x10, x11, [x16]	//ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	ldp x13, x14, [x8, #160]	//load rk10
#ifdef __AARCH64EB__
	ror x13, x13, #32
	ror x14, x14, #32
#endif
	ld1 {v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	lsr x5, x1, #3	//byte_len
	mov x15, x5

	ld1 {v18.4s}, [x8], #16	//load rk0
	add x4, x0, x1, lsr #3	//end_input_ptr
	sub x5, x5, #1	//byte_len - 1

	lsr x12, x11, #32
	ldr q15, [x3, #112]	//load h4l | h4h
#ifndef __AARCH64EB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	fmov d1, x10	//CTR block 1
	rev w12, w12	//rev_ctr32

	add w12, w12, #1	//increment rev_ctr32
	orr w11, w11, w11	// writing w11 clears the upper 32 bits of x11
	ld1 {v19.4s}, [x8], #16	//load rk1

	rev w9, w12	//CTR block 1
	add w12, w12, #1	//CTR block 1
	fmov d3, x10	//CTR block 3

	orr x9, x11, x9, lsl #32	//CTR block 1
	ld1 { v0.16b}, [x16]	//special case vector load initial counter so we can start first AES block as quickly as possible

	fmov v1.d[1], x9	//CTR block 1
	rev w9, w12	//CTR block 2

	fmov d2, x10	//CTR block 2
	orr x9, x11, x9, lsl #32	//CTR block 2
	add w12, w12, #1	//CTR block 2

	fmov v2.d[1], x9	//CTR block 2
	rev w9, w12	//CTR block 3

	orr x9, x11, x9, lsl #32	//CTR block 3
	ld1 {v20.4s}, [x8], #16	//load rk2

	add w12, w12, #1	//CTR block 3
	fmov v3.d[1], x9	//CTR block 3

	ldr q14, [x3, #80]	//load h3l | h3h
#ifndef __AARCH64EB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 0
	ld1 {v21.4s}, [x8], #16	//load rk3

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 0
	ldr q12, [x3, #32]	//load h1l | h1h
#ifndef __AARCH64EB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 0
	ld1 {v22.4s}, [x8], #16	//load rk4

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 0
	ld1 {v23.4s}, [x8], #16	//load rk5

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 1
	trn2 v17.2d, v14.2d, v15.2d	//h4l | h3l

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 1
	ld1 {v24.4s}, [x8], #16	//load rk6

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 1
	ld1 {v25.4s}, [x8], #16	//load rk7

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 1
	trn1 v9.2d, v14.2d, v15.2d	//h4h | h3h

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 2
	ld1 {v26.4s}, [x8], #16	//load rk8

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 2
	ldr q13, [x3, #64]	//load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 2
	eor v17.16b, v17.16b, v9.16b	//h4k | h3k

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 3

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 3

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 3
	ld1 {v27.4s}, [x8], #16	//load rk9

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 3

	and x5, x5, #0xffffffffffffffc0	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	trn2 v16.2d, v12.2d, v13.2d	//h2l | h1l

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 4
	add x5, x5, x0	// x5 = input address at which the main loop stops

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 4
	cmp x0, x5	//check if we have <= 4 blocks

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 4

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 5

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 5

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 6

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 4

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 6
	trn1 v8.2d, v12.2d, v13.2d	//h2h | h1h

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 6

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 5

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 7

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 7

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 6

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 7

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 8

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 7

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 8

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 8

	aese v2.16b, v27.16b	//AES block 2 - round 9

	aese v0.16b, v27.16b	//AES block 0 - round 9

	eor v16.16b, v16.16b, v8.16b	//h2k | h1k

	aese v1.16b, v27.16b	//AES block 1 - round 9

	aese v3.16b, v27.16b	//AES block 3 - round 9
	b.ge .L128_enc_tail	//handle tail

	// First four blocks: XOR plaintext with rk10 in general registers,
	// then with the round-9 AES state, and store. GHASH of these blocks
	// is deferred to the main loop / prepretail.
	ldp x6, x7, [x0, #0]	//AES block 0 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	ldp x21, x22, [x0, #32]	//AES block 2 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	ldp x19, x20, [x0, #16]	//AES block 1 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	ldp x23, x24, [x0, #48]	//AES block 3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	eor x6, x6, x13	//AES block 0 - round 10 low
	eor x7, x7, x14	//AES block 0 - round 10 high

	eor x21, x21, x13	//AES block 2 - round 10 low
	fmov d4, x6	//AES block 0 - mov low

	eor x19, x19, x13	//AES block 1 - round 10 low
	eor x22, x22, x14	//AES block 2 - round 10 high
	fmov v4.d[1], x7	//AES block 0 - mov high

	fmov d5, x19	//AES block 1 - mov low
	eor x20, x20, x14	//AES block 1 - round 10 high

	eor x23, x23, x13	//AES block 3 - round 10 low
	fmov v5.d[1], x20	//AES block 1 - mov high

	fmov d6, x21	//AES block 2 - mov low
	eor x24, x24, x14	//AES block 3 - round 10 high
	rev w9, w12	//CTR block 4

	fmov v6.d[1], x22	//AES block 2 - mov high
	orr x9, x11, x9, lsl #32	//CTR block 4

	eor v4.16b, v4.16b, v0.16b	//AES block 0 - result
	fmov d0, x10	//CTR block 4
	add w12, w12, #1	//CTR block 4

	fmov v0.d[1], x9	//CTR block 4
	rev w9, w12	//CTR block 5

	eor v5.16b, v5.16b, v1.16b	//AES block 1 - result
	fmov d1, x10	//CTR block 5
	orr x9, x11, x9, lsl #32	//CTR block 5

	add w12, w12, #1	//CTR block 5
	add x0, x0, #64	//AES input_ptr update
	fmov v1.d[1], x9	//CTR block 5

	fmov d7, x23	//AES block 3 - mov low
	rev w9, w12	//CTR block 6
	st1 { v4.16b}, [x2], #16	//AES block 0 - store result

	fmov v7.d[1], x24	//AES block 3 - mov high
	orr x9, x11, x9, lsl #32	//CTR block 6

	add w12, w12, #1	//CTR block 6
	eor v6.16b, v6.16b, v2.16b	//AES block 2 - result
	st1 { v5.16b}, [x2], #16	//AES block 1 - store result

	fmov d2, x10	//CTR block 6
	cmp x0, x5	//check if we have <= 8 blocks

	fmov v2.d[1], x9	//CTR block 6
	rev w9, w12	//CTR block 7
	st1 { v6.16b}, [x2], #16	//AES block 2 - store result

	orr x9, x11, x9, lsl #32	//CTR block 7

	eor v7.16b, v7.16b, v3.16b	//AES block 3 - result
	st1 { v7.16b}, [x2], #16	//AES block 3 - store result
	b.ge .L128_enc_prepretail	//do prepretail

	// Main loop: GHASH the four ciphertext blocks produced last time
	// (v4-v7) while running AES on the next four counter blocks.
.L128_enc_main_loop:	//main loop start
	ldp x23, x24, [x0, #48]	//AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	rev64 v4.16b, v4.16b	//GHASH block 4k (only t0 is free)
	rev64 v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 0
	fmov d3, x10	//CTR block 4k+3

	ext v11.16b, v11.16b, v11.16b, #8	//PRE 0
	rev64 v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 0
	add w12, w12, #1	//CTR block 4k+3
	fmov v3.d[1], x9	//CTR block 4k+3

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 0
	mov d31, v6.d[1]	//GHASH block 4k+2 - mid

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 1
	mov d30, v5.d[1]	//GHASH block 4k+1 - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 1
	eor v4.16b, v4.16b, v11.16b	//PRE 1

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 0
	eor x24, x24, x14	//AES block 4k+3 - round 10 high

	pmull2 v28.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high
	eor v31.8b, v31.8b, v6.8b	//GHASH block 4k+2 - mid
	ldp x6, x7, [x0, #0]	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 1
	rev w9, w12	//CTR block 4k+8

	eor v30.8b, v30.8b, v5.8b	//GHASH block 4k+1 - mid
	mov d8, v4.d[1]	//GHASH block 4k - mid
	orr x9, x11, x9, lsl #32	//CTR block 4k+8

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH block 4k - high
	add w12, w12, #1	//CTR block 4k+8
	mov d10, v17.d[1]	//GHASH block 4k - mid

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 2

	pmull v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	eor v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 2

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 3
	eor v9.16b, v9.16b, v28.16b	//GHASH block 4k+1 - high

	pmull v28.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	pmull v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid
	rev64 v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull v30.1q, v30.1d, v17.1d	//GHASH block 4k+1 - mid

	pmull v29.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low
	ins v31.d[1], v31.d[0]	//GHASH block 4k+2 - mid

	pmull2 v8.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high
	eor x7, x7, x14	//AES block 4k+4 - round 10 high

	eor v10.16b, v10.16b, v30.16b	//GHASH block 4k+1 - mid
	mov d30, v7.d[1]	//GHASH block 4k+3 - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 1
	eor v11.16b, v11.16b, v29.16b	//GHASH block 4k+1 - low

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 2
	eor x6, x6, x13	//AES block 4k+4 - round 10 low

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 3
	eor v30.8b, v30.8b, v7.8b	//GHASH block 4k+3 - mid

	pmull2 v4.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 3
	eor v9.16b, v9.16b, v8.16b	//GHASH block 4k+2 - high

	pmull2 v31.1q, v31.2d, v16.2d	//GHASH block 4k+2 - mid

	pmull v29.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low
	movi v8.8b, #0xc2

	pmull v30.1q, v30.1d, v16.1d	//GHASH block 4k+3 - mid
	eor v11.16b, v11.16b, v28.16b	//GHASH block 4k+2 - low

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 4

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 2
	shl d8, d8, #56	//mod_constant

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 4
	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+3 - high

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 5
	ldp x19, x20, [x0, #16]	//AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v31.16b	//GHASH block 4k+2 - mid

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 5
	ldp x21, x22, [x0, #32]	//AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	pmull v31.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	eor v11.16b, v11.16b, v29.16b	//GHASH block 4k+3 - low

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 4
	eor x19, x19, x13	//AES block 4k+5 - round 10 low

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 4
	eor v10.16b, v10.16b, v30.16b	//GHASH block 4k+3 - mid

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 6
	eor x23, x23, x13	//AES block 4k+3 - round 10 low

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 5
	eor v30.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	fmov d4, x6	//AES block 4k+4 - mov low
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 6
	fmov v4.d[1], x7	//AES block 4k+4 - mov high

	add x0, x0, #64	//AES input_ptr update
	fmov d7, x23	//AES block 4k+3 - mov low
	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 5
	fmov d5, x19	//AES block 4k+5 - mov low

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 7
	eor v10.16b, v10.16b, v30.16b	//MODULO - karatsuba tidy up

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 6
	eor x20, x20, x14	//AES block 4k+5 - round 10 high

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 7
	fmov v5.d[1], x20	//AES block 4k+5 - mov high

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 8
	fmov v7.d[1], x24	//AES block 4k+3 - mov high

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 6
	cmp x0, x5	//.LOOP CONTROL

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v31.16b	//MODULO - fold into mid

	aese v0.16b, v27.16b	//AES block 4k+4 - round 9
	eor x21, x21, x13	//AES block 4k+6 - round 10 low
	eor x22, x22, x14	//AES block 4k+6 - round 10 high

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 7
	fmov d6, x21	//AES block 4k+6 - mov low

	aese v1.16b, v27.16b	//AES block 4k+5 - round 9
	fmov v6.d[1], x22	//AES block 4k+6 - mov high

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 7
	eor v4.16b, v4.16b, v0.16b	//AES block 4k+4 - result

	fmov d0, x10	//CTR block 4k+8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 8

	fmov v0.d[1], x9	//CTR block 4k+8
	rev w9, w12	//CTR block 4k+9
	eor v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 8
	eor v5.16b, v5.16b, v1.16b	//AES block 4k+5 - result

	add w12, w12, #1	//CTR block 4k+9
	orr x9, x11, x9, lsl #32	//CTR block 4k+9
	fmov d1, x10	//CTR block 4k+9

	pmull v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low
	fmov v1.d[1], x9	//CTR block 4k+9
	rev w9, w12	//CTR block 4k+10

	aese v2.16b, v27.16b	//AES block 4k+6 - round 9
	st1 { v4.16b}, [x2], #16	//AES block 4k+4 - store result
	eor v6.16b, v6.16b, v2.16b	//AES block 4k+6 - result
	orr x9, x11, x9, lsl #32	//CTR block 4k+10

	aese v3.16b, v27.16b	//AES block 4k+7 - round 9
	add w12, w12, #1	//CTR block 4k+10
	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment
	fmov d2, x10	//CTR block 4k+10

	eor v11.16b, v11.16b, v9.16b	//MODULO - fold into low
	st1 { v5.16b}, [x2], #16	//AES block 4k+5 - store result

	fmov v2.d[1], x9	//CTR block 4k+10
	st1 { v6.16b}, [x2], #16	//AES block 4k+6 - store result
	rev w9, w12	//CTR block 4k+11

	orr x9, x11, x9, lsl #32	//CTR block 4k+11
	eor v7.16b, v7.16b, v3.16b	//AES block 4k+3 - result

	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	st1 { v7.16b}, [x2], #16	//AES block 4k+3 - store result
	b.lt .L128_enc_main_loop	// flags from "cmp x0, x5" above (.LOOP CONTROL)

	// Prepretail: GHASH the last four stored ciphertext blocks (v4-v7)
	// and run AES rounds 0..9 on the counter blocks the tail will use.
	// No plaintext is read and nothing is stored here.
.L128_enc_prepretail:	//PREPRETAIL
	rev64 v4.16b, v4.16b	//GHASH block 4k (only t0 is free)
	fmov d3, x10	//CTR block 4k+3
	rev64 v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	ext v11.16b, v11.16b, v11.16b, #8	//PRE 0
	add w12, w12, #1	//CTR block 4k+3
	fmov v3.d[1], x9	//CTR block 4k+3

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 0
	rev64 v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	pmull v29.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low

	rev64 v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)
	eor v4.16b, v4.16b, v11.16b	//PRE 1

	pmull2 v28.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 0
	mov d30, v5.d[1]	//GHASH block 4k+1 - mid

	pmull v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	mov d8, v4.d[1]	//GHASH block 4k - mid

	mov d31, v6.d[1]	//GHASH block 4k+2 - mid
	mov d10, v17.d[1]	//GHASH block 4k - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 1
	eor v30.8b, v30.8b, v5.8b	//GHASH block 4k+1 - mid

	eor v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH block 4k - high
	eor v31.8b, v31.8b, v6.8b	//GHASH block 4k+2 - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 1

	pmull v30.1q, v30.1d, v17.1d	//GHASH block 4k+1 - mid
	eor v11.16b, v11.16b, v29.16b	//GHASH block 4k+1 - low

	pmull v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 0
	ins v31.d[1], v31.d[0]	//GHASH block 4k+2 - mid

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 0

	eor v10.16b, v10.16b, v30.16b	//GHASH block 4k+1 - mid
	mov d30, v7.d[1]	//GHASH block 4k+3 - mid

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 1
	eor v9.16b, v9.16b, v28.16b	//GHASH block 4k+1 - high

	pmull2 v31.1q, v31.2d, v16.2d	//GHASH block 4k+2 - mid

	pmull2 v8.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high
	eor v30.8b, v30.8b, v7.8b	//GHASH block 4k+3 - mid

	pmull2 v4.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high

	pmull v28.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 1
	eor v9.16b, v9.16b, v8.16b	//GHASH block 4k+2 - high

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 2

	pmull v29.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low
	movi v8.8b, #0xc2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 2
	eor v11.16b, v11.16b, v28.16b	//GHASH block 4k+2 - low

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 2

	pmull v30.1q, v30.1d, v16.1d	//GHASH block 4k+3 - mid
	eor v10.16b, v10.16b, v31.16b	//GHASH block 4k+2 - mid

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 3

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 2
	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+3 - high

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 3

	eor v10.16b, v10.16b, v30.16b	//GHASH block 4k+3 - mid
	shl d8, d8, #56	//mod_constant

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 3
	eor v11.16b, v11.16b, v29.16b	//GHASH block 4k+3 - low

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 4

	pmull v28.1q, v9.1d, v8.1d
	eor v10.16b, v10.16b, v9.16b	//karatsuba tidy up

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 4

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 5
	ext v9.16b, v9.16b, v9.16b, #8

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 3

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v11.16b

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 6

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 4

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 5
	eor v10.16b, v10.16b, v28.16b

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 5

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 6

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 6

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v9.16b

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 7

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 7

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 7

	pmull v28.1q, v10.1d, v8.1d

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 7
	ext v10.16b, v10.16b, v10.16b, #8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 8

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v28.16b

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 8

	aese v3.16b, v27.16b	//AES block 4k+7 - round 9

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 8

	aese v0.16b, v27.16b	//AES block 4k+4 - round 9

	aese v1.16b, v27.16b	//AES block 4k+5 - round 9
	eor v11.16b, v11.16b, v10.16b

	aese v2.16b, v27.16b	//AES block 4k+6 - round 9

	// Tail: between 1 and 64 bytes remain (x4 - x0). Each "more_than_N"
	// stage stores one full block and GHASHes it; the final stage masks a
	// possibly partial block. w12 is decremented once for every counter
	// block that was prepared but is not consumed.
.L128_enc_tail:	//TAIL

	sub x5, x4, x0	//main_end_input_ptr is number of bytes left to process
	ldp x6, x7, [x0], #16	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	cmp x5, #48

	ext v8.16b, v11.16b, v11.16b, #8	//prepare final partial tag
	eor x6, x6, x13	//AES block 4k+4 - round 10 low
	eor x7, x7, x14	//AES block 4k+4 - round 10 high

	fmov d4, x6	//AES block 4k+4 - mov low

	fmov v4.d[1], x7	//AES block 4k+4 - mov high

	eor v5.16b, v4.16b, v0.16b	//AES block 4k+4 - result

	b.gt .L128_enc_blocks_more_than_3

	sub w12, w12, #1
	movi v11.8b, #0
	mov v3.16b, v2.16b

	cmp x5, #32
	mov v2.16b, v1.16b
	movi v9.8b, #0

	movi v10.8b, #0
	b.gt .L128_enc_blocks_more_than_2

	mov v3.16b, v1.16b
	cmp x5, #16

	sub w12, w12, #1
	b.gt .L128_enc_blocks_more_than_1

	sub w12, w12, #1
	b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:	//blocks left > 3
	st1 { v5.16b}, [x2], #16	//AES final-3 block - store result

	ldp x6, x7, [x0], #16	//AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b	//GHASH final-3 block

	eor v4.16b, v4.16b, v8.16b	//feed in partial tag
	eor x7, x7, x14	//AES final-2 block - round 10 high
	eor x6, x6, x13	//AES final-2 block - round 10 low

	fmov d5, x6	//AES final-2 block - mov low

	movi v8.8b, #0	//suppress further partial tag feed in
	fmov v5.d[1], x7	//AES final-2 block - mov high

	pmull v11.1q, v4.1d, v15.1d	//GHASH final-3 block - low
	mov d22, v4.d[1]	//GHASH final-3 block - mid

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH final-3 block - high

	mov d10, v17.d[1]	//GHASH final-3 block - mid

	eor v5.16b, v5.16b, v1.16b	//AES final-2 block - result
	eor v22.8b, v22.8b, v4.8b	//GHASH final-3 block - mid

	pmull v10.1q, v22.1d, v10.1d	//GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:	//blocks left > 2

	st1 { v5.16b}, [x2], #16	//AES final-2 block - store result

	rev64 v4.16b, v5.16b	//GHASH final-2 block
	ldp x6, x7, [x0], #16	//AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	eor x6, x6, x13	//AES final-1 block - round 10 low

	fmov d5, x6	//AES final-1 block - mov low
	eor x7, x7, x14	//AES final-1 block - round 10 high

	pmull2 v20.1q, v4.2d, v14.2d	//GHASH final-2 block - high
	fmov v5.d[1], x7	//AES final-1 block - mov high

	mov d22, v4.d[1]	//GHASH final-2 block - mid

	pmull v21.1q, v4.1d, v14.1d	//GHASH final-2 block - low

	eor v9.16b, v9.16b, v20.16b	//GHASH final-2 block - high

	eor v22.8b, v22.8b, v4.8b	//GHASH final-2 block - mid

	eor v5.16b, v5.16b, v2.16b	//AES final-1 block - result

	eor v11.16b, v11.16b, v21.16b	//GHASH final-2 block - low

	pmull v22.1q, v22.1d, v17.1d	//GHASH final-2 block - mid

	movi v8.8b, #0	//suppress further partial tag feed in

	eor v10.16b, v10.16b, v22.16b	//GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:	//blocks left > 1

	st1 { v5.16b}, [x2], #16	//AES final-1 block - store result

	rev64 v4.16b, v5.16b	//GHASH final-1 block
	ldp x6, x7, [x0], #16	//AES final block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	eor x7, x7, x14	//AES final block - round 10 high
	eor x6, x6, x13	//AES final block - round 10 low

	fmov d5, x6	//AES final block - mov low

	pmull2 v20.1q, v4.2d, v13.2d	//GHASH final-1 block - high
	fmov v5.d[1], x7	//AES final block - mov high

	mov d22, v4.d[1]	//GHASH final-1 block - mid

	pmull v21.1q, v4.1d, v13.1d	//GHASH final-1 block - low

	eor v22.8b, v22.8b, v4.8b	//GHASH final-1 block - mid

	eor v5.16b, v5.16b, v3.16b	//AES final block - result

	ins v22.d[1], v22.d[0]	//GHASH final-1 block - mid

	pmull2 v22.1q, v22.2d, v16.2d	//GHASH final-1 block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final-1 block - low

	eor v9.16b, v9.16b, v20.16b	//GHASH final-1 block - high

	eor v10.16b, v10.16b, v22.16b	//GHASH final-1 block - mid
	movi v8.8b, #0	//suppress further partial tag feed in

	// Final block: build a byte mask from the bit length (x13/x14 are
	// reused as all-ones here; rk10 is no longer needed), zero the unused
	// bytes before GHASH, and merge the bytes already at [x2] back in so
	// the 16-byte store leaves data beyond the message unchanged.
	// NOTE: this reads and writes a full 16 bytes at x2 even when the
	// last block is partial.
.L128_enc_blocks_less_than_1:	//blocks left <= 1

	and x1, x1, #127	//bit_length %= 128
	mvn x13, xzr	//rk10_l = 0xffffffffffffffff

	mvn x14, xzr	//rk10_h = 0xffffffffffffffff
	sub x1, x1, #128	//bit_length -= 128

	neg x1, x1	//bit_length = 128 - #bits in input (in range [1,128])

	and x1, x1, #127	//bit_length %= 128

	lsr x14, x14, x1	//rk10_h is mask for top 64b of last block
	cmp x1, #64

	csel x6, x13, x14, lt
	csel x7, x14, xzr, lt

	fmov d0, x6	//ctr0b is mask for last block

	fmov v0.d[1], x7

	and v5.16b, v5.16b, v0.16b	//possibly partial last block has zeroes in highest bits

	rev64 v4.16b, v5.16b	//GHASH final block

	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	mov d8, v4.d[1]	//GHASH final block - mid

	pmull v21.1q, v4.1d, v12.1d	//GHASH final block - low
	ld1 { v18.16b}, [x2]	//load existing bytes where the possibly partial last block is to be stored

	eor v8.8b, v8.8b, v4.8b	//GHASH final block - mid
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif
	pmull2 v20.1q, v4.2d, v12.2d	//GHASH final block - high

	pmull v8.1q, v8.1d, v16.1d	//GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final block - low

	eor v9.16b, v9.16b, v20.16b	//GHASH final block - high

	eor v10.16b, v10.16b, v8.16b	//GHASH final block - mid
	movi v8.8b, #0xc2

	eor v30.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	shl d8, d8, #56	//mod_constant

	eor v10.16b, v10.16b, v30.16b	//MODULO - karatsuba tidy up

	pmull v31.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid

	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	eor v10.16b, v10.16b, v31.16b	//MODULO - fold into mid

	eor v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	pmull v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	bif v5.16b, v18.16b, v0.16b	//insert existing bytes in top end of result before storing

	eor v11.16b, v11.16b, v9.16b	//MODULO - fold into low
	st1 { v5.16b}, [x2]	//store all 16B

	str w9, [x16, #12]	//store the updated counter

	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15	// return value: byte length computed at entry
	st1 { v11.16b }, [x3]	// write the updated GHASH tag back
	// Epilogue: restore callee-saved registers and release the frame.
	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

.L128_enc_ret:
	mov w0, #0x0	// nothing processed
	ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
989.globl aes_gcm_dec_128_kernel990.type aes_gcm_dec_128_kernel,%function991.align 4992aes_gcm_dec_128_kernel:993AARCH64_VALID_CALL_TARGET994cbz x1, .L128_dec_ret995stp x19, x20, [sp, #-112]!996mov x16, x4997mov x8, x5998stp x21, x22, [sp, #16]999stp x23, x24, [sp, #32]1000stp d8, d9, [sp, #48]1001stp d10, d11, [sp, #64]1002stp d12, d13, [sp, #80]1003stp d14, d15, [sp, #96]10041005lsr x5, x1, #3 //byte_len1006mov x15, x51007ldp x10, x11, [x16] //ctr96_b64, ctr96_t321008#ifdef __AARCH64EB__1009rev x10, x101010rev x11, x111011#endif1012ldp x13, x14, [x8, #160] //load rk101013#ifdef __AARCH64EB__1014ror x14, x14, 321015ror x13, x13, 321016#endif1017sub x5, x5, #1 //byte_len - 11018ld1 {v18.4s}, [x8], #16 //load rk010191020and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)1021ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible10221023ldr q13, [x3, #64] //load h2l | h2h1024#ifndef __AARCH64EB__1025ext v13.16b, v13.16b, v13.16b, #81026#endif1027lsr x12, x11, #321028fmov d2, x10 //CTR block 210291030ld1 {v19.4s}, [x8], #16 //load rk11031orr w11, w11, w111032rev w12, w12 //rev_ctr3210331034fmov d1, x10 //CTR block 
11035add w12, w12, #1 //increment rev_ctr3210361037aese v0.16b, v18.16b1038aesmc v0.16b, v0.16b //AES block 0 - round 01039rev w9, w12 //CTR block 110401041orr x9, x11, x9, lsl #32 //CTR block 11042ld1 {v20.4s}, [x8], #16 //load rk21043add w12, w12, #1 //CTR block 110441045fmov v1.d[1], x9 //CTR block 11046rev w9, w12 //CTR block 21047add w12, w12, #1 //CTR block 210481049aese v0.16b, v19.16b1050aesmc v0.16b, v0.16b //AES block 0 - round 11051orr x9, x11, x9, lsl #32 //CTR block 210521053fmov v2.d[1], x9 //CTR block 21054rev w9, w12 //CTR block 310551056fmov d3, x10 //CTR block 31057orr x9, x11, x9, lsl #32 //CTR block 31058add w12, w12, #1 //CTR block 310591060fmov v3.d[1], x9 //CTR block 31061add x4, x0, x1, lsr #3 //end_input_ptr10621063aese v1.16b, v18.16b1064aesmc v1.16b, v1.16b //AES block 1 - round 01065ld1 {v21.4s}, [x8], #16 //load rk310661067aese v0.16b, v20.16b1068aesmc v0.16b, v0.16b //AES block 0 - round 21069ld1 {v22.4s}, [x8], #16 //load rk410701071aese v2.16b, v18.16b1072aesmc v2.16b, v2.16b //AES block 2 - round 01073ld1 {v23.4s}, [x8], #16 //load rk510741075aese v1.16b, v19.16b1076aesmc v1.16b, v1.16b //AES block 1 - round 11077ld1 {v24.4s}, [x8], #16 //load rk610781079aese v3.16b, v18.16b1080aesmc v3.16b, v3.16b //AES block 3 - round 010811082aese v2.16b, v19.16b1083aesmc v2.16b, v2.16b //AES block 2 - round 110841085aese v1.16b, v20.16b1086aesmc v1.16b, v1.16b //AES block 1 - round 210871088aese v3.16b, v19.16b1089aesmc v3.16b, v3.16b //AES block 3 - round 11090ld1 { v11.16b}, [x3]1091ext v11.16b, v11.16b, v11.16b, #81092rev64 v11.16b, v11.16b10931094aese v0.16b, v21.16b1095aesmc v0.16b, v0.16b //AES block 0 - round 31096ld1 {v25.4s}, [x8], #16 //load rk710971098aese v1.16b, v21.16b1099aesmc v1.16b, v1.16b //AES block 1 - round 311001101aese v3.16b, v20.16b1102aesmc v3.16b, v3.16b //AES block 3 - round 211031104aese v2.16b, v20.16b1105aesmc v2.16b, v2.16b //AES block 2 - round 21106ld1 {v26.4s}, [x8], #16 //load rk811071108aese v1.16b, 
v22.16b1109aesmc v1.16b, v1.16b //AES block 1 - round 411101111aese v3.16b, v21.16b1112aesmc v3.16b, v3.16b //AES block 3 - round 311131114aese v2.16b, v21.16b1115aesmc v2.16b, v2.16b //AES block 2 - round 31116ldr q14, [x3, #80] //load h3l | h3h1117#ifndef __AARCH64EB__1118ext v14.16b, v14.16b, v14.16b, #81119#endif1120aese v0.16b, v22.16b1121aesmc v0.16b, v0.16b //AES block 0 - round 41122ld1 {v27.4s}, [x8], #16 //load rk911231124aese v1.16b, v23.16b1125aesmc v1.16b, v1.16b //AES block 1 - round 511261127aese v2.16b, v22.16b1128aesmc v2.16b, v2.16b //AES block 2 - round 411291130aese v3.16b, v22.16b1131aesmc v3.16b, v3.16b //AES block 3 - round 411321133aese v0.16b, v23.16b1134aesmc v0.16b, v0.16b //AES block 0 - round 511351136aese v2.16b, v23.16b1137aesmc v2.16b, v2.16b //AES block 2 - round 51138ldr q12, [x3, #32] //load h1l | h1h1139#ifndef __AARCH64EB__1140ext v12.16b, v12.16b, v12.16b, #81141#endif1142aese v3.16b, v23.16b1143aesmc v3.16b, v3.16b //AES block 3 - round 511441145aese v0.16b, v24.16b1146aesmc v0.16b, v0.16b //AES block 0 - round 611471148aese v1.16b, v24.16b1149aesmc v1.16b, v1.16b //AES block 1 - round 611501151aese v3.16b, v24.16b1152aesmc v3.16b, v3.16b //AES block 3 - round 611531154aese v2.16b, v24.16b1155aesmc v2.16b, v2.16b //AES block 2 - round 61156trn1 v8.2d, v12.2d, v13.2d //h2h | h1h11571158ldr q15, [x3, #112] //load h4l | h4h1159#ifndef __AARCH64EB__1160ext v15.16b, v15.16b, v15.16b, #81161#endif1162trn2 v16.2d, v12.2d, v13.2d //h2l | h1l1163add x5, x5, x011641165aese v1.16b, v25.16b1166aesmc v1.16b, v1.16b //AES block 1 - round 711671168aese v2.16b, v25.16b1169aesmc v2.16b, v2.16b //AES block 2 - round 711701171aese v0.16b, v25.16b1172aesmc v0.16b, v0.16b //AES block 0 - round 71173eor v16.16b, v16.16b, v8.16b //h2k | h1k11741175aese v3.16b, v25.16b1176aesmc v3.16b, v3.16b //AES block 3 - round 711771178aese v1.16b, v26.16b1179aesmc v1.16b, v1.16b //AES block 1 - round 81180trn2 v17.2d, v14.2d, v15.2d //h4l | h3l11811182aese 
v2.16b, v26.16b1183aesmc v2.16b, v2.16b //AES block 2 - round 811841185aese v3.16b, v26.16b1186aesmc v3.16b, v3.16b //AES block 3 - round 811871188aese v0.16b, v26.16b1189aesmc v0.16b, v0.16b //AES block 0 - round 81190trn1 v9.2d, v14.2d, v15.2d //h4h | h3h11911192aese v2.16b, v27.16b //AES block 2 - round 911931194aese v3.16b, v27.16b //AES block 3 - round 911951196aese v0.16b, v27.16b //AES block 0 - round 91197cmp x0, x5 //check if we have <= 4 blocks11981199aese v1.16b, v27.16b //AES block 1 - round 91200eor v17.16b, v17.16b, v9.16b //h4k | h3k1201b.ge .L128_dec_tail //handle tail12021203ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext12041205eor v1.16b, v5.16b, v1.16b //AES block 1 - result1206ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext12071208eor v0.16b, v4.16b, v0.16b //AES block 0 - result1209rev64 v4.16b, v4.16b //GHASH block 01210rev w9, w12 //CTR block 412111212orr x9, x11, x9, lsl #32 //CTR block 41213add w12, w12, #1 //CTR block 41214ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext12151216rev64 v5.16b, v5.16b //GHASH block 11217mov x19, v1.d[0] //AES block 1 - mov low12181219mov x20, v1.d[1] //AES block 1 - mov high12201221mov x6, v0.d[0] //AES block 0 - mov low1222cmp x0, x5 //check if we have <= 8 blocks12231224mov x7, v0.d[1] //AES block 0 - mov high12251226fmov d0, x10 //CTR block 412271228fmov v0.d[1], x9 //CTR block 41229rev w9, w12 //CTR block 51230eor x19, x19, x13 //AES block 1 - round 10 low1231#ifdef __AARCH64EB__1232rev x19, x191233#endif1234fmov d1, x10 //CTR block 51235add w12, w12, #1 //CTR block 51236orr x9, x11, x9, lsl #32 //CTR block 512371238fmov v1.d[1], x9 //CTR block 51239rev w9, w12 //CTR block 61240add w12, w12, #1 //CTR block 612411242orr x9, x11, x9, lsl #32 //CTR block 612431244eor x20, x20, x14 //AES block 1 - round 10 high1245#ifdef __AARCH64EB__1246rev x20, x201247#endif1248eor x6, x6, x13 //AES block 0 - round 10 low1249#ifdef __AARCH64EB__1250rev x6, 
x61251#endif1252eor v2.16b, v6.16b, v2.16b //AES block 2 - result12531254eor x7, x7, x14 //AES block 0 - round 10 high1255#ifdef __AARCH64EB__1256rev x7, x71257#endif1258stp x6, x7, [x2], #16 //AES block 0 - store result12591260stp x19, x20, [x2], #16 //AES block 1 - store result1261b.ge .L128_dec_prepretail //do prepretail12621263.L128_dec_main_loop: //main loop start1264eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result1265ext v11.16b, v11.16b, v11.16b, #8 //PRE 01266mov x21, v2.d[0] //AES block 4k+2 - mov low12671268pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high1269mov x22, v2.d[1] //AES block 4k+2 - mov high12701271aese v1.16b, v18.16b1272aesmc v1.16b, v1.16b //AES block 4k+5 - round 01273fmov d2, x10 //CTR block 4k+612741275rev64 v6.16b, v6.16b //GHASH block 4k+21276fmov v2.d[1], x9 //CTR block 4k+61277rev w9, w12 //CTR block 4k+712781279mov x23, v3.d[0] //AES block 4k+3 - mov low1280eor v4.16b, v4.16b, v11.16b //PRE 11281mov d30, v5.d[1] //GHASH block 4k+1 - mid12821283aese v1.16b, v19.16b1284aesmc v1.16b, v1.16b //AES block 4k+5 - round 11285rev64 v7.16b, v7.16b //GHASH block 4k+312861287pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low1288mov x24, v3.d[1] //AES block 4k+3 - mov high1289orr x9, x11, x9, lsl #32 //CTR block 4k+712901291pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low1292fmov d3, x10 //CTR block 4k+71293eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid12941295aese v1.16b, v20.16b1296aesmc v1.16b, v1.16b //AES block 4k+5 - round 21297fmov v3.d[1], x9 //CTR block 4k+712981299aese v2.16b, v18.16b1300aesmc v2.16b, v2.16b //AES block 4k+6 - round 01301mov d10, v17.d[1] //GHASH block 4k - mid13021303pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high1304eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low13051306pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low13071308aese v1.16b, v21.16b1309aesmc v1.16b, v1.16b //AES block 4k+5 - round 31310mov d8, v4.d[1] //GHASH block 4k - mid13111312aese v3.16b, v18.16b1313aesmc v3.16b, 
v3.16b //AES block 4k+7 - round 01314eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high13151316aese v0.16b, v18.16b1317aesmc v0.16b, v0.16b //AES block 4k+4 - round 013181319pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low1320eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid13211322aese v3.16b, v19.16b1323aesmc v3.16b, v3.16b //AES block 4k+7 - round 11324eor x23, x23, x13 //AES block 4k+3 - round 10 low1325#ifdef __AARCH64EB__1326rev x23, x231327#endif1328pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid1329eor x22, x22, x14 //AES block 4k+2 - round 10 high1330#ifdef __AARCH64EB__1331rev x22, x221332#endif1333mov d31, v6.d[1] //GHASH block 4k+2 - mid13341335aese v0.16b, v19.16b1336aesmc v0.16b, v0.16b //AES block 4k+4 - round 11337eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low13381339pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid13401341aese v3.16b, v20.16b1342aesmc v3.16b, v3.16b //AES block 4k+7 - round 21343eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid13441345aese v0.16b, v20.16b1346aesmc v0.16b, v0.16b //AES block 4k+4 - round 213471348aese v1.16b, v22.16b1349aesmc v1.16b, v1.16b //AES block 4k+5 - round 41350eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid13511352pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high13531354aese v0.16b, v21.16b1355aesmc v0.16b, v0.16b //AES block 4k+4 - round 31356ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid13571358pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high13591360aese v2.16b, v19.16b1361aesmc v2.16b, v2.16b //AES block 4k+6 - round 11362mov d30, v7.d[1] //GHASH block 4k+3 - mid13631364aese v0.16b, v22.16b1365aesmc v0.16b, v0.16b //AES block 4k+4 - round 41366eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high13671368pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid1369eor x24, x24, x14 //AES block 4k+3 - round 10 high1370#ifdef __AARCH64EB__1371rev x24, x241372#endif1373aese v2.16b, v20.16b1374aesmc v2.16b, v2.16b //AES block 4k+6 - round 21375eor v30.8b, v30.8b, v7.8b 
//GHASH block 4k+3 - mid13761377aese v1.16b, v23.16b1378aesmc v1.16b, v1.16b //AES block 4k+5 - round 51379eor x21, x21, x13 //AES block 4k+2 - round 10 low1380#ifdef __AARCH64EB__1381rev x21, x211382#endif1383aese v0.16b, v23.16b1384aesmc v0.16b, v0.16b //AES block 4k+4 - round 51385movi v8.8b, #0xc213861387aese v2.16b, v21.16b1388aesmc v2.16b, v2.16b //AES block 4k+6 - round 31389eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low13901391aese v1.16b, v24.16b1392aesmc v1.16b, v1.16b //AES block 4k+5 - round 613931394aese v0.16b, v24.16b1395aesmc v0.16b, v0.16b //AES block 4k+4 - round 61396eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid13971398aese v2.16b, v22.16b1399aesmc v2.16b, v2.16b //AES block 4k+6 - round 41400stp x21, x22, [x2], #16 //AES block 4k+2 - store result14011402pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid1403eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high1404ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext14051406aese v1.16b, v25.16b1407aesmc v1.16b, v1.16b //AES block 4k+5 - round 71408add w12, w12, #1 //CTR block 4k+714091410aese v0.16b, v25.16b1411aesmc v0.16b, v0.16b //AES block 4k+4 - round 71412shl d8, d8, #56 //mod_constant14131414aese v2.16b, v23.16b1415aesmc v2.16b, v2.16b //AES block 4k+6 - round 51416eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid14171418aese v1.16b, v26.16b1419aesmc v1.16b, v1.16b //AES block 4k+5 - round 81420stp x23, x24, [x2], #16 //AES block 4k+3 - store result14211422aese v0.16b, v26.16b1423aesmc v0.16b, v0.16b //AES block 4k+4 - round 81424eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up14251426aese v3.16b, v21.16b1427aesmc v3.16b, v3.16b //AES block 4k+7 - round 31428rev w9, w12 //CTR block 4k+814291430pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid1431ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext1432ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment14331434aese v0.16b, v27.16b //AES block 4k+4 - round 91435orr x9, x11, x9, 
lsl #32 //CTR block 4k+814361437aese v3.16b, v22.16b1438aesmc v3.16b, v3.16b //AES block 4k+7 - round 41439eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up14401441aese v1.16b, v27.16b //AES block 4k+5 - round 914421443aese v2.16b, v24.16b1444aesmc v2.16b, v2.16b //AES block 4k+6 - round 61445eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result14461447aese v3.16b, v23.16b1448aesmc v3.16b, v3.16b //AES block 4k+7 - round 51449ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext14501451add w12, w12, #1 //CTR block 4k+81452eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid1453eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result14541455aese v2.16b, v25.16b1456aesmc v2.16b, v2.16b //AES block 4k+6 - round 71457ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext14581459aese v3.16b, v24.16b1460aesmc v3.16b, v3.16b //AES block 4k+7 - round 614611462rev64 v5.16b, v5.16b //GHASH block 4k+51463eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid1464mov x7, v0.d[1] //AES block 4k+4 - mov high14651466aese v2.16b, v26.16b1467aesmc v2.16b, v2.16b //AES block 4k+6 - round 81468mov x6, v0.d[0] //AES block 4k+4 - mov low14691470aese v3.16b, v25.16b1471aesmc v3.16b, v3.16b //AES block 4k+7 - round 71472fmov d0, x10 //CTR block 4k+814731474pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low1475fmov v0.d[1], x9 //CTR block 4k+81476rev w9, w12 //CTR block 4k+914771478aese v2.16b, v27.16b //AES block 4k+6 - round 91479orr x9, x11, x9, lsl #32 //CTR block 4k+91480ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment14811482aese v3.16b, v26.16b1483aesmc v3.16b, v3.16b //AES block 4k+7 - round 81484eor x7, x7, x14 //AES block 4k+4 - round 10 high1485#ifdef __AARCH64EB__1486rev x7, x71487#endif1488eor v11.16b, v11.16b, v8.16b //MODULO - fold into low1489mov x20, v1.d[1] //AES block 4k+5 - mov high1490eor x6, x6, x13 //AES block 4k+4 - round 10 low1491#ifdef __AARCH64EB__1492rev x6, x61493#endif1494eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - 
result1495mov x19, v1.d[0] //AES block 4k+5 - mov low1496add w12, w12, #1 //CTR block 4k+914971498aese v3.16b, v27.16b //AES block 4k+7 - round 91499fmov d1, x10 //CTR block 4k+91500cmp x0, x5 //.LOOP CONTROL15011502rev64 v4.16b, v4.16b //GHASH block 4k+41503eor v11.16b, v11.16b, v10.16b //MODULO - fold into low1504fmov v1.d[1], x9 //CTR block 4k+915051506rev w9, w12 //CTR block 4k+101507add w12, w12, #1 //CTR block 4k+1015081509eor x20, x20, x14 //AES block 4k+5 - round 10 high1510#ifdef __AARCH64EB__1511rev x20, x201512#endif1513stp x6, x7, [x2], #16 //AES block 4k+4 - store result15141515eor x19, x19, x13 //AES block 4k+5 - round 10 low1516#ifdef __AARCH64EB__1517rev x19, x191518#endif1519stp x19, x20, [x2], #16 //AES block 4k+5 - store result15201521orr x9, x11, x9, lsl #32 //CTR block 4k+101522b.lt .L128_dec_main_loop15231524.L128_dec_prepretail: //PREPRETAIL1525ext v11.16b, v11.16b, v11.16b, #8 //PRE 01526mov x21, v2.d[0] //AES block 4k+2 - mov low1527mov d30, v5.d[1] //GHASH block 4k+1 - mid15281529aese v0.16b, v18.16b1530aesmc v0.16b, v0.16b //AES block 4k+4 - round 01531eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result15321533aese v1.16b, v18.16b1534aesmc v1.16b, v1.16b //AES block 4k+5 - round 01535mov x22, v2.d[1] //AES block 4k+2 - mov high15361537eor v4.16b, v4.16b, v11.16b //PRE 11538fmov d2, x10 //CTR block 4k+61539rev64 v6.16b, v6.16b //GHASH block 4k+215401541aese v0.16b, v19.16b1542aesmc v0.16b, v0.16b //AES block 4k+4 - round 11543fmov v2.d[1], x9 //CTR block 4k+615441545rev w9, w12 //CTR block 4k+71546mov x23, v3.d[0] //AES block 4k+3 - mov low1547eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid15481549pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low1550mov d10, v17.d[1] //GHASH block 4k - mid1551mov x24, v3.d[1] //AES block 4k+3 - mov high15521553aese v1.16b, v19.16b1554aesmc v1.16b, v1.16b //AES block 4k+5 - round 11555mov d31, v6.d[1] //GHASH block 4k+2 - mid15561557aese v0.16b, v20.16b1558aesmc v0.16b, v0.16b //AES block 4k+4 - round 
21559orr x9, x11, x9, lsl #32 //CTR block 4k+715601561pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low1562mov d8, v4.d[1] //GHASH block 4k - mid1563fmov d3, x10 //CTR block 4k+715641565aese v2.16b, v18.16b1566aesmc v2.16b, v2.16b //AES block 4k+6 - round 01567fmov v3.d[1], x9 //CTR block 4k+715681569pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid1570eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid15711572rev64 v7.16b, v7.16b //GHASH block 4k+315731574aese v2.16b, v19.16b1575aesmc v2.16b, v2.16b //AES block 4k+6 - round 11576eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid15771578pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high15791580aese v3.16b, v18.16b1581aesmc v3.16b, v3.16b //AES block 4k+7 - round 01582ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid15831584pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high15851586pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid1587eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low15881589pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low15901591pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid1592eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high15931594eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid15951596pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high15971598pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high1599mov d30, v7.d[1] //GHASH block 4k+3 - mid16001601aese v1.16b, v20.16b1602aesmc v1.16b, v1.16b //AES block 4k+5 - round 21603eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid16041605pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low16061607eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high1608movi v8.8b, #0xc216091610aese v3.16b, v19.16b1611aesmc v3.16b, v3.16b //AES block 4k+7 - round 11612eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid16131614eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low16151616aese v2.16b, v20.16b1617aesmc v2.16b, v2.16b //AES block 4k+6 - round 21618eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high16191620aese v3.16b, 
v20.16b1621aesmc v3.16b, v3.16b //AES block 4k+7 - round 21622eor x23, x23, x13 //AES block 4k+3 - round 10 low1623#ifdef __AARCH64EB__1624rev x23, x231625#endif1626pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid1627eor x21, x21, x13 //AES block 4k+2 - round 10 low1628#ifdef __AARCH64EB__1629rev x21, x211630#endif1631eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low16321633aese v2.16b, v21.16b1634aesmc v2.16b, v2.16b //AES block 4k+6 - round 316351636aese v1.16b, v21.16b1637aesmc v1.16b, v1.16b //AES block 4k+5 - round 31638shl d8, d8, #56 //mod_constant16391640aese v0.16b, v21.16b1641aesmc v0.16b, v0.16b //AES block 4k+4 - round 316421643aese v2.16b, v22.16b1644aesmc v2.16b, v2.16b //AES block 4k+6 - round 41645eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid16461647aese v1.16b, v22.16b1648aesmc v1.16b, v1.16b //AES block 4k+5 - round 416491650aese v3.16b, v21.16b1651aesmc v3.16b, v3.16b //AES block 4k+7 - round 31652eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up16531654aese v2.16b, v23.16b1655aesmc v2.16b, v2.16b //AES block 4k+6 - round 516561657aese v1.16b, v23.16b1658aesmc v1.16b, v1.16b //AES block 4k+5 - round 516591660aese v3.16b, v22.16b1661aesmc v3.16b, v3.16b //AES block 4k+7 - round 416621663aese v0.16b, v22.16b1664aesmc v0.16b, v0.16b //AES block 4k+4 - round 41665eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up16661667pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid16681669aese v1.16b, v24.16b1670aesmc v1.16b, v1.16b //AES block 4k+5 - round 61671ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment16721673aese v3.16b, v23.16b1674aesmc v3.16b, v3.16b //AES block 4k+7 - round 516751676aese v0.16b, v23.16b1677aesmc v0.16b, v0.16b //AES block 4k+4 - round 51678eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid16791680aese v1.16b, v25.16b1681aesmc v1.16b, v1.16b //AES block 4k+5 - round 716821683aese v2.16b, v24.16b1684aesmc v2.16b, v2.16b //AES block 4k+6 - round 616851686aese v0.16b, 
v24.16b1687aesmc v0.16b, v0.16b //AES block 4k+4 - round 616881689aese v1.16b, v26.16b1690aesmc v1.16b, v1.16b //AES block 4k+5 - round 81691eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid16921693aese v3.16b, v24.16b1694aesmc v3.16b, v3.16b //AES block 4k+7 - round 616951696aese v0.16b, v25.16b1697aesmc v0.16b, v0.16b //AES block 4k+4 - round 716981699aese v1.16b, v27.16b //AES block 4k+5 - round 917001701pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low1702eor x24, x24, x14 //AES block 4k+3 - round 10 high1703#ifdef __AARCH64EB__1704rev x24, x241705#endif1706aese v2.16b, v25.16b1707aesmc v2.16b, v2.16b //AES block 4k+6 - round 71708ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment17091710aese v3.16b, v25.16b1711aesmc v3.16b, v3.16b //AES block 4k+7 - round 717121713aese v0.16b, v26.16b1714aesmc v0.16b, v0.16b //AES block 4k+4 - round 81715eor v11.16b, v11.16b, v8.16b //MODULO - fold into low17161717aese v2.16b, v26.16b1718aesmc v2.16b, v2.16b //AES block 4k+6 - round 817191720aese v3.16b, v26.16b1721aesmc v3.16b, v3.16b //AES block 4k+7 - round 81722eor x22, x22, x14 //AES block 4k+2 - round 10 high1723#ifdef __AARCH64EB__1724rev x22, x221725#endif1726aese v0.16b, v27.16b //AES block 4k+4 - round 91727stp x21, x22, [x2], #16 //AES block 4k+2 - store result17281729aese v2.16b, v27.16b //AES block 4k+6 - round 91730add w12, w12, #1 //CTR block 4k+71731stp x23, x24, [x2], #16 //AES block 4k+3 - store result17321733aese v3.16b, v27.16b //AES block 4k+7 - round 91734eor v11.16b, v11.16b, v10.16b //MODULO - fold into low1735.L128_dec_tail: //TAIL17361737sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process1738ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext17391740eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result17411742mov x7, v0.d[1] //AES block 4k+4 - mov high17431744mov x6, v0.d[0] //AES block 4k+4 - mov low17451746cmp x5, #4817471748eor x7, x7, x14 //AES block 4k+4 - round 10 high1749#ifdef 
__AARCH64EB__1750rev x7, x71751#endif1752ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag1753eor x6, x6, x13 //AES block 4k+4 - round 10 low1754#ifdef __AARCH64EB__1755rev x6, x61756#endif1757b.gt .L128_dec_blocks_more_than_317581759mov v3.16b, v2.16b1760sub w12, w12, #11761movi v11.8b, #017621763movi v9.8b, #01764mov v2.16b, v1.16b17651766movi v10.8b, #01767cmp x5, #321768b.gt .L128_dec_blocks_more_than_217691770cmp x5, #1617711772mov v3.16b, v1.16b1773sub w12, w12, #11774b.gt .L128_dec_blocks_more_than_117751776sub w12, w12, #11777b .L128_dec_blocks_less_than_11778.L128_dec_blocks_more_than_3: //blocks left > 31779rev64 v4.16b, v5.16b //GHASH final-3 block1780ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext17811782eor v4.16b, v4.16b, v8.16b //feed in partial tag17831784mov d10, v17.d[1] //GHASH final-3 block - mid1785stp x6, x7, [x2], #16 //AES final-3 block - store result1786eor v0.16b, v5.16b, v1.16b //AES final-2 block - result17871788mov d22, v4.d[1] //GHASH final-3 block - mid1789mov x7, v0.d[1] //AES final-2 block - mov high17901791pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low1792mov x6, v0.d[0] //AES final-2 block - mov low17931794pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high17951796eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid17971798movi v8.8b, #0 //suppress further partial tag feed in1799eor x7, x7, x14 //AES final-2 block - round 10 high1800#ifdef __AARCH64EB__1801rev x7, x71802#endif1803pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid1804eor x6, x6, x13 //AES final-2 block - round 10 low1805#ifdef __AARCH64EB__1806rev x6, x61807#endif1808.L128_dec_blocks_more_than_2: //blocks left > 218091810rev64 v4.16b, v5.16b //GHASH final-2 block1811ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext18121813eor v4.16b, v4.16b, v8.16b //feed in partial tag18141815eor v0.16b, v5.16b, v2.16b //AES final-1 block - result1816stp x6, x7, [x2], #16 //AES final-2 block - store result18171818mov d22, 
v4.d[1] //GHASH final-2 block - mid18191820pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low18211822pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high1823mov x6, v0.d[0] //AES final-1 block - mov low18241825mov x7, v0.d[1] //AES final-1 block - mov high1826eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid18271828movi v8.8b, #0 //suppress further partial tag feed in18291830pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid18311832eor x6, x6, x13 //AES final-1 block - round 10 low1833#ifdef __AARCH64EB__1834rev x6, x61835#endif1836eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low18371838eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high18391840eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid1841eor x7, x7, x14 //AES final-1 block - round 10 high1842#ifdef __AARCH64EB__1843rev x7, x71844#endif1845.L128_dec_blocks_more_than_1: //blocks left > 118461847rev64 v4.16b, v5.16b //GHASH final-1 block18481849ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext1850eor v4.16b, v4.16b, v8.16b //feed in partial tag18511852mov d22, v4.d[1] //GHASH final-1 block - mid18531854eor v0.16b, v5.16b, v3.16b //AES final block - result18551856eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid18571858stp x6, x7, [x2], #16 //AES final-1 block - store result1859mov x6, v0.d[0] //AES final block - mov low18601861mov x7, v0.d[1] //AES final block - mov high1862ins v22.d[1], v22.d[0] //GHASH final-1 block - mid18631864pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low18651866pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high18671868pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid1869movi v8.8b, #0 //suppress further partial tag feed in18701871eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low18721873eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high1874eor x7, x7, x14 //AES final block - round 10 high1875#ifdef __AARCH64EB__1876rev x7, x71877#endif1878eor x6, x6, x13 //AES final block - round 10 
low1879#ifdef __AARCH64EB__1880rev x6, x61881#endif1882eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid1883.L128_dec_blocks_less_than_1: //blocks left <= 118841885mvn x14, xzr //rk10_h = 0xffffffffffffffff1886and x1, x1, #127 //bit_length %= 12818871888mvn x13, xzr //rk10_l = 0xffffffffffffffff1889sub x1, x1, #128 //bit_length -= 12818901891neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])18921893and x1, x1, #127 //bit_length %= 12818941895lsr x14, x14, x1 //rk10_h is mask for top 64b of last block1896cmp x1, #6418971898csel x10, x14, xzr, lt1899csel x9, x13, x14, lt19001901fmov d0, x9 //ctr0b is mask for last block19021903mov v0.d[1], x1019041905and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits19061907rev64 v4.16b, v5.16b //GHASH final block19081909eor v4.16b, v4.16b, v8.16b //feed in partial tag19101911ldp x4, x5, [x2] //load existing bytes we need to not overwrite19121913and x7, x7, x1019141915pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high1916mov d8, v4.d[1] //GHASH final block - mid19171918eor v8.8b, v8.8b, v4.8b //GHASH final block - mid1919eor v9.16b, v9.16b, v20.16b //GHASH final block - high19201921pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid19221923pmull v21.1q, v4.1d, v12.1d //GHASH final block - low1924bic x4, x4, x9 //mask out low existing bytes1925and x6, x6, x919261927#ifndef __AARCH64EB__1928rev w9, w121929#else1930mov w9, w121931#endif19321933eor v10.16b, v10.16b, v8.16b //GHASH final block - mid1934movi v8.8b, #0xc219351936eor v11.16b, v11.16b, v21.16b //GHASH final block - low19371938bic x5, x5, x10 //mask out high existing bytes1939shl d8, d8, #56 //mod_constant19401941eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up19421943pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid19441945eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up19461947orr x6, x6, x41948str w9, [x16, #12] //store the updated counter19491950orr x7, x7, x51951stp x6, x7, 
[x2]1952ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment19531954eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid19551956eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid19571958pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low1959ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment19601961eor v11.16b, v11.16b, v8.16b //MODULO - fold into low19621963eor v11.16b, v11.16b, v10.16b //MODULO - fold into low1964ext v11.16b, v11.16b, v11.16b, #81965rev64 v11.16b, v11.16b1966mov x0, x151967st1 { v11.16b }, [x3]19681969ldp x21, x22, [sp, #16]1970ldp x23, x24, [sp, #32]1971ldp d8, d9, [sp, #48]1972ldp d10, d11, [sp, #64]1973ldp d12, d13, [sp, #80]1974ldp d14, d15, [sp, #96]1975ldp x19, x20, [sp], #1121976ret19771978.L128_dec_ret:1979mov w0, #0x01980ret1981.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel1982.globl aes_gcm_enc_192_kernel1983.type aes_gcm_enc_192_kernel,%function1984.align 41985aes_gcm_enc_192_kernel:1986AARCH64_VALID_CALL_TARGET1987cbz x1, .L192_enc_ret1988stp x19, x20, [sp, #-112]!1989mov x16, x41990mov x8, x51991stp x21, x22, [sp, #16]1992stp x23, x24, [sp, #32]1993stp d8, d9, [sp, #48]1994stp d10, d11, [sp, #64]1995stp d12, d13, [sp, #80]1996stp d14, d15, [sp, #96]19971998ldp x10, x11, [x16] //ctr96_b64, ctr96_t321999#ifdef __AARCH64EB__2000rev x10, x102001rev x11, x112002#endif2003ldp x13, x14, [x8, #192] //load rk122004#ifdef __AARCH64EB__2005ror x13, x13, #322006ror x14, x14, #322007#endif2008ld1 {v18.4s}, [x8], #16 //load rk020092010ld1 {v19.4s}, [x8], #16 //load rk120112012ld1 {v20.4s}, [x8], #16 //load rk220132014lsr x12, x11, #322015ld1 {v21.4s}, [x8], #16 //load rk32016orr w11, w11, w1120172018ld1 {v22.4s}, [x8], #16 //load rk42019rev w12, w12 //rev_ctr3220202021add w12, w12, #1 //increment rev_ctr322022fmov d3, x10 //CTR block 320232024rev w9, w12 //CTR block 12025add w12, w12, #1 //CTR block 12026fmov d1, x10 //CTR block 120272028orr x9, x11, x9, lsl #32 //CTR block 12029ld1 { v0.16b}, [x16] 
//special case vector load initial counter so we can start first AES block as quickly as possible20302031fmov v1.d[1], x9 //CTR block 12032rev w9, w12 //CTR block 22033add w12, w12, #1 //CTR block 220342035fmov d2, x10 //CTR block 22036orr x9, x11, x9, lsl #32 //CTR block 220372038fmov v2.d[1], x9 //CTR block 22039rev w9, w12 //CTR block 320402041orr x9, x11, x9, lsl #32 //CTR block 32042ld1 {v23.4s}, [x8], #16 //load rk520432044fmov v3.d[1], x9 //CTR block 320452046ld1 {v24.4s}, [x8], #16 //load rk620472048ld1 {v25.4s}, [x8], #16 //load rk720492050aese v0.16b, v18.16b2051aesmc v0.16b, v0.16b //AES block 0 - round 02052ld1 { v11.16b}, [x3]2053ext v11.16b, v11.16b, v11.16b, #82054rev64 v11.16b, v11.16b20552056aese v3.16b, v18.16b2057aesmc v3.16b, v3.16b //AES block 3 - round 02058ld1 {v26.4s}, [x8], #16 //load rk820592060aese v1.16b, v18.16b2061aesmc v1.16b, v1.16b //AES block 1 - round 02062ldr q15, [x3, #112] //load h4l | h4h2063#ifndef __AARCH64EB__2064ext v15.16b, v15.16b, v15.16b, #82065#endif2066aese v2.16b, v18.16b2067aesmc v2.16b, v2.16b //AES block 2 - round 02068ld1 {v27.4s}, [x8], #16 //load rk920692070aese v0.16b, v19.16b2071aesmc v0.16b, v0.16b //AES block 0 - round 12072ld1 {v28.4s}, [x8], #16 //load rk1020732074aese v1.16b, v19.16b2075aesmc v1.16b, v1.16b //AES block 1 - round 12076ldr q12, [x3, #32] //load h1l | h1h2077#ifndef __AARCH64EB__2078ext v12.16b, v12.16b, v12.16b, #82079#endif2080aese v2.16b, v19.16b2081aesmc v2.16b, v2.16b //AES block 2 - round 12082ld1 {v29.4s}, [x8], #16 //load rk1120832084aese v3.16b, v19.16b2085aesmc v3.16b, v3.16b //AES block 3 - round 12086ldr q14, [x3, #80] //load h3l | h3h2087#ifndef __AARCH64EB__2088ext v14.16b, v14.16b, v14.16b, #82089#endif2090aese v0.16b, v20.16b2091aesmc v0.16b, v0.16b //AES block 0 - round 220922093aese v2.16b, v20.16b2094aesmc v2.16b, v2.16b //AES block 2 - round 220952096aese v3.16b, v20.16b2097aesmc v3.16b, v3.16b //AES block 3 - round 220982099aese v0.16b, v21.16b2100aesmc v0.16b, v0.16b 
//AES block 0 - round 32101trn1 v9.2d, v14.2d, v15.2d //h4h | h3h21022103aese v2.16b, v21.16b2104aesmc v2.16b, v2.16b //AES block 2 - round 321052106aese v1.16b, v20.16b2107aesmc v1.16b, v1.16b //AES block 1 - round 22108trn2 v17.2d, v14.2d, v15.2d //h4l | h3l21092110aese v0.16b, v22.16b2111aesmc v0.16b, v0.16b //AES block 0 - round 421122113aese v3.16b, v21.16b2114aesmc v3.16b, v3.16b //AES block 3 - round 321152116aese v1.16b, v21.16b2117aesmc v1.16b, v1.16b //AES block 1 - round 321182119aese v0.16b, v23.16b2120aesmc v0.16b, v0.16b //AES block 0 - round 521212122aese v2.16b, v22.16b2123aesmc v2.16b, v2.16b //AES block 2 - round 421242125aese v1.16b, v22.16b2126aesmc v1.16b, v1.16b //AES block 1 - round 421272128aese v0.16b, v24.16b2129aesmc v0.16b, v0.16b //AES block 0 - round 621302131aese v3.16b, v22.16b2132aesmc v3.16b, v3.16b //AES block 3 - round 421332134aese v2.16b, v23.16b2135aesmc v2.16b, v2.16b //AES block 2 - round 521362137aese v1.16b, v23.16b2138aesmc v1.16b, v1.16b //AES block 1 - round 521392140aese v3.16b, v23.16b2141aesmc v3.16b, v3.16b //AES block 3 - round 521422143aese v2.16b, v24.16b2144aesmc v2.16b, v2.16b //AES block 2 - round 62145ldr q13, [x3, #64] //load h2l | h2h2146#ifndef __AARCH64EB__2147ext v13.16b, v13.16b, v13.16b, #82148#endif2149aese v1.16b, v24.16b2150aesmc v1.16b, v1.16b //AES block 1 - round 621512152aese v3.16b, v24.16b2153aesmc v3.16b, v3.16b //AES block 3 - round 621542155aese v0.16b, v25.16b2156aesmc v0.16b, v0.16b //AES block 0 - round 721572158aese v1.16b, v25.16b2159aesmc v1.16b, v1.16b //AES block 1 - round 72160trn2 v16.2d, v12.2d, v13.2d //h2l | h1l21612162aese v3.16b, v25.16b2163aesmc v3.16b, v3.16b //AES block 3 - round 721642165aese v0.16b, v26.16b2166aesmc v0.16b, v0.16b //AES block 0 - round 821672168aese v2.16b, v25.16b2169aesmc v2.16b, v2.16b //AES block 2 - round 72170trn1 v8.2d, v12.2d, v13.2d //h2h | h1h21712172aese v1.16b, v26.16b2173aesmc v1.16b, v1.16b //AES block 1 - round 821742175aese v3.16b, 
v26.16b2176aesmc v3.16b, v3.16b //AES block 3 - round 821772178aese v2.16b, v26.16b2179aesmc v2.16b, v2.16b //AES block 2 - round 821802181aese v0.16b, v27.16b2182aesmc v0.16b, v0.16b //AES block 0 - round 921832184aese v3.16b, v27.16b2185aesmc v3.16b, v3.16b //AES block 3 - round 921862187aese v2.16b, v27.16b2188aesmc v2.16b, v2.16b //AES block 2 - round 921892190aese v1.16b, v27.16b2191aesmc v1.16b, v1.16b //AES block 1 - round 921922193aese v0.16b, v28.16b2194aesmc v0.16b, v0.16b //AES block 0 - round 1021952196aese v2.16b, v28.16b2197aesmc v2.16b, v2.16b //AES block 2 - round 1021982199aese v1.16b, v28.16b2200aesmc v1.16b, v1.16b //AES block 1 - round 102201lsr x5, x1, #3 //byte_len2202mov x15, x522032204aese v3.16b, v28.16b2205aesmc v3.16b, v3.16b //AES block 3 - round 102206sub x5, x5, #1 //byte_len - 122072208eor v16.16b, v16.16b, v8.16b //h2k | h1k2209and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)22102211eor v17.16b, v17.16b, v9.16b //h4k | h3k22122213aese v2.16b, v29.16b //AES block 2 - round 112214add x4, x0, x1, lsr #3 //end_input_ptr2215add x5, x5, x022162217aese v1.16b, v29.16b //AES block 1 - round 112218cmp x0, x5 //check if we have <= 4 blocks22192220aese v0.16b, v29.16b //AES block 0 - round 112221add w12, w12, #1 //CTR block 322222223aese v3.16b, v29.16b //AES block 3 - round 112224b.ge .L192_enc_tail //handle tail22252226rev w9, w12 //CTR block 42227ldp x6, x7, [x0, #0] //AES block 0 - load plaintext2228#ifdef __AARCH64EB__2229rev x6, x62230rev x7, x72231#endif2232orr x9, x11, x9, lsl #32 //CTR block 42233ldp x21, x22, [x0, #32] //AES block 2 - load plaintext2234#ifdef __AARCH64EB__2235rev x21, x212236rev x22, x222237#endif2238ldp x23, x24, [x0, #48] //AES block 3 - load plaintext2239#ifdef __AARCH64EB__2240rev x23, x232241rev x24, x242242#endif2243ldp x19, x20, [x0, #16] //AES block 1 - load plaintext2244#ifdef __AARCH64EB__2245rev x19, x192246rev x20, x202247#endif2248add 
x0, x0, #64 //AES input_ptr update2249cmp x0, x5 //check if we have <= 8 blocks22502251eor x6, x6, x13 //AES block 0 - round 12 low22522253eor x7, x7, x14 //AES block 0 - round 12 high2254eor x22, x22, x14 //AES block 2 - round 12 high2255fmov d4, x6 //AES block 0 - mov low22562257eor x24, x24, x14 //AES block 3 - round 12 high2258fmov v4.d[1], x7 //AES block 0 - mov high22592260eor x21, x21, x13 //AES block 2 - round 12 low2261eor x19, x19, x13 //AES block 1 - round 12 low22622263fmov d5, x19 //AES block 1 - mov low2264eor x20, x20, x14 //AES block 1 - round 12 high22652266fmov v5.d[1], x20 //AES block 1 - mov high22672268eor x23, x23, x13 //AES block 3 - round 12 low2269fmov d6, x21 //AES block 2 - mov low22702271add w12, w12, #1 //CTR block 42272eor v4.16b, v4.16b, v0.16b //AES block 0 - result2273fmov d0, x10 //CTR block 422742275fmov v0.d[1], x9 //CTR block 42276rev w9, w12 //CTR block 522772278orr x9, x11, x9, lsl #32 //CTR block 52279add w12, w12, #1 //CTR block 522802281fmov d7, x23 //AES block 3 - mov low2282st1 { v4.16b}, [x2], #16 //AES block 0 - store result22832284fmov v6.d[1], x22 //AES block 2 - mov high22852286eor v5.16b, v5.16b, v1.16b //AES block 1 - result2287fmov d1, x10 //CTR block 52288st1 { v5.16b}, [x2], #16 //AES block 1 - store result22892290fmov v7.d[1], x24 //AES block 3 - mov high22912292fmov v1.d[1], x9 //CTR block 52293rev w9, w12 //CTR block 622942295orr x9, x11, x9, lsl #32 //CTR block 622962297add w12, w12, #1 //CTR block 62298eor v6.16b, v6.16b, v2.16b //AES block 2 - result2299fmov d2, x10 //CTR block 623002301fmov v2.d[1], x9 //CTR block 62302rev w9, w12 //CTR block 723032304orr x9, x11, x9, lsl #32 //CTR block 72305st1 { v6.16b}, [x2], #16 //AES block 2 - store result23062307eor v7.16b, v7.16b, v3.16b //AES block 3 - result2308st1 { v7.16b}, [x2], #16 //AES block 3 - store result2309b.ge .L192_enc_prepretail //do prepretail23102311.L192_enc_main_loop: //main loop start2312aese v2.16b, v18.16b2313aesmc v2.16b, v2.16b //AES block 
4k+6 - round 02314rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)23152316aese v1.16b, v18.16b2317aesmc v1.16b, v1.16b //AES block 4k+5 - round 02318ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext2319#ifdef __AARCH64EB__2320rev x19, x192321rev x20, x202322#endif2323ext v11.16b, v11.16b, v11.16b, #8 //PRE 02324fmov d3, x10 //CTR block 4k+32325rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)23262327aese v2.16b, v19.16b2328aesmc v2.16b, v2.16b //AES block 4k+6 - round 12329fmov v3.d[1], x9 //CTR block 4k+323302331pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high2332rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)2333ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext2334#ifdef __AARCH64EB__2335rev x21, x212336rev x22, x222337#endif2338aese v0.16b, v18.16b2339aesmc v0.16b, v0.16b //AES block 4k+4 - round 02340ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext2341#ifdef __AARCH64EB__2342rev x23, x232343rev x24, x242344#endif2345pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low2346eor v4.16b, v4.16b, v11.16b //PRE 123472348aese v1.16b, v19.16b2349aesmc v1.16b, v1.16b //AES block 4k+5 - round 123502351aese v0.16b, v19.16b2352aesmc v0.16b, v0.16b //AES block 4k+4 - round 12353rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)23542355aese v3.16b, v18.16b2356aesmc v3.16b, v3.16b //AES block 4k+7 - round 02357eor x24, x24, x14 //AES block 4k+3 - round 12 high23582359pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low2360mov d8, v4.d[1] //GHASH block 4k - mid23612362aese v0.16b, v20.16b2363aesmc v0.16b, v0.16b //AES block 4k+4 - round 223642365aese v3.16b, v19.16b2366aesmc v3.16b, v3.16b //AES block 4k+7 - round 12367eor x21, x21, x13 //AES block 4k+6 - round 12 low23682369eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid2370eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low23712372aese v0.16b, v21.16b2373aesmc v0.16b, v0.16b //AES block 4k+4 - round 32374eor x19, x19, x13 //AES block 4k+5 - round 12 
low23752376aese v1.16b, v20.16b2377aesmc v1.16b, v1.16b //AES block 4k+5 - round 22378mov d31, v6.d[1] //GHASH block 4k+2 - mid23792380pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high2381mov d4, v5.d[1] //GHASH block 4k+1 - mid23822383aese v2.16b, v20.16b2384aesmc v2.16b, v2.16b //AES block 4k+6 - round 223852386aese v1.16b, v21.16b2387aesmc v1.16b, v1.16b //AES block 4k+5 - round 323882389mov d10, v17.d[1] //GHASH block 4k - mid2390eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high23912392aese v3.16b, v20.16b2393aesmc v3.16b, v3.16b //AES block 4k+7 - round 22394eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid23952396pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high23972398aese v0.16b, v22.16b2399aesmc v0.16b, v0.16b //AES block 4k+4 - round 42400eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid24012402aese v3.16b, v21.16b2403aesmc v3.16b, v3.16b //AES block 4k+7 - round 324042405pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high2406eor x20, x20, x14 //AES block 4k+5 - round 12 high2407ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid24082409aese v0.16b, v23.16b2410aesmc v0.16b, v0.16b //AES block 4k+4 - round 52411add w12, w12, #1 //CTR block 4k+324122413aese v3.16b, v22.16b2414aesmc v3.16b, v3.16b //AES block 4k+7 - round 42415eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high24162417pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid2418eor x22, x22, x14 //AES block 4k+6 - round 12 high24192420pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid2421eor x23, x23, x13 //AES block 4k+3 - round 12 low2422mov d30, v7.d[1] //GHASH block 4k+3 - mid24232424pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid2425rev w9, w12 //CTR block 4k+824262427pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low2428orr x9, x11, x9, lsl #32 //CTR block 4k+824292430aese v2.16b, v21.16b2431aesmc v2.16b, v2.16b //AES block 4k+6 - round 32432eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid24332434aese v1.16b, v22.16b2435aesmc v1.16b, v1.16b //AES block 4k+5 - round 
42436ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext2437#ifdef __AARCH64EB__2438rev x6, x62439rev x7, x72440#endif2441aese v0.16b, v24.16b2442aesmc v0.16b, v0.16b //AES block 4k+4 - round 62443eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low24442445aese v2.16b, v22.16b2446aesmc v2.16b, v2.16b //AES block 4k+6 - round 42447add x0, x0, #64 //AES input_ptr update24482449aese v1.16b, v23.16b2450aesmc v1.16b, v1.16b //AES block 4k+5 - round 52451movi v8.8b, #0xc224522453pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low2454eor x7, x7, x14 //AES block 4k+4 - round 12 high2455eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid24562457aese v2.16b, v23.16b2458aesmc v2.16b, v2.16b //AES block 4k+6 - round 52459eor x6, x6, x13 //AES block 4k+4 - round 12 low24602461aese v1.16b, v24.16b2462aesmc v1.16b, v1.16b //AES block 4k+5 - round 62463shl d8, d8, #56 //mod_constant24642465aese v3.16b, v23.16b2466aesmc v3.16b, v3.16b //AES block 4k+7 - round 52467eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high24682469aese v0.16b, v25.16b2470aesmc v0.16b, v0.16b //AES block 4k+4 - round 72471fmov d5, x19 //AES block 4k+5 - mov low24722473aese v1.16b, v25.16b2474aesmc v1.16b, v1.16b //AES block 4k+5 - round 72475eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid24762477aese v3.16b, v24.16b2478aesmc v3.16b, v3.16b //AES block 4k+7 - round 62479fmov v5.d[1], x20 //AES block 4k+5 - mov high24802481aese v0.16b, v26.16b2482aesmc v0.16b, v0.16b //AES block 4k+4 - round 82483eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low24842485pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid2486cmp x0, x5 //.LOOP CONTROL2487fmov d4, x6 //AES block 4k+4 - mov low24882489aese v2.16b, v24.16b2490aesmc v2.16b, v2.16b //AES block 4k+6 - round 62491fmov v4.d[1], x7 //AES block 4k+4 - mov high24922493aese v1.16b, v26.16b2494aesmc v1.16b, v1.16b //AES block 4k+5 - round 82495fmov d7, x23 //AES block 4k+3 - mov low24962497eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid2498eor 
v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up2499add w12, w12, #1 //CTR block 4k+825002501aese v2.16b, v25.16b2502aesmc v2.16b, v2.16b //AES block 4k+6 - round 72503fmov v7.d[1], x24 //AES block 4k+3 - mov high25042505pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid2506ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment2507fmov d6, x21 //AES block 4k+6 - mov low25082509aese v3.16b, v25.16b2510aesmc v3.16b, v3.16b //AES block 4k+7 - round 725112512aese v0.16b, v27.16b2513aesmc v0.16b, v0.16b //AES block 4k+4 - round 92514eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up25152516aese v2.16b, v26.16b2517aesmc v2.16b, v2.16b //AES block 4k+6 - round 825182519aese v3.16b, v26.16b2520aesmc v3.16b, v3.16b //AES block 4k+7 - round 825212522aese v1.16b, v27.16b2523aesmc v1.16b, v1.16b //AES block 4k+5 - round 925242525aese v0.16b, v28.16b2526aesmc v0.16b, v0.16b //AES block 4k+4 - round 102527eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid25282529aese v3.16b, v27.16b2530aesmc v3.16b, v3.16b //AES block 4k+7 - round 925312532aese v2.16b, v27.16b2533aesmc v2.16b, v2.16b //AES block 4k+6 - round 925342535aese v0.16b, v29.16b //AES block 4k+4 - round 1125362537aese v1.16b, v28.16b2538aesmc v1.16b, v1.16b //AES block 4k+5 - round 102539eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid25402541aese v2.16b, v28.16b2542aesmc v2.16b, v2.16b //AES block 4k+6 - round 1025432544eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result2545fmov d0, x10 //CTR block 4k+825462547aese v1.16b, v29.16b //AES block 4k+5 - round 112548fmov v0.d[1], x9 //CTR block 4k+82549rev w9, w12 //CTR block 4k+925502551pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low2552fmov v6.d[1], x22 //AES block 4k+6 - mov high2553st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result25542555aese v3.16b, v28.16b2556aesmc v3.16b, v3.16b //AES block 4k+7 - round 102557orr x9, x11, x9, lsl #32 //CTR block 4k+925582559eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - 
result2560add w12, w12, #1 //CTR block 4k+92561fmov d1, x10 //CTR block 4k+925622563aese v2.16b, v29.16b //AES block 4k+6 - round 112564fmov v1.d[1], x9 //CTR block 4k+92565rev w9, w12 //CTR block 4k+1025662567add w12, w12, #1 //CTR block 4k+102568ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment2569orr x9, x11, x9, lsl #32 //CTR block 4k+1025702571st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result2572eor v11.16b, v11.16b, v9.16b //MODULO - fold into low25732574aese v3.16b, v29.16b //AES block 4k+7 - round 112575eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result2576fmov d2, x10 //CTR block 4k+1025772578st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result2579fmov v2.d[1], x9 //CTR block 4k+102580rev w9, w12 //CTR block 4k+1125812582eor v11.16b, v11.16b, v10.16b //MODULO - fold into low2583orr x9, x11, x9, lsl #32 //CTR block 4k+1125842585eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result2586st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result2587b.lt .L192_enc_main_loop25882589.L192_enc_prepretail: //PREPRETAIL2590aese v0.16b, v18.16b2591aesmc v0.16b, v0.16b //AES block 4k+4 - round 02592rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)25932594fmov d3, x10 //CTR block 4k+32595ext v11.16b, v11.16b, v11.16b, #8 //PRE 02596add w12, w12, #1 //CTR block 4k+325972598aese v1.16b, v18.16b2599aesmc v1.16b, v1.16b //AES block 4k+5 - round 02600rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)26012602aese v2.16b, v18.16b2603aesmc v2.16b, v2.16b //AES block 4k+6 - round 026042605fmov v3.d[1], x9 //CTR block 4k+32606eor v4.16b, v4.16b, v11.16b //PRE 12607mov d10, v17.d[1] //GHASH block 4k - mid26082609aese v1.16b, v19.16b2610aesmc v1.16b, v1.16b //AES block 4k+5 - round 12611rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)26122613pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high26142615pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low2616mov d8, v4.d[1] //GHASH block 4k - mid26172618pmull v31.1q, v5.1d, v14.1d 
//GHASH block 4k+1 - low2619rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)26202621pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high26222623eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid2624mov d4, v5.d[1] //GHASH block 4k+1 - mid26252626eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low2627mov d31, v6.d[1] //GHASH block 4k+2 - mid26282629aese v3.16b, v18.16b2630aesmc v3.16b, v3.16b //AES block 4k+7 - round 02631eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high26322633pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high26342635eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid2636eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid26372638aese v3.16b, v19.16b2639aesmc v3.16b, v3.16b //AES block 4k+7 - round 126402641aese v2.16b, v19.16b2642aesmc v2.16b, v2.16b //AES block 4k+6 - round 12643eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high26442645aese v0.16b, v19.16b2646aesmc v0.16b, v0.16b //AES block 4k+4 - round 126472648aese v1.16b, v20.16b2649aesmc v1.16b, v1.16b //AES block 4k+5 - round 22650mov d30, v7.d[1] //GHASH block 4k+3 - mid26512652pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high2653ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid26542655aese v0.16b, v20.16b2656aesmc v0.16b, v0.16b //AES block 4k+4 - round 226572658pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid2659eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid26602661aese v1.16b, v21.16b2662aesmc v1.16b, v1.16b //AES block 4k+5 - round 326632664pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid26652666pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid26672668pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid2669eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high26702671pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low26722673aese v0.16b, v21.16b2674aesmc v0.16b, v0.16b //AES block 4k+4 - round 32675eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid26762677aese v3.16b, v20.16b2678aesmc v3.16b, v3.16b //AES block 4k+7 - round 226792680aese v2.16b, 
v20.16b2681aesmc v2.16b, v2.16b //AES block 4k+6 - round 22682eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low26832684aese v0.16b, v22.16b2685aesmc v0.16b, v0.16b //AES block 4k+4 - round 426862687aese v3.16b, v21.16b2688aesmc v3.16b, v3.16b //AES block 4k+7 - round 32689eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid26902691aese v2.16b, v21.16b2692aesmc v2.16b, v2.16b //AES block 4k+6 - round 326932694pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low2695movi v8.8b, #0xc226962697aese v3.16b, v22.16b2698aesmc v3.16b, v3.16b //AES block 4k+7 - round 426992700aese v2.16b, v22.16b2701aesmc v2.16b, v2.16b //AES block 4k+6 - round 427022703aese v1.16b, v22.16b2704aesmc v1.16b, v1.16b //AES block 4k+5 - round 42705eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid27062707aese v3.16b, v23.16b2708aesmc v3.16b, v3.16b //AES block 4k+7 - round 527092710aese v2.16b, v23.16b2711aesmc v2.16b, v2.16b //AES block 4k+6 - round 527122713aese v1.16b, v23.16b2714aesmc v1.16b, v1.16b //AES block 4k+5 - round 52715eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low27162717aese v0.16b, v23.16b2718aesmc v0.16b, v0.16b //AES block 4k+4 - round 527192720aese v3.16b, v24.16b2721aesmc v3.16b, v3.16b //AES block 4k+7 - round 62722eor v10.16b, v10.16b, v9.16b //karatsuba tidy up27232724aese v1.16b, v24.16b2725aesmc v1.16b, v1.16b //AES block 4k+5 - round 627262727aese v0.16b, v24.16b2728aesmc v0.16b, v0.16b //AES block 4k+4 - round 62729shl d8, d8, #56 //mod_constant27302731aese v3.16b, v25.16b2732aesmc v3.16b, v3.16b //AES block 4k+7 - round 727332734aese v1.16b, v25.16b2735aesmc v1.16b, v1.16b //AES block 4k+5 - round 72736eor v10.16b, v10.16b, v11.16b27372738aese v0.16b, v25.16b2739aesmc v0.16b, v0.16b //AES block 4k+4 - round 727402741pmull v30.1q, v9.1d, v8.1d27422743aese v2.16b, v24.16b2744aesmc v2.16b, v2.16b //AES block 4k+6 - round 62745ext v9.16b, v9.16b, v9.16b, #827462747aese v0.16b, v26.16b2748aesmc v0.16b, v0.16b //AES block 4k+4 - round 827492750aese v1.16b, 
v26.16b2751aesmc v1.16b, v1.16b //AES block 4k+5 - round 82752eor v10.16b, v10.16b, v30.16b27532754aese v2.16b, v25.16b2755aesmc v2.16b, v2.16b //AES block 4k+6 - round 727562757aese v3.16b, v26.16b2758aesmc v3.16b, v3.16b //AES block 4k+7 - round 827592760aese v0.16b, v27.16b2761aesmc v0.16b, v0.16b //AES block 4k+4 - round 927622763aese v2.16b, v26.16b2764aesmc v2.16b, v2.16b //AES block 4k+6 - round 82765eor v10.16b, v10.16b, v9.16b27662767aese v3.16b, v27.16b2768aesmc v3.16b, v3.16b //AES block 4k+7 - round 927692770aese v1.16b, v27.16b2771aesmc v1.16b, v1.16b //AES block 4k+5 - round 927722773aese v2.16b, v27.16b2774aesmc v2.16b, v2.16b //AES block 4k+6 - round 927752776pmull v30.1q, v10.1d, v8.1d27772778ext v10.16b, v10.16b, v10.16b, #827792780aese v3.16b, v28.16b2781aesmc v3.16b, v3.16b //AES block 4k+7 - round 1027822783aese v0.16b, v28.16b2784aesmc v0.16b, v0.16b //AES block 4k+4 - round 1027852786aese v2.16b, v28.16b2787aesmc v2.16b, v2.16b //AES block 4k+6 - round 1027882789aese v1.16b, v28.16b2790aesmc v1.16b, v1.16b //AES block 4k+5 - round 102791eor v11.16b, v11.16b, v30.16b27922793aese v0.16b, v29.16b //AES block 4k+4 - round 1127942795aese v3.16b, v29.16b //AES block 4k+7 - round 1127962797aese v2.16b, v29.16b //AES block 4k+6 - round 1127982799aese v1.16b, v29.16b //AES block 4k+5 - round 112800eor v11.16b, v11.16b, v10.16b2801.L192_enc_tail: //TAIL28022803sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process2804ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext2805#ifdef __AARCH64EB__2806rev x6, x62807rev x7, x72808#endif2809eor x6, x6, x13 //AES block 4k+4 - round 12 low2810eor x7, x7, x14 //AES block 4k+4 - round 12 high28112812fmov d4, x6 //AES block 4k+4 - mov low28132814fmov v4.d[1], x7 //AES block 4k+4 - mov high2815cmp x5, #4828162817eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result28182819ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag2820b.gt .L192_enc_blocks_more_than_328212822sub w12, w12, 
#12823movi v10.8b, #028242825mov v3.16b, v2.16b2826movi v9.8b, #02827cmp x5, #3228282829mov v2.16b, v1.16b2830movi v11.8b, #02831b.gt .L192_enc_blocks_more_than_228322833sub w12, w12, #128342835mov v3.16b, v1.16b2836cmp x5, #162837b.gt .L192_enc_blocks_more_than_128382839sub w12, w12, #12840b .L192_enc_blocks_less_than_12841.L192_enc_blocks_more_than_3: //blocks left > 32842st1 { v5.16b}, [x2], #16 //AES final-3 block - store result28432844ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high2845#ifdef __AARCH64EB__2846rev x6, x62847rev x7, x72848#endif2849rev64 v4.16b, v5.16b //GHASH final-3 block28502851eor x6, x6, x13 //AES final-2 block - round 12 low2852eor v4.16b, v4.16b, v8.16b //feed in partial tag28532854eor x7, x7, x14 //AES final-2 block - round 12 high2855fmov d5, x6 //AES final-2 block - mov low28562857fmov v5.d[1], x7 //AES final-2 block - mov high28582859mov d22, v4.d[1] //GHASH final-3 block - mid28602861pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low28622863mov d10, v17.d[1] //GHASH final-3 block - mid28642865eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid28662867movi v8.8b, #0 //suppress further partial tag feed in28682869pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high28702871pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid2872eor v5.16b, v5.16b, v1.16b //AES final-2 block - result2873.L192_enc_blocks_more_than_2: //blocks left > 228742875st1 { v5.16b}, [x2], #16 //AES final-2 block - store result28762877rev64 v4.16b, v5.16b //GHASH final-2 block2878ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high2879#ifdef __AARCH64EB__2880rev x6, x62881rev x7, x72882#endif2883eor v4.16b, v4.16b, v8.16b //feed in partial tag28842885eor x7, x7, x14 //AES final-1 block - round 12 high28862887pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high2888mov d22, v4.d[1] //GHASH final-2 block - mid28892890pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low2891eor x6, x6, x13 //AES final-1 block - round 12 
low28922893fmov d5, x6 //AES final-1 block - mov low28942895fmov v5.d[1], x7 //AES final-1 block - mov high2896eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high2897eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid28982899eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low29002901pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid29022903movi v8.8b, #0 //suppress further partial tag feed in29042905eor v5.16b, v5.16b, v2.16b //AES final-1 block - result29062907eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid2908.L192_enc_blocks_more_than_1: //blocks left > 129092910st1 { v5.16b}, [x2], #16 //AES final-1 block - store result29112912ldp x6, x7, [x0], #16 //AES final block - load input low & high2913#ifdef __AARCH64EB__2914rev x6, x62915rev x7, x72916#endif2917rev64 v4.16b, v5.16b //GHASH final-1 block29182919eor x6, x6, x13 //AES final block - round 12 low2920eor v4.16b, v4.16b, v8.16b //feed in partial tag2921movi v8.8b, #0 //suppress further partial tag feed in29222923mov d22, v4.d[1] //GHASH final-1 block - mid29242925eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid2926eor x7, x7, x14 //AES final block - round 12 high2927fmov d5, x6 //AES final block - mov low29282929pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high2930fmov v5.d[1], x7 //AES final block - mov high29312932ins v22.d[1], v22.d[0] //GHASH final-1 block - mid29332934eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high29352936pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low29372938pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid29392940eor v5.16b, v5.16b, v3.16b //AES final block - result29412942eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low29432944eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid2945.L192_enc_blocks_less_than_1: //blocks left <= 129462947ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored2948#ifndef __AARCH64EB__2949rev w9, w122950#else2951mov w9, 
w122952#endif2953and x1, x1, #127 //bit_length %= 12829542955sub x1, x1, #128 //bit_length -= 1282956mvn x14, xzr //rk12_h = 0xffffffffffffffff29572958neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])2959mvn x13, xzr //rk12_l = 0xffffffffffffffff29602961and x1, x1, #127 //bit_length %= 12829622963lsr x14, x14, x1 //rk12_h is mask for top 64b of last block2964cmp x1, #6429652966csel x6, x13, x14, lt2967csel x7, x14, xzr, lt29682969fmov d0, x6 //ctr0b is mask for last block29702971fmov v0.d[1], x729722973and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits29742975rev64 v4.16b, v5.16b //GHASH final block29762977eor v4.16b, v4.16b, v8.16b //feed in partial tag29782979mov d8, v4.d[1] //GHASH final block - mid29802981pmull v21.1q, v4.1d, v12.1d //GHASH final block - low29822983pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high29842985eor v8.8b, v8.8b, v4.8b //GHASH final block - mid29862987eor v11.16b, v11.16b, v21.16b //GHASH final block - low29882989eor v9.16b, v9.16b, v20.16b //GHASH final block - high29902991pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid29922993eor v10.16b, v10.16b, v8.16b //GHASH final block - mid2994movi v8.8b, #0xc229952996eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up29972998shl d8, d8, #56 //mod_constant29993000bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing30013002eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up30033004pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid30053006ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment30073008eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid30093010eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid30113012pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low30133014ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment30153016eor v11.16b, v11.16b, v9.16b //MODULO - fold into low3017str w9, [x16, #12] //store the updated counter30183019st1 { 
v5.16b}, [x2] //store all 16B30203021eor v11.16b, v11.16b, v10.16b //MODULO - fold into low3022ext v11.16b, v11.16b, v11.16b, #83023rev64 v11.16b, v11.16b3024mov x0, x153025st1 { v11.16b }, [x3]30263027ldp x21, x22, [sp, #16]3028ldp x23, x24, [sp, #32]3029ldp d8, d9, [sp, #48]3030ldp d10, d11, [sp, #64]3031ldp d12, d13, [sp, #80]3032ldp d14, d15, [sp, #96]3033ldp x19, x20, [sp], #1123034ret30353036.L192_enc_ret:3037mov w0, #0x03038ret3039.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel3040.globl aes_gcm_dec_192_kernel3041.type aes_gcm_dec_192_kernel,%function3042.align 43043aes_gcm_dec_192_kernel:3044AARCH64_VALID_CALL_TARGET3045cbz x1, .L192_dec_ret3046stp x19, x20, [sp, #-112]!3047mov x16, x43048mov x8, x53049stp x21, x22, [sp, #16]3050stp x23, x24, [sp, #32]3051stp d8, d9, [sp, #48]3052stp d10, d11, [sp, #64]3053stp d12, d13, [sp, #80]3054stp d14, d15, [sp, #96]30553056add x4, x0, x1, lsr #3 //end_input_ptr3057ldp x10, x11, [x16] //ctr96_b64, ctr96_t323058#ifdef __AARCH64EB__3059rev x10, x103060rev x11, x113061#endif3062ldp x13, x14, [x8, #192] //load rk123063#ifdef __AARCH64EB__3064ror x13, x13, #323065ror x14, x14, #323066#endif3067ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible30683069ld1 {v18.4s}, [x8], #16 //load rk030703071lsr x5, x1, #3 //byte_len3072mov x15, x53073ld1 {v19.4s}, [x8], #16 //load rk130743075lsr x12, x11, #323076orr w11, w11, w113077fmov d3, x10 //CTR block 330783079rev w12, w12 //rev_ctr323080fmov d1, x10 //CTR block 130813082add w12, w12, #1 //increment rev_ctr323083ld1 {v20.4s}, [x8], #16 //load rk230843085aese v0.16b, v18.16b3086aesmc v0.16b, v0.16b //AES block 0 - round 03087rev w9, w12 //CTR block 130883089add w12, w12, #1 //CTR block 13090orr x9, x11, x9, lsl #32 //CTR block 13091ld1 {v21.4s}, [x8], #16 //load rk330923093fmov v1.d[1], x9 //CTR block 13094rev w9, w12 //CTR block 23095add w12, w12, #1 //CTR block 230963097fmov d2, x10 //CTR block 23098orr x9, 
x11, x9, lsl #32 //CTR block 230993100fmov v2.d[1], x9 //CTR block 23101rev w9, w12 //CTR block 331023103aese v0.16b, v19.16b3104aesmc v0.16b, v0.16b //AES block 0 - round 13105orr x9, x11, x9, lsl #32 //CTR block 331063107fmov v3.d[1], x9 //CTR block 331083109ld1 {v22.4s}, [x8], #16 //load rk431103111aese v0.16b, v20.16b3112aesmc v0.16b, v0.16b //AES block 0 - round 231133114aese v2.16b, v18.16b3115aesmc v2.16b, v2.16b //AES block 2 - round 03116ld1 {v23.4s}, [x8], #16 //load rk531173118aese v1.16b, v18.16b3119aesmc v1.16b, v1.16b //AES block 1 - round 03120ldr q15, [x3, #112] //load h4l | h4h3121#ifndef __AARCH64EB__3122ext v15.16b, v15.16b, v15.16b, #83123#endif3124aese v3.16b, v18.16b3125aesmc v3.16b, v3.16b //AES block 3 - round 03126ldr q13, [x3, #64] //load h2l | h2h3127#ifndef __AARCH64EB__3128ext v13.16b, v13.16b, v13.16b, #83129#endif3130aese v2.16b, v19.16b3131aesmc v2.16b, v2.16b //AES block 2 - round 13132ldr q14, [x3, #80] //load h3l | h3h3133#ifndef __AARCH64EB__3134ext v14.16b, v14.16b, v14.16b, #83135#endif3136aese v1.16b, v19.16b3137aesmc v1.16b, v1.16b //AES block 1 - round 131383139aese v3.16b, v19.16b3140aesmc v3.16b, v3.16b //AES block 3 - round 13141ldr q12, [x3, #32] //load h1l | h1h3142#ifndef __AARCH64EB__3143ext v12.16b, v12.16b, v12.16b, #83144#endif3145aese v2.16b, v20.16b3146aesmc v2.16b, v2.16b //AES block 2 - round 23147ld1 {v24.4s}, [x8], #16 //load rk631483149aese v0.16b, v21.16b3150aesmc v0.16b, v0.16b //AES block 0 - round 33151ld1 {v25.4s}, [x8], #16 //load rk731523153aese v1.16b, v20.16b3154aesmc v1.16b, v1.16b //AES block 1 - round 23155ld1 {v26.4s}, [x8], #16 //load rk831563157aese v3.16b, v20.16b3158aesmc v3.16b, v3.16b //AES block 3 - round 23159ld1 {v27.4s}, [x8], #16 //load rk931603161aese v2.16b, v21.16b3162aesmc v2.16b, v2.16b //AES block 2 - round 33163ld1 { v11.16b}, [x3]3164ext v11.16b, v11.16b, v11.16b, #83165rev64 v11.16b, v11.16b31663167aese v1.16b, v21.16b3168aesmc v1.16b, v1.16b //AES block 1 - round 33169add 
w12, w12, #1 //CTR block 331703171aese v3.16b, v21.16b3172aesmc v3.16b, v3.16b //AES block 3 - round 33173trn1 v9.2d, v14.2d, v15.2d //h4h | h3h31743175aese v0.16b, v22.16b3176aesmc v0.16b, v0.16b //AES block 0 - round 43177ld1 {v28.4s}, [x8], #16 //load rk1031783179aese v1.16b, v22.16b3180aesmc v1.16b, v1.16b //AES block 1 - round 43181trn2 v17.2d, v14.2d, v15.2d //h4l | h3l31823183aese v2.16b, v22.16b3184aesmc v2.16b, v2.16b //AES block 2 - round 431853186aese v3.16b, v22.16b3187aesmc v3.16b, v3.16b //AES block 3 - round 43188trn2 v16.2d, v12.2d, v13.2d //h2l | h1l31893190aese v0.16b, v23.16b3191aesmc v0.16b, v0.16b //AES block 0 - round 53192ld1 {v29.4s}, [x8], #16 //load rk1131933194aese v1.16b, v23.16b3195aesmc v1.16b, v1.16b //AES block 1 - round 531963197aese v2.16b, v23.16b3198aesmc v2.16b, v2.16b //AES block 2 - round 531993200aese v3.16b, v23.16b3201aesmc v3.16b, v3.16b //AES block 3 - round 532023203aese v0.16b, v24.16b3204aesmc v0.16b, v0.16b //AES block 0 - round 632053206aese v2.16b, v24.16b3207aesmc v2.16b, v2.16b //AES block 2 - round 632083209aese v3.16b, v24.16b3210aesmc v3.16b, v3.16b //AES block 3 - round 632113212aese v0.16b, v25.16b3213aesmc v0.16b, v0.16b //AES block 0 - round 732143215aese v2.16b, v25.16b3216aesmc v2.16b, v2.16b //AES block 2 - round 732173218aese v3.16b, v25.16b3219aesmc v3.16b, v3.16b //AES block 3 - round 732203221aese v1.16b, v24.16b3222aesmc v1.16b, v1.16b //AES block 1 - round 632233224aese v2.16b, v26.16b3225aesmc v2.16b, v2.16b //AES block 2 - round 832263227aese v3.16b, v26.16b3228aesmc v3.16b, v3.16b //AES block 3 - round 832293230aese v1.16b, v25.16b3231aesmc v1.16b, v1.16b //AES block 1 - round 732323233aese v2.16b, v27.16b3234aesmc v2.16b, v2.16b //AES block 2 - round 932353236aese v3.16b, v27.16b3237aesmc v3.16b, v3.16b //AES block 3 - round 932383239aese v1.16b, v26.16b3240aesmc v1.16b, v1.16b //AES block 1 - round 83241sub x5, x5, #1 //byte_len - 132423243aese v0.16b, v26.16b3244aesmc v0.16b, v0.16b //AES 
block 0 - round 83245and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)32463247aese v3.16b, v28.16b3248aesmc v3.16b, v3.16b //AES block 3 - round 103249add x5, x5, x032503251aese v1.16b, v27.16b3252aesmc v1.16b, v1.16b //AES block 1 - round 93253cmp x0, x5 //check if we have <= 4 blocks32543255aese v0.16b, v27.16b3256aesmc v0.16b, v0.16b //AES block 0 - round 93257trn1 v8.2d, v12.2d, v13.2d //h2h | h1h32583259aese v3.16b, v29.16b //AES block 3 - round 1132603261aese v2.16b, v28.16b3262aesmc v2.16b, v2.16b //AES block 2 - round 1032633264aese v1.16b, v28.16b3265aesmc v1.16b, v1.16b //AES block 1 - round 1032663267aese v0.16b, v28.16b3268aesmc v0.16b, v0.16b //AES block 0 - round 103269eor v16.16b, v16.16b, v8.16b //h2k | h1k32703271aese v2.16b, v29.16b //AES block 2 - round 1132723273aese v1.16b, v29.16b //AES block 1 - round 113274eor v17.16b, v17.16b, v9.16b //h4k | h3k32753276aese v0.16b, v29.16b //AES block 0 - round 113277b.ge .L192_dec_tail //handle tail32783279ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext32803281eor v1.16b, v5.16b, v1.16b //AES block 1 - result32823283eor v0.16b, v4.16b, v0.16b //AES block 0 - result3284rev w9, w12 //CTR block 43285ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext32863287mov x19, v1.d[0] //AES block 1 - mov low32883289mov x20, v1.d[1] //AES block 1 - mov high32903291mov x6, v0.d[0] //AES block 0 - mov low3292orr x9, x11, x9, lsl #32 //CTR block 43293add w12, w12, #1 //CTR block 432943295mov x7, v0.d[1] //AES block 0 - mov high3296rev64 v4.16b, v4.16b //GHASH block 032973298fmov d0, x10 //CTR block 43299rev64 v5.16b, v5.16b //GHASH block 13300cmp x0, x5 //check if we have <= 8 blocks33013302eor x19, x19, x13 //AES block 1 - round 12 low3303#ifdef __AARCH64EB__3304rev x19, x193305#endif3306fmov v0.d[1], x9 //CTR block 43307rev w9, w12 //CTR block 533083309orr x9, x11, x9, lsl #32 //CTR block 53310fmov d1, x10 //CTR block 
53311eor x20, x20, x14 //AES block 1 - round 12 high3312#ifdef __AARCH64EB__3313rev x20, x203314#endif3315add w12, w12, #1 //CTR block 53316fmov v1.d[1], x9 //CTR block 53317eor x6, x6, x13 //AES block 0 - round 12 low3318#ifdef __AARCH64EB__3319rev x6, x63320#endif3321rev w9, w12 //CTR block 63322eor x7, x7, x14 //AES block 0 - round 12 high3323#ifdef __AARCH64EB__3324rev x7, x73325#endif3326stp x6, x7, [x2], #16 //AES block 0 - store result3327orr x9, x11, x9, lsl #32 //CTR block 633283329stp x19, x20, [x2], #16 //AES block 1 - store result33303331add w12, w12, #1 //CTR block 63332eor v2.16b, v6.16b, v2.16b //AES block 2 - result3333b.ge .L192_dec_prepretail //do prepretail33343335.L192_dec_main_loop: //main loop start3336aese v1.16b, v18.16b3337aesmc v1.16b, v1.16b //AES block 4k+5 - round 03338ext v11.16b, v11.16b, v11.16b, #8 //PRE 033393340pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low3341mov x21, v2.d[0] //AES block 4k+2 - mov low33423343mov x22, v2.d[1] //AES block 4k+2 - mov high3344eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result3345rev64 v7.16b, v7.16b //GHASH block 4k+333463347aese v1.16b, v19.16b3348aesmc v1.16b, v1.16b //AES block 4k+5 - round 13349fmov d2, x10 //CTR block 4k+633503351aese v0.16b, v18.16b3352aesmc v0.16b, v0.16b //AES block 4k+4 - round 03353eor v4.16b, v4.16b, v11.16b //PRE 133543355pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high3356fmov v2.d[1], x9 //CTR block 4k+633573358aese v1.16b, v20.16b3359aesmc v1.16b, v1.16b //AES block 4k+5 - round 23360mov x24, v3.d[1] //AES block 4k+3 - mov high33613362aese v0.16b, v19.16b3363aesmc v0.16b, v0.16b //AES block 4k+4 - round 13364mov x23, v3.d[0] //AES block 4k+3 - mov low33653366pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high3367fmov d3, x10 //CTR block 4k+73368mov d8, v4.d[1] //GHASH block 4k - mid33693370pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low3371mov d10, v17.d[1] //GHASH block 4k - mid3372rev w9, w12 //CTR block 4k+733733374aese v2.16b, v18.16b3375aesmc 
v2.16b, v2.16b //AES block 4k+6 - round 03376orr x9, x11, x9, lsl #32 //CTR block 4k+733773378fmov v3.d[1], x9 //CTR block 4k+73379eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid3380mov d4, v5.d[1] //GHASH block 4k+1 - mid33813382aese v1.16b, v21.16b3383aesmc v1.16b, v1.16b //AES block 4k+5 - round 333843385aese v0.16b, v20.16b3386aesmc v0.16b, v0.16b //AES block 4k+4 - round 23387eor x22, x22, x14 //AES block 4k+2 - round 12 high3388#ifdef __AARCH64EB__3389rev x22, x223390#endif3391aese v2.16b, v19.16b3392aesmc v2.16b, v2.16b //AES block 4k+6 - round 13393eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid33943395pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid33963397aese v3.16b, v18.16b3398aesmc v3.16b, v3.16b //AES block 4k+7 - round 03399rev64 v6.16b, v6.16b //GHASH block 4k+234003401aese v2.16b, v20.16b3402aesmc v2.16b, v2.16b //AES block 4k+6 - round 234033404pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid3405eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low3406eor x21, x21, x13 //AES block 4k+2 - round 12 low3407#ifdef __AARCH64EB__3408rev x21, x213409#endif3410aese v1.16b, v22.16b3411aesmc v1.16b, v1.16b //AES block 4k+5 - round 434123413aese v0.16b, v21.16b3414aesmc v0.16b, v0.16b //AES block 4k+4 - round 334153416eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid3417mov d31, v6.d[1] //GHASH block 4k+2 - mid34183419aese v3.16b, v19.16b3420aesmc v3.16b, v3.16b //AES block 4k+7 - round 13421eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high34223423aese v0.16b, v22.16b3424aesmc v0.16b, v0.16b //AES block 4k+4 - round 434253426pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high3427eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid34283429pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low34303431aese v0.16b, v23.16b3432aesmc v0.16b, v0.16b //AES block 4k+4 - round 534333434eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high3435mov d30, v7.d[1] //GHASH block 4k+3 - mid34363437aese v1.16b, v23.16b3438aesmc v1.16b, v1.16b //AES block 4k+5 - 
round 534393440pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high34413442aese v3.16b, v20.16b3443aesmc v3.16b, v3.16b //AES block 4k+7 - round 23444eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid34453446aese v1.16b, v24.16b3447aesmc v1.16b, v1.16b //AES block 4k+5 - round 634483449aese v0.16b, v24.16b3450aesmc v0.16b, v0.16b //AES block 4k+4 - round 63451ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid34523453aese v3.16b, v21.16b3454aesmc v3.16b, v3.16b //AES block 4k+7 - round 334553456pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid3457eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low34583459aese v0.16b, v25.16b3460aesmc v0.16b, v0.16b //AES block 4k+4 - round 734613462pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid3463eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high34643465aese v1.16b, v25.16b3466aesmc v1.16b, v1.16b //AES block 4k+5 - round 734673468aese v0.16b, v26.16b3469aesmc v0.16b, v0.16b //AES block 4k+4 - round 83470movi v8.8b, #0xc234713472pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low34733474aese v1.16b, v26.16b3475aesmc v1.16b, v1.16b //AES block 4k+5 - round 83476eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid34773478aese v2.16b, v21.16b3479aesmc v2.16b, v2.16b //AES block 4k+6 - round 334803481aese v0.16b, v27.16b3482aesmc v0.16b, v0.16b //AES block 4k+4 - round 93483eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low34843485aese v3.16b, v22.16b3486aesmc v3.16b, v3.16b //AES block 4k+7 - round 434873488aese v2.16b, v22.16b3489aesmc v2.16b, v2.16b //AES block 4k+6 - round 43490eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid34913492aese v0.16b, v28.16b3493aesmc v0.16b, v0.16b //AES block 4k+4 - round 1034943495aese v1.16b, v27.16b3496aesmc v1.16b, v1.16b //AES block 4k+5 - round 93497eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up34983499aese v2.16b, v23.16b3500aesmc v2.16b, v2.16b //AES block 4k+6 - round 535013502aese v3.16b, v23.16b3503aesmc v3.16b, v3.16b //AES block 4k+7 - round 53504shl d8, 
d8, #56 //mod_constant35053506aese v1.16b, v28.16b3507aesmc v1.16b, v1.16b //AES block 4k+5 - round 1035083509aese v2.16b, v24.16b3510aesmc v2.16b, v2.16b //AES block 4k+6 - round 63511ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext35123513aese v3.16b, v24.16b3514aesmc v3.16b, v3.16b //AES block 4k+7 - round 63515eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up35163517pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid3518ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext3519eor x23, x23, x13 //AES block 4k+3 - round 12 low3520#ifdef __AARCH64EB__3521rev x23, x233522#endif3523aese v2.16b, v25.16b3524aesmc v2.16b, v2.16b //AES block 4k+6 - round 73525ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment35263527aese v0.16b, v29.16b //AES block 4k+4 - round 113528add w12, w12, #1 //CTR block 4k+735293530aese v3.16b, v25.16b3531aesmc v3.16b, v3.16b //AES block 4k+7 - round 73532eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid35333534aese v2.16b, v26.16b3535aesmc v2.16b, v2.16b //AES block 4k+6 - round 83536ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext35373538aese v1.16b, v29.16b //AES block 4k+5 - round 113539ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext3540rev w9, w12 //CTR block 4k+835413542aese v3.16b, v26.16b3543aesmc v3.16b, v3.16b //AES block 4k+7 - round 83544stp x21, x22, [x2], #16 //AES block 4k+2 - store result35453546aese v2.16b, v27.16b3547aesmc v2.16b, v2.16b //AES block 4k+6 - round 93548eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid35493550cmp x0, x5 //.LOOP CONTROL35513552eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result3553eor x24, x24, x14 //AES block 4k+3 - round 12 high3554#ifdef __AARCH64EB__3555rev x24, x243556#endif3557eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result35583559aese v2.16b, v28.16b3560aesmc v2.16b, v2.16b //AES block 4k+6 - round 103561orr x9, x11, x9, lsl #32 //CTR block 4k+835623563aese v3.16b, v27.16b3564aesmc v3.16b, v3.16b //AES block 
4k+7 - round 935653566pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low3567mov x19, v1.d[0] //AES block 4k+5 - mov low35683569mov x6, v0.d[0] //AES block 4k+4 - mov low3570stp x23, x24, [x2], #16 //AES block 4k+3 - store result3571rev64 v5.16b, v5.16b //GHASH block 4k+535723573aese v2.16b, v29.16b //AES block 4k+6 - round 113574mov x7, v0.d[1] //AES block 4k+4 - mov high35753576aese v3.16b, v28.16b3577aesmc v3.16b, v3.16b //AES block 4k+7 - round 103578mov x20, v1.d[1] //AES block 4k+5 - mov high35793580fmov d0, x10 //CTR block 4k+83581add w12, w12, #1 //CTR block 4k+83582ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment35833584eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result3585fmov v0.d[1], x9 //CTR block 4k+83586rev w9, w12 //CTR block 4k+935873588eor x6, x6, x13 //AES block 4k+4 - round 12 low3589#ifdef __AARCH64EB__3590rev x6, x63591#endif3592orr x9, x11, x9, lsl #32 //CTR block 4k+93593eor v11.16b, v11.16b, v8.16b //MODULO - fold into low35943595fmov d1, x10 //CTR block 4k+93596add w12, w12, #1 //CTR block 4k+93597eor x19, x19, x13 //AES block 4k+5 - round 12 low3598#ifdef __AARCH64EB__3599rev x19, x193600#endif3601fmov v1.d[1], x9 //CTR block 4k+93602rev w9, w12 //CTR block 4k+103603eor x20, x20, x14 //AES block 4k+5 - round 12 high3604#ifdef __AARCH64EB__3605rev x20, x203606#endif3607eor x7, x7, x14 //AES block 4k+4 - round 12 high3608#ifdef __AARCH64EB__3609rev x7, x73610#endif3611stp x6, x7, [x2], #16 //AES block 4k+4 - store result3612eor v11.16b, v11.16b, v10.16b //MODULO - fold into low36133614add w12, w12, #1 //CTR block 4k+103615rev64 v4.16b, v4.16b //GHASH block 4k+43616orr x9, x11, x9, lsl #32 //CTR block 4k+1036173618aese v3.16b, v29.16b //AES block 4k+7 - round 113619stp x19, x20, [x2], #16 //AES block 4k+5 - store result3620b.lt .L192_dec_main_loop36213622.L192_dec_prepretail: //PREPRETAIL3623mov x22, v2.d[1] //AES block 4k+2 - mov high3624ext v11.16b, v11.16b, v11.16b, #8 //PRE 03625eor v3.16b, v7.16b, v3.16b //AES 
block 4k+3 - result36263627aese v1.16b, v18.16b3628aesmc v1.16b, v1.16b //AES block 4k+5 - round 03629mov x21, v2.d[0] //AES block 4k+2 - mov low36303631aese v0.16b, v18.16b3632aesmc v0.16b, v0.16b //AES block 4k+4 - round 03633mov d10, v17.d[1] //GHASH block 4k - mid36343635eor v4.16b, v4.16b, v11.16b //PRE 13636fmov d2, x10 //CTR block 4k+636373638aese v1.16b, v19.16b3639aesmc v1.16b, v1.16b //AES block 4k+5 - round 13640mov x23, v3.d[0] //AES block 4k+3 - mov low36413642aese v0.16b, v19.16b3643aesmc v0.16b, v0.16b //AES block 4k+4 - round 13644mov x24, v3.d[1] //AES block 4k+3 - mov high36453646pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low3647mov d8, v4.d[1] //GHASH block 4k - mid3648fmov d3, x10 //CTR block 4k+736493650aese v1.16b, v20.16b3651aesmc v1.16b, v1.16b //AES block 4k+5 - round 23652rev64 v6.16b, v6.16b //GHASH block 4k+236533654pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high3655fmov v2.d[1], x9 //CTR block 4k+63656rev w9, w12 //CTR block 4k+736573658orr x9, x11, x9, lsl #32 //CTR block 4k+73659eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid3660mov d4, v5.d[1] //GHASH block 4k+1 - mid36613662pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low3663eor x24, x24, x14 //AES block 4k+3 - round 12 high3664#ifdef __AARCH64EB__3665rev x24, x243666#endif3667fmov v3.d[1], x9 //CTR block 4k+736683669aese v0.16b, v20.16b3670aesmc v0.16b, v0.16b //AES block 4k+4 - round 23671eor x21, x21, x13 //AES block 4k+2 - round 12 low3672#ifdef __AARCH64EB__3673rev x21, x213674#endif3675pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high3676eor x22, x22, x14 //AES block 4k+2 - round 12 high3677#ifdef __AARCH64EB__3678rev x22, x223679#endif3680eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid36813682pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid3683eor x23, x23, x13 //AES block 4k+3 - round 12 low3684#ifdef __AARCH64EB__3685rev x23, x233686#endif3687stp x21, x22, [x2], #16 //AES block 4k+2 - store result36883689rev64 v7.16b, v7.16b //GHASH block 4k+33690stp x23, 
x24, [x2], #16 //AES block 4k+3 - store result36913692aese v3.16b, v18.16b3693aesmc v3.16b, v3.16b //AES block 4k+7 - round 03694eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high36953696pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid3697add w12, w12, #1 //CTR block 4k+736983699pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high3700eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low37013702aese v2.16b, v18.16b3703aesmc v2.16b, v2.16b //AES block 4k+6 - round 037043705eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid3706mov d31, v6.d[1] //GHASH block 4k+2 - mid37073708aese v3.16b, v19.16b3709aesmc v3.16b, v3.16b //AES block 4k+7 - round 137103711aese v2.16b, v19.16b3712aesmc v2.16b, v2.16b //AES block 4k+6 - round 13713eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high37143715eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid37163717pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low37183719aese v2.16b, v20.16b3720aesmc v2.16b, v2.16b //AES block 4k+6 - round 23721mov d30, v7.d[1] //GHASH block 4k+3 - mid37223723aese v3.16b, v20.16b3724aesmc v3.16b, v3.16b //AES block 4k+7 - round 23725ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid37263727pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low37283729aese v0.16b, v21.16b3730aesmc v0.16b, v0.16b //AES block 4k+4 - round 33731eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid37323733aese v1.16b, v21.16b3734aesmc v1.16b, v1.16b //AES block 4k+5 - round 337353736pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid3737eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low37383739aese v0.16b, v22.16b3740aesmc v0.16b, v0.16b //AES block 4k+4 - round 437413742pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high3743movi v8.8b, #0xc237443745pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid37463747aese v2.16b, v21.16b3748aesmc v2.16b, v2.16b //AES block 4k+6 - round 337493750shl d8, d8, #56 //mod_constant3751eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high37523753aese v0.16b, v23.16b3754aesmc v0.16b, 
v0.16b //AES block 4k+4 - round 53755eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid37563757aese v2.16b, v22.16b3758aesmc v2.16b, v2.16b //AES block 4k+6 - round 437593760pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid3761eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low37623763aese v0.16b, v24.16b3764aesmc v0.16b, v0.16b //AES block 4k+4 - round 637653766aese v3.16b, v21.16b3767aesmc v3.16b, v3.16b //AES block 4k+7 - round 33768eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid37693770aese v2.16b, v23.16b3771aesmc v2.16b, v2.16b //AES block 4k+6 - round 537723773aese v0.16b, v25.16b3774aesmc v0.16b, v0.16b //AES block 4k+4 - round 73775eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up37763777aese v3.16b, v22.16b3778aesmc v3.16b, v3.16b //AES block 4k+7 - round 437793780aese v2.16b, v24.16b3781aesmc v2.16b, v2.16b //AES block 4k+6 - round 63782ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment37833784aese v0.16b, v26.16b3785aesmc v0.16b, v0.16b //AES block 4k+4 - round 837863787aese v3.16b, v23.16b3788aesmc v3.16b, v3.16b //AES block 4k+7 - round 53789eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up37903791aese v1.16b, v22.16b3792aesmc v1.16b, v1.16b //AES block 4k+5 - round 437933794aese v2.16b, v25.16b3795aesmc v2.16b, v2.16b //AES block 4k+6 - round 737963797aese v0.16b, v27.16b3798aesmc v0.16b, v0.16b //AES block 4k+4 - round 937993800aese v1.16b, v23.16b3801aesmc v1.16b, v1.16b //AES block 4k+5 - round 538023803aese v3.16b, v24.16b3804aesmc v3.16b, v3.16b //AES block 4k+7 - round 63805eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid38063807aese v0.16b, v28.16b3808aesmc v0.16b, v0.16b //AES block 4k+4 - round 1038093810aese v1.16b, v24.16b3811aesmc v1.16b, v1.16b //AES block 4k+5 - round 638123813aese v3.16b, v25.16b3814aesmc v3.16b, v3.16b //AES block 4k+7 - round 738153816aese v2.16b, v26.16b3817aesmc v2.16b, v2.16b //AES block 4k+6 - round 83818eor v10.16b, v10.16b, v9.16b //MODULO - fold into 
mid38193820aese v1.16b, v25.16b3821aesmc v1.16b, v1.16b //AES block 4k+5 - round 738223823aese v3.16b, v26.16b3824aesmc v3.16b, v3.16b //AES block 4k+7 - round 838253826aese v2.16b, v27.16b3827aesmc v2.16b, v2.16b //AES block 4k+6 - round 938283829aese v1.16b, v26.16b3830aesmc v1.16b, v1.16b //AES block 4k+5 - round 838313832aese v3.16b, v27.16b3833aesmc v3.16b, v3.16b //AES block 4k+7 - round 938343835pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low38363837aese v1.16b, v27.16b3838aesmc v1.16b, v1.16b //AES block 4k+5 - round 938393840aese v2.16b, v28.16b3841aesmc v2.16b, v2.16b //AES block 4k+6 - round 1038423843aese v3.16b, v28.16b3844aesmc v3.16b, v3.16b //AES block 4k+7 - round 103845ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment38463847aese v1.16b, v28.16b3848aesmc v1.16b, v1.16b //AES block 4k+5 - round 1038493850aese v0.16b, v29.16b3851eor v11.16b, v11.16b, v8.16b //MODULO - fold into low38523853aese v2.16b, v29.16b38543855aese v1.16b, v29.16b38563857aese v3.16b, v29.16b38583859eor v11.16b, v11.16b, v10.16b //MODULO - fold into low3860.L192_dec_tail: //TAIL38613862sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process3863ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext38643865eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result38663867mov x7, v0.d[1] //AES block 4k+4 - mov high38683869mov x6, v0.d[0] //AES block 4k+4 - mov low38703871ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag38723873cmp x5, #4838743875eor x7, x7, x14 //AES block 4k+4 - round 12 high3876#ifdef __AARCH64EB__3877rev x7, x73878#endif3879eor x6, x6, x13 //AES block 4k+4 - round 12 low3880#ifdef __AARCH64EB__3881rev x6, x63882#endif3883b.gt .L192_dec_blocks_more_than_338843885movi v11.8b, #03886movi v9.8b, #038873888mov v3.16b, v2.16b3889mov v2.16b, v1.16b3890sub w12, w12, #138913892movi v10.8b, #03893cmp x5, #323894b.gt .L192_dec_blocks_more_than_238953896mov v3.16b, v1.16b3897cmp x5, #163898sub w12, w12, #138993900b.gt 
.L192_dec_blocks_more_than_139013902sub w12, w12, #13903b .L192_dec_blocks_less_than_13904.L192_dec_blocks_more_than_3: //blocks left > 33905rev64 v4.16b, v5.16b //GHASH final-3 block3906ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext39073908stp x6, x7, [x2], #16 //AES final-3 block - store result39093910eor v4.16b, v4.16b, v8.16b //feed in partial tag39113912eor v0.16b, v5.16b, v1.16b //AES final-2 block - result39133914pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low3915mov x6, v0.d[0] //AES final-2 block - mov low3916mov d22, v4.d[1] //GHASH final-3 block - mid39173918mov x7, v0.d[1] //AES final-2 block - mov high39193920mov d10, v17.d[1] //GHASH final-3 block - mid3921eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid39223923pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high39243925eor x6, x6, x13 //AES final-2 block - round 12 low3926#ifdef __AARCH64EB__3927rev x6, x63928#endif3929movi v8.8b, #0 //suppress further partial tag feed in39303931pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid3932eor x7, x7, x14 //AES final-2 block - round 12 high3933#ifdef __AARCH64EB__3934rev x7, x73935#endif3936.L192_dec_blocks_more_than_2: //blocks left > 239373938rev64 v4.16b, v5.16b //GHASH final-2 block3939ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext39403941eor v4.16b, v4.16b, v8.16b //feed in partial tag39423943movi v8.8b, #0 //suppress further partial tag feed in39443945eor v0.16b, v5.16b, v2.16b //AES final-1 block - result39463947mov d22, v4.d[1] //GHASH final-2 block - mid39483949pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low39503951stp x6, x7, [x2], #16 //AES final-2 block - store result39523953eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid3954mov x7, v0.d[1] //AES final-1 block - mov high39553956eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low3957mov x6, v0.d[0] //AES final-1 block - mov low39583959pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high39603961pmull v22.1q, v22.1d, 
v17.1d //GHASH final-2 block - mid39623963eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high3964eor x7, x7, x14 //AES final-1 block - round 12 high3965#ifdef __AARCH64EB__3966rev x7, x73967#endif3968eor x6, x6, x13 //AES final-1 block - round 12 low3969#ifdef __AARCH64EB__3970rev x6, x63971#endif3972eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid3973.L192_dec_blocks_more_than_1: //blocks left > 139743975rev64 v4.16b, v5.16b //GHASH final-1 block39763977eor v4.16b, v4.16b, v8.16b //feed in partial tag3978ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext39793980mov d22, v4.d[1] //GHASH final-1 block - mid39813982pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high39833984eor v0.16b, v5.16b, v3.16b //AES final block - result3985stp x6, x7, [x2], #16 //AES final-1 block - store result39863987eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid39883989eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high39903991pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low3992mov x7, v0.d[1] //AES final block - mov high39933994ins v22.d[1], v22.d[0] //GHASH final-1 block - mid3995mov x6, v0.d[0] //AES final block - mov low39963997pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid39983999movi v8.8b, #0 //suppress further partial tag feed in4000eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low4001eor x7, x7, x14 //AES final block - round 12 high4002#ifdef __AARCH64EB__4003rev x7, x74004#endif4005eor x6, x6, x13 //AES final block - round 12 low4006#ifdef __AARCH64EB__4007rev x6, x64008#endif4009eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid4010.L192_dec_blocks_less_than_1: //blocks left <= 140114012mvn x13, xzr //rk12_l = 0xffffffffffffffff4013ldp x4, x5, [x2] //load existing bytes we need to not overwrite4014and x1, x1, #127 //bit_length %= 12840154016sub x1, x1, #128 //bit_length -= 12840174018neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])40194020and x1, x1, #127 //bit_length %= 1284021mvn x14, xzr 
//rk12_h = 0xffffffffffffffff40224023lsr x14, x14, x1 //rk12_h is mask for top 64b of last block4024cmp x1, #6440254026csel x9, x13, x14, lt4027csel x10, x14, xzr, lt40284029fmov d0, x9 //ctr0b is mask for last block4030and x6, x6, x94031bic x4, x4, x9 //mask out low existing bytes40324033orr x6, x6, x44034mov v0.d[1], x104035#ifndef __AARCH64EB__4036rev w9, w124037#else4038mov w9, w124039#endif40404041and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits4042str w9, [x16, #12] //store the updated counter40434044rev64 v4.16b, v5.16b //GHASH final block40454046eor v4.16b, v4.16b, v8.16b //feed in partial tag4047bic x5, x5, x10 //mask out high existing bytes40484049and x7, x7, x1040504051pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high4052mov d8, v4.d[1] //GHASH final block - mid40534054pmull v21.1q, v4.1d, v12.1d //GHASH final block - low40554056eor v8.8b, v8.8b, v4.8b //GHASH final block - mid40574058eor v9.16b, v9.16b, v20.16b //GHASH final block - high40594060pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid40614062eor v11.16b, v11.16b, v21.16b //GHASH final block - low40634064eor v10.16b, v10.16b, v8.16b //GHASH final block - mid4065movi v8.8b, #0xc240664067eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up40684069shl d8, d8, #56 //mod_constant40704071eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up40724073pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid4074orr x7, x7, x54075stp x6, x7, [x2]40764077ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment40784079eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid40804081eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid40824083pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low40844085eor v11.16b, v11.16b, v8.16b //MODULO - fold into low40864087ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment40884089eor v11.16b, v11.16b, v10.16b //MODULO - fold into low4090ext v11.16b, v11.16b, v11.16b, #84091rev64 v11.16b, 
v11.16b4092mov x0, x154093st1 { v11.16b }, [x3]40944095ldp x21, x22, [sp, #16]4096ldp x23, x24, [sp, #32]4097ldp d8, d9, [sp, #48]4098ldp d10, d11, [sp, #64]4099ldp d12, d13, [sp, #80]4100ldp d14, d15, [sp, #96]4101ldp x19, x20, [sp], #1124102ret41034104.L192_dec_ret:4105mov w0, #0x04106ret4107.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel4108.globl aes_gcm_enc_256_kernel4109.type aes_gcm_enc_256_kernel,%function4110.align 44111aes_gcm_enc_256_kernel:4112AARCH64_VALID_CALL_TARGET4113cbz x1, .L256_enc_ret4114stp x19, x20, [sp, #-112]!4115mov x16, x44116mov x8, x54117stp x21, x22, [sp, #16]4118stp x23, x24, [sp, #32]4119stp d8, d9, [sp, #48]4120stp d10, d11, [sp, #64]4121stp d12, d13, [sp, #80]4122stp d14, d15, [sp, #96]41234124add x4, x0, x1, lsr #3 //end_input_ptr4125lsr x5, x1, #3 //byte_len4126mov x15, x54127ldp x10, x11, [x16] //ctr96_b64, ctr96_t324128#ifdef __AARCH64EB__4129rev x10, x104130rev x11, x114131#endif4132ldp x13, x14, [x8, #224] //load rk144133#ifdef __AARCH64EB__4134ror x13, x13, #324135ror x14, x14, #324136#endif4137ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible4138sub x5, x5, #1 //byte_len - 141394140ld1 {v18.4s}, [x8], #16 //load rk04141and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)41424143ld1 {v19.4s}, [x8], #16 //load rk14144add x5, x5, x041454146lsr x12, x11, #324147fmov d2, x10 //CTR block 24148orr w11, w11, w1141494150rev w12, w12 //rev_ctr324151cmp x0, x5 //check if we have <= 4 blocks4152fmov d1, x10 //CTR block 141534154aese v0.16b, v18.16b4155aesmc v0.16b, v0.16b //AES block 0 - round 04156add w12, w12, #1 //increment rev_ctr3241574158rev w9, w12 //CTR block 14159fmov d3, x10 //CTR block 341604161orr x9, x11, x9, lsl #32 //CTR block 14162add w12, w12, #1 //CTR block 14163ld1 {v20.4s}, [x8], #16 //load rk241644165fmov v1.d[1], x9 //CTR block 14166rev w9, w12 //CTR block 24167add w12, 
w12, #1 //CTR block 241684169orr x9, x11, x9, lsl #32 //CTR block 24170ld1 {v21.4s}, [x8], #16 //load rk341714172fmov v2.d[1], x9 //CTR block 24173rev w9, w12 //CTR block 341744175aese v0.16b, v19.16b4176aesmc v0.16b, v0.16b //AES block 0 - round 14177orr x9, x11, x9, lsl #32 //CTR block 341784179fmov v3.d[1], x9 //CTR block 341804181aese v1.16b, v18.16b4182aesmc v1.16b, v1.16b //AES block 1 - round 04183ld1 {v22.4s}, [x8], #16 //load rk441844185aese v0.16b, v20.16b4186aesmc v0.16b, v0.16b //AES block 0 - round 24187ld1 {v23.4s}, [x8], #16 //load rk541884189aese v2.16b, v18.16b4190aesmc v2.16b, v2.16b //AES block 2 - round 04191ld1 {v24.4s}, [x8], #16 //load rk641924193aese v1.16b, v19.16b4194aesmc v1.16b, v1.16b //AES block 1 - round 14195ldr q14, [x3, #80] //load h3l | h3h4196#ifndef __AARCH64EB__4197ext v14.16b, v14.16b, v14.16b, #84198#endif4199aese v3.16b, v18.16b4200aesmc v3.16b, v3.16b //AES block 3 - round 04201ld1 {v25.4s}, [x8], #16 //load rk742024203aese v2.16b, v19.16b4204aesmc v2.16b, v2.16b //AES block 2 - round 14205ld1 {v26.4s}, [x8], #16 //load rk842064207aese v1.16b, v20.16b4208aesmc v1.16b, v1.16b //AES block 1 - round 24209ldr q13, [x3, #64] //load h2l | h2h4210#ifndef __AARCH64EB__4211ext v13.16b, v13.16b, v13.16b, #84212#endif4213aese v3.16b, v19.16b4214aesmc v3.16b, v3.16b //AES block 3 - round 14215ld1 {v27.4s}, [x8], #16 //load rk942164217aese v2.16b, v20.16b4218aesmc v2.16b, v2.16b //AES block 2 - round 24219ldr q15, [x3, #112] //load h4l | h4h4220#ifndef __AARCH64EB__4221ext v15.16b, v15.16b, v15.16b, #84222#endif4223aese v1.16b, v21.16b4224aesmc v1.16b, v1.16b //AES block 1 - round 34225ld1 {v28.4s}, [x8], #16 //load rk1042264227aese v3.16b, v20.16b4228aesmc v3.16b, v3.16b //AES block 3 - round 24229ld1 {v29.4s}, [x8], #16 //load rk1142304231aese v2.16b, v21.16b4232aesmc v2.16b, v2.16b //AES block 2 - round 34233add w12, w12, #1 //CTR block 342344235aese v0.16b, v21.16b4236aesmc v0.16b, v0.16b //AES block 0 - round 342374238aese v3.16b, 
v21.16b4239aesmc v3.16b, v3.16b //AES block 3 - round 34240ld1 { v11.16b}, [x3]4241ext v11.16b, v11.16b, v11.16b, #84242rev64 v11.16b, v11.16b42434244aese v2.16b, v22.16b4245aesmc v2.16b, v2.16b //AES block 2 - round 442464247aese v0.16b, v22.16b4248aesmc v0.16b, v0.16b //AES block 0 - round 442494250aese v1.16b, v22.16b4251aesmc v1.16b, v1.16b //AES block 1 - round 442524253aese v3.16b, v22.16b4254aesmc v3.16b, v3.16b //AES block 3 - round 442554256aese v0.16b, v23.16b4257aesmc v0.16b, v0.16b //AES block 0 - round 542584259aese v1.16b, v23.16b4260aesmc v1.16b, v1.16b //AES block 1 - round 542614262aese v3.16b, v23.16b4263aesmc v3.16b, v3.16b //AES block 3 - round 542644265aese v2.16b, v23.16b4266aesmc v2.16b, v2.16b //AES block 2 - round 542674268aese v1.16b, v24.16b4269aesmc v1.16b, v1.16b //AES block 1 - round 64270trn2 v17.2d, v14.2d, v15.2d //h4l | h3l42714272aese v3.16b, v24.16b4273aesmc v3.16b, v3.16b //AES block 3 - round 64274ld1 {v30.4s}, [x8], #16 //load rk1242754276aese v0.16b, v24.16b4277aesmc v0.16b, v0.16b //AES block 0 - round 64278ldr q12, [x3, #32] //load h1l | h1h4279#ifndef __AARCH64EB__4280ext v12.16b, v12.16b, v12.16b, #84281#endif4282aese v2.16b, v24.16b4283aesmc v2.16b, v2.16b //AES block 2 - round 64284ld1 {v31.4s}, [x8], #16 //load rk1342854286aese v1.16b, v25.16b4287aesmc v1.16b, v1.16b //AES block 1 - round 74288trn1 v9.2d, v14.2d, v15.2d //h4h | h3h42894290aese v0.16b, v25.16b4291aesmc v0.16b, v0.16b //AES block 0 - round 742924293aese v2.16b, v25.16b4294aesmc v2.16b, v2.16b //AES block 2 - round 742954296aese v3.16b, v25.16b4297aesmc v3.16b, v3.16b //AES block 3 - round 74298trn2 v16.2d, v12.2d, v13.2d //h2l | h1l42994300aese v1.16b, v26.16b4301aesmc v1.16b, v1.16b //AES block 1 - round 843024303aese v2.16b, v26.16b4304aesmc v2.16b, v2.16b //AES block 2 - round 843054306aese v3.16b, v26.16b4307aesmc v3.16b, v3.16b //AES block 3 - round 843084309aese v1.16b, v27.16b4310aesmc v1.16b, v1.16b //AES block 1 - round 943114312aese v2.16b, 
v27.16b4313aesmc v2.16b, v2.16b //AES block 2 - round 943144315aese v0.16b, v26.16b4316aesmc v0.16b, v0.16b //AES block 0 - round 843174318aese v1.16b, v28.16b4319aesmc v1.16b, v1.16b //AES block 1 - round 1043204321aese v3.16b, v27.16b4322aesmc v3.16b, v3.16b //AES block 3 - round 943234324aese v0.16b, v27.16b4325aesmc v0.16b, v0.16b //AES block 0 - round 943264327aese v2.16b, v28.16b4328aesmc v2.16b, v2.16b //AES block 2 - round 1043294330aese v3.16b, v28.16b4331aesmc v3.16b, v3.16b //AES block 3 - round 1043324333aese v1.16b, v29.16b4334aesmc v1.16b, v1.16b //AES block 1 - round 1143354336aese v2.16b, v29.16b4337aesmc v2.16b, v2.16b //AES block 2 - round 1143384339aese v0.16b, v28.16b4340aesmc v0.16b, v0.16b //AES block 0 - round 1043414342aese v1.16b, v30.16b4343aesmc v1.16b, v1.16b //AES block 1 - round 1243444345aese v2.16b, v30.16b4346aesmc v2.16b, v2.16b //AES block 2 - round 1243474348aese v0.16b, v29.16b4349aesmc v0.16b, v0.16b //AES block 0 - round 114350eor v17.16b, v17.16b, v9.16b //h4k | h3k43514352aese v3.16b, v29.16b4353aesmc v3.16b, v3.16b //AES block 3 - round 1143544355aese v2.16b, v31.16b //AES block 2 - round 134356trn1 v8.2d, v12.2d, v13.2d //h2h | h1h43574358aese v0.16b, v30.16b4359aesmc v0.16b, v0.16b //AES block 0 - round 1243604361aese v3.16b, v30.16b4362aesmc v3.16b, v3.16b //AES block 3 - round 1243634364aese v1.16b, v31.16b //AES block 1 - round 1343654366aese v0.16b, v31.16b //AES block 0 - round 1343674368aese v3.16b, v31.16b //AES block 3 - round 134369eor v16.16b, v16.16b, v8.16b //h2k | h1k4370b.ge .L256_enc_tail //handle tail43714372ldp x19, x20, [x0, #16] //AES block 1 - load plaintext4373#ifdef __AARCH64EB__4374rev x19, x194375rev x20, x204376#endif4377rev w9, w12 //CTR block 44378ldp x6, x7, [x0, #0] //AES block 0 - load plaintext4379#ifdef __AARCH64EB__4380rev x6, x64381rev x7, x74382#endif4383ldp x23, x24, [x0, #48] //AES block 3 - load plaintext4384#ifdef __AARCH64EB__4385rev x23, x234386rev x24, x244387#endif4388ldp x21, 
x22, [x0, #32] //AES block 2 - load plaintext4389#ifdef __AARCH64EB__4390rev x21, x214391rev x22, x224392#endif4393add x0, x0, #64 //AES input_ptr update43944395eor x19, x19, x13 //AES block 1 - round 14 low4396eor x20, x20, x14 //AES block 1 - round 14 high43974398fmov d5, x19 //AES block 1 - mov low4399eor x6, x6, x13 //AES block 0 - round 14 low44004401eor x7, x7, x14 //AES block 0 - round 14 high4402eor x24, x24, x14 //AES block 3 - round 14 high4403fmov d4, x6 //AES block 0 - mov low44044405cmp x0, x5 //check if we have <= 8 blocks4406fmov v4.d[1], x7 //AES block 0 - mov high4407eor x23, x23, x13 //AES block 3 - round 14 low44084409eor x21, x21, x13 //AES block 2 - round 14 low4410fmov v5.d[1], x20 //AES block 1 - mov high44114412fmov d6, x21 //AES block 2 - mov low4413add w12, w12, #1 //CTR block 444144415orr x9, x11, x9, lsl #32 //CTR block 44416fmov d7, x23 //AES block 3 - mov low4417eor x22, x22, x14 //AES block 2 - round 14 high44184419fmov v6.d[1], x22 //AES block 2 - mov high44204421eor v4.16b, v4.16b, v0.16b //AES block 0 - result4422fmov d0, x10 //CTR block 444234424fmov v0.d[1], x9 //CTR block 44425rev w9, w12 //CTR block 54426add w12, w12, #1 //CTR block 544274428eor v5.16b, v5.16b, v1.16b //AES block 1 - result4429fmov d1, x10 //CTR block 54430orr x9, x11, x9, lsl #32 //CTR block 544314432fmov v1.d[1], x9 //CTR block 54433rev w9, w12 //CTR block 64434st1 { v4.16b}, [x2], #16 //AES block 0 - store result44354436fmov v7.d[1], x24 //AES block 3 - mov high4437orr x9, x11, x9, lsl #32 //CTR block 64438eor v6.16b, v6.16b, v2.16b //AES block 2 - result44394440st1 { v5.16b}, [x2], #16 //AES block 1 - store result44414442add w12, w12, #1 //CTR block 64443fmov d2, x10 //CTR block 644444445fmov v2.d[1], x9 //CTR block 64446st1 { v6.16b}, [x2], #16 //AES block 2 - store result4447rev w9, w12 //CTR block 744484449orr x9, x11, x9, lsl #32 //CTR block 744504451eor v7.16b, v7.16b, v3.16b //AES block 3 - result4452st1 { v7.16b}, [x2], #16 //AES block 3 - store 
result4453b.ge .L256_enc_prepretail //do prepretail44544455.L256_enc_main_loop: //main loop start4456aese v0.16b, v18.16b4457aesmc v0.16b, v0.16b //AES block 4k+4 - round 04458rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)44594460aese v1.16b, v18.16b4461aesmc v1.16b, v1.16b //AES block 4k+5 - round 04462fmov d3, x10 //CTR block 4k+344634464aese v2.16b, v18.16b4465aesmc v2.16b, v2.16b //AES block 4k+6 - round 04466ext v11.16b, v11.16b, v11.16b, #8 //PRE 044674468aese v0.16b, v19.16b4469aesmc v0.16b, v0.16b //AES block 4k+4 - round 14470fmov v3.d[1], x9 //CTR block 4k+344714472aese v1.16b, v19.16b4473aesmc v1.16b, v1.16b //AES block 4k+5 - round 14474ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext4475#ifdef __AARCH64EB__4476rev x23, x234477rev x24, x244478#endif4479aese v2.16b, v19.16b4480aesmc v2.16b, v2.16b //AES block 4k+6 - round 14481ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext4482#ifdef __AARCH64EB__4483rev x21, x214484rev x22, x224485#endif4486aese v0.16b, v20.16b4487aesmc v0.16b, v0.16b //AES block 4k+4 - round 24488eor v4.16b, v4.16b, v11.16b //PRE 144894490aese v1.16b, v20.16b4491aesmc v1.16b, v1.16b //AES block 4k+5 - round 244924493aese v3.16b, v18.16b4494aesmc v3.16b, v3.16b //AES block 4k+7 - round 04495eor x23, x23, x13 //AES block 4k+7 - round 14 low44964497aese v0.16b, v21.16b4498aesmc v0.16b, v0.16b //AES block 4k+4 - round 34499mov d10, v17.d[1] //GHASH block 4k - mid45004501pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high4502eor x22, x22, x14 //AES block 4k+6 - round 14 high4503mov d8, v4.d[1] //GHASH block 4k - mid45044505aese v3.16b, v19.16b4506aesmc v3.16b, v3.16b //AES block 4k+7 - round 14507rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)45084509aese v0.16b, v22.16b4510aesmc v0.16b, v0.16b //AES block 4k+4 - round 445114512pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low4513eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid45144515aese v2.16b, v20.16b4516aesmc v2.16b, v2.16b //AES block 4k+6 - round 
245174518aese v0.16b, v23.16b4519aesmc v0.16b, v0.16b //AES block 4k+4 - round 54520rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)45214522pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high45234524pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid4525rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)45264527pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low45284529eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high4530mov d4, v5.d[1] //GHASH block 4k+1 - mid45314532aese v1.16b, v21.16b4533aesmc v1.16b, v1.16b //AES block 4k+5 - round 345344535aese v3.16b, v20.16b4536aesmc v3.16b, v3.16b //AES block 4k+7 - round 24537eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low45384539aese v2.16b, v21.16b4540aesmc v2.16b, v2.16b //AES block 4k+6 - round 345414542aese v1.16b, v22.16b4543aesmc v1.16b, v1.16b //AES block 4k+5 - round 44544mov d8, v6.d[1] //GHASH block 4k+2 - mid45454546aese v3.16b, v21.16b4547aesmc v3.16b, v3.16b //AES block 4k+7 - round 34548eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid45494550aese v2.16b, v22.16b4551aesmc v2.16b, v2.16b //AES block 4k+6 - round 445524553aese v0.16b, v24.16b4554aesmc v0.16b, v0.16b //AES block 4k+4 - round 64555eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid45564557aese v3.16b, v22.16b4558aesmc v3.16b, v3.16b //AES block 4k+7 - round 445594560pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid45614562aese v0.16b, v25.16b4563aesmc v0.16b, v0.16b //AES block 4k+4 - round 745644565aese v3.16b, v23.16b4566aesmc v3.16b, v3.16b //AES block 4k+7 - round 54567ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid45684569aese v1.16b, v23.16b4570aesmc v1.16b, v1.16b //AES block 4k+5 - round 545714572aese v0.16b, v26.16b4573aesmc v0.16b, v0.16b //AES block 4k+4 - round 845744575aese v2.16b, v23.16b4576aesmc v2.16b, v2.16b //AES block 4k+6 - round 545774578aese v1.16b, v24.16b4579aesmc v1.16b, v1.16b //AES block 4k+5 - round 64580eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid45814582pmull2 v4.1q, v6.2d, 
v13.2d //GHASH block 4k+2 - high45834584pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low45854586aese v1.16b, v25.16b4587aesmc v1.16b, v1.16b //AES block 4k+5 - round 745884589pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low4590eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high45914592aese v3.16b, v24.16b4593aesmc v3.16b, v3.16b //AES block 4k+7 - round 64594ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext4595#ifdef __AARCH64EB__4596rev x19, x194597rev x20, x204598#endif4599aese v1.16b, v26.16b4600aesmc v1.16b, v1.16b //AES block 4k+5 - round 84601mov d4, v7.d[1] //GHASH block 4k+3 - mid46024603aese v2.16b, v24.16b4604aesmc v2.16b, v2.16b //AES block 4k+6 - round 64605eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low46064607pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid46084609pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high4610eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid46114612aese v2.16b, v25.16b4613aesmc v2.16b, v2.16b //AES block 4k+6 - round 74614eor x19, x19, x13 //AES block 4k+5 - round 14 low46154616aese v1.16b, v27.16b4617aesmc v1.16b, v1.16b //AES block 4k+5 - round 94618eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid46194620aese v3.16b, v25.16b4621aesmc v3.16b, v3.16b //AES block 4k+7 - round 74622eor x21, x21, x13 //AES block 4k+6 - round 14 low46234624aese v0.16b, v27.16b4625aesmc v0.16b, v0.16b //AES block 4k+4 - round 94626movi v8.8b, #0xc246274628pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid4629eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high4630fmov d5, x19 //AES block 4k+5 - mov low46314632aese v2.16b, v26.16b4633aesmc v2.16b, v2.16b //AES block 4k+6 - round 84634ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext4635#ifdef __AARCH64EB__4636rev x6, x64637rev x7, x74638#endif4639aese v0.16b, v28.16b4640aesmc v0.16b, v0.16b //AES block 4k+4 - round 104641shl d8, d8, #56 //mod_constant46424643aese v3.16b, v26.16b4644aesmc v3.16b, v3.16b //AES block 4k+7 - round 84645eor v11.16b, v11.16b, v6.16b 
//GHASH block 4k+3 - low46464647aese v2.16b, v27.16b4648aesmc v2.16b, v2.16b //AES block 4k+6 - round 946494650aese v1.16b, v28.16b4651aesmc v1.16b, v1.16b //AES block 4k+5 - round 104652eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid46534654aese v3.16b, v27.16b4655aesmc v3.16b, v3.16b //AES block 4k+7 - round 94656add w12, w12, #1 //CTR block 4k+346574658aese v0.16b, v29.16b4659aesmc v0.16b, v0.16b //AES block 4k+4 - round 114660eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up46614662aese v1.16b, v29.16b4663aesmc v1.16b, v1.16b //AES block 4k+5 - round 114664add x0, x0, #64 //AES input_ptr update46654666pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid4667rev w9, w12 //CTR block 4k+84668ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment46694670aese v2.16b, v28.16b4671aesmc v2.16b, v2.16b //AES block 4k+6 - round 104672eor x6, x6, x13 //AES block 4k+4 - round 14 low46734674aese v1.16b, v30.16b4675aesmc v1.16b, v1.16b //AES block 4k+5 - round 124676eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up46774678aese v3.16b, v28.16b4679aesmc v3.16b, v3.16b //AES block 4k+7 - round 104680eor x7, x7, x14 //AES block 4k+4 - round 14 high46814682fmov d4, x6 //AES block 4k+4 - mov low4683orr x9, x11, x9, lsl #32 //CTR block 4k+84684eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid46854686aese v0.16b, v30.16b4687aesmc v0.16b, v0.16b //AES block 4k+4 - round 124688eor x20, x20, x14 //AES block 4k+5 - round 14 high46894690aese v2.16b, v29.16b4691aesmc v2.16b, v2.16b //AES block 4k+6 - round 114692eor x24, x24, x14 //AES block 4k+7 - round 14 high46934694aese v3.16b, v29.16b4695aesmc v3.16b, v3.16b //AES block 4k+7 - round 114696add w12, w12, #1 //CTR block 4k+846974698aese v0.16b, v31.16b //AES block 4k+4 - round 134699fmov v4.d[1], x7 //AES block 4k+4 - mov high4700eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid47014702aese v2.16b, v30.16b4703aesmc v2.16b, v2.16b //AES block 4k+6 - round 124704fmov d7, x23 //AES block 4k+7 - mov 
low47054706aese v1.16b, v31.16b //AES block 4k+5 - round 134707fmov v5.d[1], x20 //AES block 4k+5 - mov high47084709fmov d6, x21 //AES block 4k+6 - mov low4710cmp x0, x5 //.LOOP CONTROL47114712fmov v6.d[1], x22 //AES block 4k+6 - mov high47134714pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low4715eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result4716fmov d0, x10 //CTR block 4k+847174718fmov v0.d[1], x9 //CTR block 4k+84719rev w9, w12 //CTR block 4k+94720add w12, w12, #1 //CTR block 4k+947214722eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result4723fmov d1, x10 //CTR block 4k+94724orr x9, x11, x9, lsl #32 //CTR block 4k+947254726aese v3.16b, v30.16b4727aesmc v3.16b, v3.16b //AES block 4k+7 - round 124728fmov v1.d[1], x9 //CTR block 4k+947294730aese v2.16b, v31.16b //AES block 4k+6 - round 134731rev w9, w12 //CTR block 4k+104732st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result47334734orr x9, x11, x9, lsl #32 //CTR block 4k+104735eor v11.16b, v11.16b, v9.16b //MODULO - fold into low4736fmov v7.d[1], x24 //AES block 4k+7 - mov high47374738ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment4739st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result4740add w12, w12, #1 //CTR block 4k+1047414742aese v3.16b, v31.16b //AES block 4k+7 - round 134743eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result4744fmov d2, x10 //CTR block 4k+1047454746st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result4747fmov v2.d[1], x9 //CTR block 4k+104748rev w9, w12 //CTR block 4k+1147494750eor v11.16b, v11.16b, v10.16b //MODULO - fold into low4751orr x9, x11, x9, lsl #32 //CTR block 4k+1147524753eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result4754st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result4755b.lt .L256_enc_main_loop47564757.L256_enc_prepretail: //PREPRETAIL4758aese v1.16b, v18.16b4759aesmc v1.16b, v1.16b //AES block 4k+5 - round 04760rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)47614762aese v2.16b, v18.16b4763aesmc v2.16b, 
v2.16b //AES block 4k+6 - round 04764fmov d3, x10 //CTR block 4k+347654766aese v0.16b, v18.16b4767aesmc v0.16b, v0.16b //AES block 4k+4 - round 04768rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)47694770fmov v3.d[1], x9 //CTR block 4k+34771ext v11.16b, v11.16b, v11.16b, #8 //PRE 047724773aese v2.16b, v19.16b4774aesmc v2.16b, v2.16b //AES block 4k+6 - round 147754776aese v0.16b, v19.16b4777aesmc v0.16b, v0.16b //AES block 4k+4 - round 147784779eor v4.16b, v4.16b, v11.16b //PRE 14780rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)47814782aese v2.16b, v20.16b4783aesmc v2.16b, v2.16b //AES block 4k+6 - round 247844785aese v3.16b, v18.16b4786aesmc v3.16b, v3.16b //AES block 4k+7 - round 04787mov d10, v17.d[1] //GHASH block 4k - mid47884789aese v1.16b, v19.16b4790aesmc v1.16b, v1.16b //AES block 4k+5 - round 147914792pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low4793mov d8, v4.d[1] //GHASH block 4k - mid47944795pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high47964797aese v2.16b, v21.16b4798aesmc v2.16b, v2.16b //AES block 4k+6 - round 347994800aese v1.16b, v20.16b4801aesmc v1.16b, v1.16b //AES block 4k+5 - round 24802eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid48034804aese v0.16b, v20.16b4805aesmc v0.16b, v0.16b //AES block 4k+4 - round 248064807aese v3.16b, v19.16b4808aesmc v3.16b, v3.16b //AES block 4k+7 - round 148094810aese v1.16b, v21.16b4811aesmc v1.16b, v1.16b //AES block 4k+5 - round 348124813pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid48144815pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high48164817pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low48184819aese v3.16b, v20.16b4820aesmc v3.16b, v3.16b //AES block 4k+7 - round 248214822eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high4823mov d4, v5.d[1] //GHASH block 4k+1 - mid48244825aese v0.16b, v21.16b4826aesmc v0.16b, v0.16b //AES block 4k+4 - round 34827eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low48284829aese v3.16b, v21.16b4830aesmc v3.16b, v3.16b //AES block 
4k+7 - round 348314832eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid4833mov d8, v6.d[1] //GHASH block 4k+2 - mid48344835aese v0.16b, v22.16b4836aesmc v0.16b, v0.16b //AES block 4k+4 - round 44837rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)48384839aese v3.16b, v22.16b4840aesmc v3.16b, v3.16b //AES block 4k+7 - round 448414842pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid4843eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid4844add w12, w12, #1 //CTR block 4k+348454846pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low48474848aese v3.16b, v23.16b4849aesmc v3.16b, v3.16b //AES block 4k+7 - round 548504851aese v2.16b, v22.16b4852aesmc v2.16b, v2.16b //AES block 4k+6 - round 44853eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid48544855pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high48564857eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low4858ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid48594860aese v2.16b, v23.16b4861aesmc v2.16b, v2.16b //AES block 4k+6 - round 548624863eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high4864mov d4, v7.d[1] //GHASH block 4k+3 - mid48654866aese v1.16b, v22.16b4867aesmc v1.16b, v1.16b //AES block 4k+5 - round 448684869pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid48704871eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid48724873pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high48744875aese v1.16b, v23.16b4876aesmc v1.16b, v1.16b //AES block 4k+5 - round 548774878pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid4879eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid48804881aese v0.16b, v23.16b4882aesmc v0.16b, v0.16b //AES block 4k+4 - round 548834884aese v1.16b, v24.16b4885aesmc v1.16b, v1.16b //AES block 4k+5 - round 648864887aese v2.16b, v24.16b4888aesmc v2.16b, v2.16b //AES block 4k+6 - round 648894890aese v0.16b, v24.16b4891aesmc v0.16b, v0.16b //AES block 4k+4 - round 64892movi v8.8b, #0xc248934894aese v3.16b, v24.16b4895aesmc v3.16b, v3.16b //AES block 4k+7 - round 648964897aese 
v1.16b, v25.16b4898aesmc v1.16b, v1.16b //AES block 4k+5 - round 74899eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high49004901aese v0.16b, v25.16b4902aesmc v0.16b, v0.16b //AES block 4k+4 - round 749034904aese v3.16b, v25.16b4905aesmc v3.16b, v3.16b //AES block 4k+7 - round 74906shl d8, d8, #56 //mod_constant49074908aese v1.16b, v26.16b4909aesmc v1.16b, v1.16b //AES block 4k+5 - round 84910eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid49114912pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low49134914aese v3.16b, v26.16b4915aesmc v3.16b, v3.16b //AES block 4k+7 - round 849164917aese v1.16b, v27.16b4918aesmc v1.16b, v1.16b //AES block 4k+5 - round 949194920aese v0.16b, v26.16b4921aesmc v0.16b, v0.16b //AES block 4k+4 - round 84922eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low49234924aese v3.16b, v27.16b4925aesmc v3.16b, v3.16b //AES block 4k+7 - round 949264927eor v10.16b, v10.16b, v9.16b //karatsuba tidy up49284929pmull v4.1q, v9.1d, v8.1d4930ext v9.16b, v9.16b, v9.16b, #849314932aese v3.16b, v28.16b4933aesmc v3.16b, v3.16b //AES block 4k+7 - round 1049344935aese v2.16b, v25.16b4936aesmc v2.16b, v2.16b //AES block 4k+6 - round 74937eor v10.16b, v10.16b, v11.16b49384939aese v1.16b, v28.16b4940aesmc v1.16b, v1.16b //AES block 4k+5 - round 1049414942aese v0.16b, v27.16b4943aesmc v0.16b, v0.16b //AES block 4k+4 - round 949444945aese v2.16b, v26.16b4946aesmc v2.16b, v2.16b //AES block 4k+6 - round 849474948aese v1.16b, v29.16b4949aesmc v1.16b, v1.16b //AES block 4k+5 - round 114950eor v10.16b, v10.16b, v4.16b49514952aese v0.16b, v28.16b4953aesmc v0.16b, v0.16b //AES block 4k+4 - round 1049544955aese v2.16b, v27.16b4956aesmc v2.16b, v2.16b //AES block 4k+6 - round 949574958aese v1.16b, v30.16b4959aesmc v1.16b, v1.16b //AES block 4k+5 - round 1249604961aese v0.16b, v29.16b4962aesmc v0.16b, v0.16b //AES block 4k+4 - round 114963eor v10.16b, v10.16b, v9.16b49644965aese v3.16b, v29.16b4966aesmc v3.16b, v3.16b //AES block 4k+7 - round 1149674968aese v2.16b, 
v28.16b4969aesmc v2.16b, v2.16b //AES block 4k+6 - round 1049704971aese v0.16b, v30.16b4972aesmc v0.16b, v0.16b //AES block 4k+4 - round 1249734974pmull v4.1q, v10.1d, v8.1d49754976aese v2.16b, v29.16b4977aesmc v2.16b, v2.16b //AES block 4k+6 - round 114978ext v10.16b, v10.16b, v10.16b, #849794980aese v3.16b, v30.16b4981aesmc v3.16b, v3.16b //AES block 4k+7 - round 1249824983aese v1.16b, v31.16b //AES block 4k+5 - round 134984eor v11.16b, v11.16b, v4.16b49854986aese v2.16b, v30.16b4987aesmc v2.16b, v2.16b //AES block 4k+6 - round 1249884989aese v3.16b, v31.16b //AES block 4k+7 - round 1349904991aese v0.16b, v31.16b //AES block 4k+4 - round 1349924993aese v2.16b, v31.16b //AES block 4k+6 - round 134994eor v11.16b, v11.16b, v10.16b4995.L256_enc_tail: //TAIL49964997ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag4998sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process4999ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext5000#ifdef __AARCH64EB__5001rev x6, x65002rev x7, x75003#endif5004eor x6, x6, x13 //AES block 4k+4 - round 14 low5005eor x7, x7, x14 //AES block 4k+4 - round 14 high50065007cmp x5, #485008fmov d4, x6 //AES block 4k+4 - mov low50095010fmov v4.d[1], x7 //AES block 4k+4 - mov high50115012eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result5013b.gt .L256_enc_blocks_more_than_350145015cmp x5, #325016mov v3.16b, v2.16b5017movi v11.8b, #050185019movi v9.8b, #05020sub w12, w12, #150215022mov v2.16b, v1.16b5023movi v10.8b, #05024b.gt .L256_enc_blocks_more_than_250255026mov v3.16b, v1.16b5027sub w12, w12, #15028cmp x5, #1650295030b.gt .L256_enc_blocks_more_than_150315032sub w12, w12, #15033b .L256_enc_blocks_less_than_15034.L256_enc_blocks_more_than_3: //blocks left > 35035st1 { v5.16b}, [x2], #16 //AES final-3 block - store result50365037ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high5038#ifdef __AARCH64EB__5039rev x6, x65040rev x7, x75041#endif5042rev64 v4.16b, v5.16b //GHASH final-3 block50435044eor x6, x6, 
x13 //AES final-2 block - round 14 low5045eor v4.16b, v4.16b, v8.16b //feed in partial tag50465047eor x7, x7, x14 //AES final-2 block - round 14 high50485049mov d22, v4.d[1] //GHASH final-3 block - mid5050fmov d5, x6 //AES final-2 block - mov low50515052fmov v5.d[1], x7 //AES final-2 block - mov high50535054eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid5055movi v8.8b, #0 //suppress further partial tag feed in50565057mov d10, v17.d[1] //GHASH final-3 block - mid50585059pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low50605061pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high50625063pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid5064eor v5.16b, v5.16b, v1.16b //AES final-2 block - result5065.L256_enc_blocks_more_than_2: //blocks left > 250665067st1 { v5.16b}, [x2], #16 //AES final-2 block - store result50685069ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high5070#ifdef __AARCH64EB__5071rev x6, x65072rev x7, x75073#endif5074rev64 v4.16b, v5.16b //GHASH final-2 block50755076eor x6, x6, x13 //AES final-1 block - round 14 low5077eor v4.16b, v4.16b, v8.16b //feed in partial tag50785079fmov d5, x6 //AES final-1 block - mov low5080eor x7, x7, x14 //AES final-1 block - round 14 high50815082fmov v5.d[1], x7 //AES final-1 block - mov high50835084movi v8.8b, #0 //suppress further partial tag feed in50855086pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high5087mov d22, v4.d[1] //GHASH final-2 block - mid50885089pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low50905091eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid50925093eor v5.16b, v5.16b, v2.16b //AES final-1 block - result50945095eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high50965097pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid50985099eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low51005101eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid5102.L256_enc_blocks_more_than_1: //blocks left > 151035104st1 { v5.16b}, [x2], #16 //AES 
final-1 block - store result51055106rev64 v4.16b, v5.16b //GHASH final-1 block51075108ldp x6, x7, [x0], #16 //AES final block - load input low & high5109#ifdef __AARCH64EB__5110rev x6, x65111rev x7, x75112#endif5113eor v4.16b, v4.16b, v8.16b //feed in partial tag51145115movi v8.8b, #0 //suppress further partial tag feed in51165117eor x6, x6, x13 //AES final block - round 14 low5118mov d22, v4.d[1] //GHASH final-1 block - mid51195120pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high5121eor x7, x7, x14 //AES final block - round 14 high51225123eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid51245125eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high51265127ins v22.d[1], v22.d[0] //GHASH final-1 block - mid5128fmov d5, x6 //AES final block - mov low51295130fmov v5.d[1], x7 //AES final block - mov high51315132pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid51335134pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low51355136eor v5.16b, v5.16b, v3.16b //AES final block - result5137eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid51385139eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low5140.L256_enc_blocks_less_than_1: //blocks left <= 151415142and x1, x1, #127 //bit_length %= 12851435144mvn x13, xzr //rk14_l = 0xffffffffffffffff5145sub x1, x1, #128 //bit_length -= 12851465147neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])5148ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored51495150mvn x14, xzr //rk14_h = 0xffffffffffffffff5151and x1, x1, #127 //bit_length %= 12851525153lsr x14, x14, x1 //rk14_h is mask for top 64b of last block5154cmp x1, #6451555156csel x6, x13, x14, lt5157csel x7, x14, xzr, lt51585159fmov d0, x6 //ctr0b is mask for last block51605161fmov v0.d[1], x751625163and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits51645165rev64 v4.16b, v5.16b //GHASH final block51665167eor v4.16b, v4.16b, v8.16b //feed in partial 
tag51685169bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing51705171pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high5172mov d8, v4.d[1] //GHASH final block - mid5173#ifndef __AARCH64EB__5174rev w9, w125175#else5176mov w9, w125177#endif51785179pmull v21.1q, v4.1d, v12.1d //GHASH final block - low51805181eor v9.16b, v9.16b, v20.16b //GHASH final block - high5182eor v8.8b, v8.8b, v4.8b //GHASH final block - mid51835184pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid51855186eor v11.16b, v11.16b, v21.16b //GHASH final block - low51875188eor v10.16b, v10.16b, v8.16b //GHASH final block - mid5189movi v8.8b, #0xc251905191eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up51925193shl d8, d8, #56 //mod_constant51945195eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up51965197pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid51985199ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment52005201eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid52025203eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid52045205pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low52065207ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment52085209str w9, [x16, #12] //store the updated counter52105211st1 { v5.16b}, [x2] //store all 16B5212eor v11.16b, v11.16b, v9.16b //MODULO - fold into low52135214eor v11.16b, v11.16b, v10.16b //MODULO - fold into low5215ext v11.16b, v11.16b, v11.16b, #85216rev64 v11.16b, v11.16b5217mov x0, x155218st1 { v11.16b }, [x3]52195220ldp x21, x22, [sp, #16]5221ldp x23, x24, [sp, #32]5222ldp d8, d9, [sp, #48]5223ldp d10, d11, [sp, #64]5224ldp d12, d13, [sp, #80]5225ldp d14, d15, [sp, #96]5226ldp x19, x20, [sp], #1125227ret52285229.L256_enc_ret:5230mov w0, #0x05231ret5232.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel5233.globl aes_gcm_dec_256_kernel5234.type aes_gcm_dec_256_kernel,%function5235.align 45236aes_gcm_dec_256_kernel:5237AARCH64_VALID_CALL_TARGET5238cbz 
x1, .L256_dec_ret5239stp x19, x20, [sp, #-112]!5240mov x16, x45241mov x8, x55242stp x21, x22, [sp, #16]5243stp x23, x24, [sp, #32]5244stp d8, d9, [sp, #48]5245stp d10, d11, [sp, #64]5246stp d12, d13, [sp, #80]5247stp d14, d15, [sp, #96]52485249lsr x5, x1, #3 //byte_len5250mov x15, x55251ldp x10, x11, [x16] //ctr96_b64, ctr96_t325252#ifdef __AARCH64EB__5253rev x10, x105254rev x11, x115255#endif5256ldp x13, x14, [x8, #224] //load rk145257#ifdef __AARCH64EB__5258ror x14, x14, #325259ror x13, x13, #325260#endif5261ld1 {v18.4s}, [x8], #16 //load rk05262sub x5, x5, #1 //byte_len - 152635264ld1 {v19.4s}, [x8], #16 //load rk15265and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)52665267add x4, x0, x1, lsr #3 //end_input_ptr5268ld1 {v20.4s}, [x8], #16 //load rk252695270lsr x12, x11, #325271ld1 {v21.4s}, [x8], #16 //load rk35272orr w11, w11, w1152735274ld1 {v22.4s}, [x8], #16 //load rk45275add x5, x5, x05276rev w12, w12 //rev_ctr3252775278add w12, w12, #1 //increment rev_ctr325279fmov d3, x10 //CTR block 352805281rev w9, w12 //CTR block 15282add w12, w12, #1 //CTR block 15283fmov d1, x10 //CTR block 152845285orr x9, x11, x9, lsl #32 //CTR block 15286ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible52875288fmov v1.d[1], x9 //CTR block 15289rev w9, w12 //CTR block 25290add w12, w12, #1 //CTR block 252915292fmov d2, x10 //CTR block 25293orr x9, x11, x9, lsl #32 //CTR block 252945295fmov v2.d[1], x9 //CTR block 25296rev w9, w12 //CTR block 352975298orr x9, x11, x9, lsl #32 //CTR block 35299ld1 {v23.4s}, [x8], #16 //load rk553005301fmov v3.d[1], x9 //CTR block 35302add w12, w12, #1 //CTR block 353035304ld1 {v24.4s}, [x8], #16 //load rk653055306ld1 {v25.4s}, [x8], #16 //load rk753075308ld1 {v26.4s}, [x8], #16 //load rk853095310aese v0.16b, v18.16b5311aesmc v0.16b, v0.16b //AES block 0 - round 05312ldr q14, [x3, #80] //load h3l | h3h5313#ifndef 
__AARCH64EB__5314ext v14.16b, v14.16b, v14.16b, #85315#endif53165317aese v3.16b, v18.16b5318aesmc v3.16b, v3.16b //AES block 3 - round 05319ldr q15, [x3, #112] //load h4l | h4h5320#ifndef __AARCH64EB__5321ext v15.16b, v15.16b, v15.16b, #85322#endif53235324aese v1.16b, v18.16b5325aesmc v1.16b, v1.16b //AES block 1 - round 05326ldr q13, [x3, #64] //load h2l | h2h5327#ifndef __AARCH64EB__5328ext v13.16b, v13.16b, v13.16b, #85329#endif53305331aese v2.16b, v18.16b5332aesmc v2.16b, v2.16b //AES block 2 - round 05333ld1 {v27.4s}, [x8], #16 //load rk953345335aese v0.16b, v19.16b5336aesmc v0.16b, v0.16b //AES block 0 - round 153375338aese v1.16b, v19.16b5339aesmc v1.16b, v1.16b //AES block 1 - round 15340ld1 { v11.16b}, [x3]5341ext v11.16b, v11.16b, v11.16b, #85342rev64 v11.16b, v11.16b53435344aese v2.16b, v19.16b5345aesmc v2.16b, v2.16b //AES block 2 - round 15346ld1 {v28.4s}, [x8], #16 //load rk1053475348aese v3.16b, v19.16b5349aesmc v3.16b, v3.16b //AES block 3 - round 15350ld1 {v29.4s}, [x8], #16 //load rk1153515352aese v0.16b, v20.16b5353aesmc v0.16b, v0.16b //AES block 0 - round 25354ldr q12, [x3, #32] //load h1l | h1h5355#ifndef __AARCH64EB__5356ext v12.16b, v12.16b, v12.16b, #85357#endif5358aese v2.16b, v20.16b5359aesmc v2.16b, v2.16b //AES block 2 - round 25360ld1 {v30.4s}, [x8], #16 //load rk1253615362aese v3.16b, v20.16b5363aesmc v3.16b, v3.16b //AES block 3 - round 253645365aese v0.16b, v21.16b5366aesmc v0.16b, v0.16b //AES block 0 - round 353675368aese v1.16b, v20.16b5369aesmc v1.16b, v1.16b //AES block 1 - round 253705371aese v3.16b, v21.16b5372aesmc v3.16b, v3.16b //AES block 3 - round 353735374aese v0.16b, v22.16b5375aesmc v0.16b, v0.16b //AES block 0 - round 45376cmp x0, x5 //check if we have <= 4 blocks53775378aese v2.16b, v21.16b5379aesmc v2.16b, v2.16b //AES block 2 - round 353805381aese v1.16b, v21.16b5382aesmc v1.16b, v1.16b //AES block 1 - round 353835384aese v3.16b, v22.16b5385aesmc v3.16b, v3.16b //AES block 3 - round 453865387aese v2.16b, 
v22.16b5388aesmc v2.16b, v2.16b //AES block 2 - round 453895390aese v1.16b, v22.16b5391aesmc v1.16b, v1.16b //AES block 1 - round 453925393aese v3.16b, v23.16b5394aesmc v3.16b, v3.16b //AES block 3 - round 553955396aese v0.16b, v23.16b5397aesmc v0.16b, v0.16b //AES block 0 - round 553985399aese v1.16b, v23.16b5400aesmc v1.16b, v1.16b //AES block 1 - round 554015402aese v2.16b, v23.16b5403aesmc v2.16b, v2.16b //AES block 2 - round 554045405aese v0.16b, v24.16b5406aesmc v0.16b, v0.16b //AES block 0 - round 654075408aese v3.16b, v24.16b5409aesmc v3.16b, v3.16b //AES block 3 - round 654105411aese v1.16b, v24.16b5412aesmc v1.16b, v1.16b //AES block 1 - round 654135414aese v2.16b, v24.16b5415aesmc v2.16b, v2.16b //AES block 2 - round 654165417aese v0.16b, v25.16b5418aesmc v0.16b, v0.16b //AES block 0 - round 754195420aese v1.16b, v25.16b5421aesmc v1.16b, v1.16b //AES block 1 - round 754225423aese v3.16b, v25.16b5424aesmc v3.16b, v3.16b //AES block 3 - round 754255426aese v0.16b, v26.16b5427aesmc v0.16b, v0.16b //AES block 0 - round 854285429aese v2.16b, v25.16b5430aesmc v2.16b, v2.16b //AES block 2 - round 754315432aese v3.16b, v26.16b5433aesmc v3.16b, v3.16b //AES block 3 - round 854345435aese v1.16b, v26.16b5436aesmc v1.16b, v1.16b //AES block 1 - round 854375438aese v0.16b, v27.16b5439aesmc v0.16b, v0.16b //AES block 0 - round 954405441aese v2.16b, v26.16b5442aesmc v2.16b, v2.16b //AES block 2 - round 85443ld1 {v31.4s}, [x8], #16 //load rk1354445445aese v1.16b, v27.16b5446aesmc v1.16b, v1.16b //AES block 1 - round 954475448aese v0.16b, v28.16b5449aesmc v0.16b, v0.16b //AES block 0 - round 1054505451aese v3.16b, v27.16b5452aesmc v3.16b, v3.16b //AES block 3 - round 954535454aese v1.16b, v28.16b5455aesmc v1.16b, v1.16b //AES block 1 - round 1054565457aese v2.16b, v27.16b5458aesmc v2.16b, v2.16b //AES block 2 - round 954595460aese v3.16b, v28.16b5461aesmc v3.16b, v3.16b //AES block 3 - round 1054625463aese v0.16b, v29.16b5464aesmc v0.16b, v0.16b //AES block 0 - round 
1154655466aese v2.16b, v28.16b5467aesmc v2.16b, v2.16b //AES block 2 - round 1054685469aese v3.16b, v29.16b5470aesmc v3.16b, v3.16b //AES block 3 - round 1154715472aese v1.16b, v29.16b5473aesmc v1.16b, v1.16b //AES block 1 - round 1154745475aese v2.16b, v29.16b5476aesmc v2.16b, v2.16b //AES block 2 - round 1154775478trn1 v9.2d, v14.2d, v15.2d //h4h | h3h54795480trn2 v17.2d, v14.2d, v15.2d //h4l | h3l54815482trn1 v8.2d, v12.2d, v13.2d //h2h | h1h5483trn2 v16.2d, v12.2d, v13.2d //h2l | h1l54845485aese v1.16b, v30.16b5486aesmc v1.16b, v1.16b //AES block 1 - round 1254875488aese v0.16b, v30.16b5489aesmc v0.16b, v0.16b //AES block 0 - round 1254905491aese v2.16b, v30.16b5492aesmc v2.16b, v2.16b //AES block 2 - round 1254935494aese v3.16b, v30.16b5495aesmc v3.16b, v3.16b //AES block 3 - round 125496eor v17.16b, v17.16b, v9.16b //h4k | h3k54975498aese v1.16b, v31.16b //AES block 1 - round 1354995500aese v2.16b, v31.16b //AES block 2 - round 135501eor v16.16b, v16.16b, v8.16b //h2k | h1k55025503aese v3.16b, v31.16b //AES block 3 - round 1355045505aese v0.16b, v31.16b //AES block 0 - round 135506b.ge .L256_dec_tail //handle tail55075508ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext55095510rev w9, w12 //CTR block 455115512eor v0.16b, v4.16b, v0.16b //AES block 0 - result55135514eor v1.16b, v5.16b, v1.16b //AES block 1 - result5515rev64 v5.16b, v5.16b //GHASH block 15516ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext55175518mov x7, v0.d[1] //AES block 0 - mov high55195520mov x6, v0.d[0] //AES block 0 - mov low5521rev64 v4.16b, v4.16b //GHASH block 05522add w12, w12, #1 //CTR block 455235524fmov d0, x10 //CTR block 45525orr x9, x11, x9, lsl #32 //CTR block 455265527fmov v0.d[1], x9 //CTR block 45528rev w9, w12 //CTR block 55529add w12, w12, #1 //CTR block 555305531mov x19, v1.d[0] //AES block 1 - mov low55325533orr x9, x11, x9, lsl #32 //CTR block 55534mov x20, v1.d[1] //AES block 1 - mov high5535eor x7, x7, x14 //AES block 0 - round 14 
high5536#ifdef __AARCH64EB__5537rev x7, x75538#endif5539eor x6, x6, x13 //AES block 0 - round 14 low5540#ifdef __AARCH64EB__5541rev x6, x65542#endif5543stp x6, x7, [x2], #16 //AES block 0 - store result5544fmov d1, x10 //CTR block 555455546ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext55475548fmov v1.d[1], x9 //CTR block 55549rev w9, w12 //CTR block 65550add w12, w12, #1 //CTR block 655515552eor x19, x19, x13 //AES block 1 - round 14 low5553#ifdef __AARCH64EB__5554rev x19, x195555#endif5556orr x9, x11, x9, lsl #32 //CTR block 655575558eor x20, x20, x14 //AES block 1 - round 14 high5559#ifdef __AARCH64EB__5560rev x20, x205561#endif5562stp x19, x20, [x2], #16 //AES block 1 - store result55635564eor v2.16b, v6.16b, v2.16b //AES block 2 - result5565cmp x0, x5 //check if we have <= 8 blocks5566b.ge .L256_dec_prepretail //do prepretail55675568.L256_dec_main_loop: //main loop start5569mov x21, v2.d[0] //AES block 4k+2 - mov low5570ext v11.16b, v11.16b, v11.16b, #8 //PRE 05571eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result55725573aese v0.16b, v18.16b5574aesmc v0.16b, v0.16b //AES block 4k+4 - round 05575mov x22, v2.d[1] //AES block 4k+2 - mov high55765577aese v1.16b, v18.16b5578aesmc v1.16b, v1.16b //AES block 4k+5 - round 05579fmov d2, x10 //CTR block 4k+655805581fmov v2.d[1], x9 //CTR block 4k+65582eor v4.16b, v4.16b, v11.16b //PRE 15583rev w9, w12 //CTR block 4k+755845585aese v0.16b, v19.16b5586aesmc v0.16b, v0.16b //AES block 4k+4 - round 15587mov x24, v3.d[1] //AES block 4k+3 - mov high55885589aese v1.16b, v19.16b5590aesmc v1.16b, v1.16b //AES block 4k+5 - round 15591mov x23, v3.d[0] //AES block 4k+3 - mov low55925593pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high5594mov d8, v4.d[1] //GHASH block 4k - mid5595fmov d3, x10 //CTR block 4k+755965597aese v0.16b, v20.16b5598aesmc v0.16b, v0.16b //AES block 4k+4 - round 25599orr x9, x11, x9, lsl #32 //CTR block 4k+756005601aese v2.16b, v18.16b5602aesmc v2.16b, v2.16b //AES block 4k+6 - round 05603fmov 
v3.d[1], x9 //CTR block 4k+756045605aese v1.16b, v20.16b5606aesmc v1.16b, v1.16b //AES block 4k+5 - round 25607eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid56085609aese v0.16b, v21.16b5610aesmc v0.16b, v0.16b //AES block 4k+4 - round 35611eor x22, x22, x14 //AES block 4k+2 - round 14 high5612#ifdef __AARCH64EB__5613rev x22, x225614#endif5615aese v2.16b, v19.16b5616aesmc v2.16b, v2.16b //AES block 4k+6 - round 15617mov d10, v17.d[1] //GHASH block 4k - mid56185619aese v1.16b, v21.16b5620aesmc v1.16b, v1.16b //AES block 4k+5 - round 35621rev64 v6.16b, v6.16b //GHASH block 4k+256225623aese v3.16b, v18.16b5624aesmc v3.16b, v3.16b //AES block 4k+7 - round 05625eor x21, x21, x13 //AES block 4k+2 - round 14 low5626#ifdef __AARCH64EB__5627rev x21, x215628#endif5629aese v2.16b, v20.16b5630aesmc v2.16b, v2.16b //AES block 4k+6 - round 25631stp x21, x22, [x2], #16 //AES block 4k+2 - store result56325633pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low56345635pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high56365637aese v2.16b, v21.16b5638aesmc v2.16b, v2.16b //AES block 4k+6 - round 35639rev64 v7.16b, v7.16b //GHASH block 4k+356405641pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid5642eor x23, x23, x13 //AES block 4k+3 - round 14 low5643#ifdef __AARCH64EB__5644rev x23, x235645#endif5646pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low5647eor x24, x24, x14 //AES block 4k+3 - round 14 high5648#ifdef __AARCH64EB__5649rev x24, x245650#endif5651eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high56525653aese v2.16b, v22.16b5654aesmc v2.16b, v2.16b //AES block 4k+6 - round 456555656aese v3.16b, v19.16b5657aesmc v3.16b, v3.16b //AES block 4k+7 - round 15658mov d4, v5.d[1] //GHASH block 4k+1 - mid56595660aese v0.16b, v22.16b5661aesmc v0.16b, v0.16b //AES block 4k+4 - round 45662eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low56635664aese v2.16b, v23.16b5665aesmc v2.16b, v2.16b //AES block 4k+6 - round 55666add w12, w12, #1 //CTR block 4k+756675668aese v3.16b, 
v20.16b5669aesmc v3.16b, v3.16b //AES block 4k+7 - round 25670mov d8, v6.d[1] //GHASH block 4k+2 - mid56715672aese v1.16b, v22.16b5673aesmc v1.16b, v1.16b //AES block 4k+5 - round 45674eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid56755676pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low56775678aese v3.16b, v21.16b5679aesmc v3.16b, v3.16b //AES block 4k+7 - round 35680eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid56815682aese v1.16b, v23.16b5683aesmc v1.16b, v1.16b //AES block 4k+5 - round 556845685aese v0.16b, v23.16b5686aesmc v0.16b, v0.16b //AES block 4k+4 - round 55687eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low56885689pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid5690rev w9, w12 //CTR block 4k+856915692aese v1.16b, v24.16b5693aesmc v1.16b, v1.16b //AES block 4k+5 - round 65694ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid56955696aese v0.16b, v24.16b5697aesmc v0.16b, v0.16b //AES block 4k+4 - round 65698add w12, w12, #1 //CTR block 4k+856995700aese v3.16b, v22.16b5701aesmc v3.16b, v3.16b //AES block 4k+7 - round 457025703aese v1.16b, v25.16b5704aesmc v1.16b, v1.16b //AES block 4k+5 - round 75705eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid57065707aese v0.16b, v25.16b5708aesmc v0.16b, v0.16b //AES block 4k+4 - round 757095710pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high5711mov d6, v7.d[1] //GHASH block 4k+3 - mid57125713aese v3.16b, v23.16b5714aesmc v3.16b, v3.16b //AES block 4k+7 - round 557155716pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid57175718aese v0.16b, v26.16b5719aesmc v0.16b, v0.16b //AES block 4k+4 - round 85720eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high57215722aese v3.16b, v24.16b5723aesmc v3.16b, v3.16b //AES block 4k+7 - round 657245725pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low5726orr x9, x11, x9, lsl #32 //CTR block 4k+85727eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid57285729pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high57305731aese v0.16b, v27.16b5732aesmc v0.16b, v0.16b 
//AES block 4k+4 - round 95733eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid57345735aese v1.16b, v26.16b5736aesmc v1.16b, v1.16b //AES block 4k+5 - round 857375738aese v2.16b, v24.16b5739aesmc v2.16b, v2.16b //AES block 4k+6 - round 65740eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high57415742aese v0.16b, v28.16b5743aesmc v0.16b, v0.16b //AES block 4k+4 - round 1057445745pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid5746movi v8.8b, #0xc257475748aese v2.16b, v25.16b5749aesmc v2.16b, v2.16b //AES block 4k+6 - round 75750eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low57515752aese v0.16b, v29.16b5753aesmc v0.16b, v0.16b //AES block 4k+4 - round 1157545755aese v3.16b, v25.16b5756aesmc v3.16b, v3.16b //AES block 4k+7 - round 75757shl d8, d8, #56 //mod_constant57585759aese v2.16b, v26.16b5760aesmc v2.16b, v2.16b //AES block 4k+6 - round 85761eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid57625763aese v0.16b, v30.16b5764aesmc v0.16b, v0.16b //AES block 4k+4 - round 1257655766pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid5767eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up57685769aese v1.16b, v27.16b5770aesmc v1.16b, v1.16b //AES block 4k+5 - round 95771ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext57725773aese v0.16b, v31.16b //AES block 4k+4 - round 135774ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment57755776aese v1.16b, v28.16b5777aesmc v1.16b, v1.16b //AES block 4k+5 - round 105778eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up57795780aese v2.16b, v27.16b5781aesmc v2.16b, v2.16b //AES block 4k+6 - round 95782ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext57835784aese v3.16b, v26.16b5785aesmc v3.16b, v3.16b //AES block 4k+7 - round 85786eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result57875788aese v1.16b, v29.16b5789aesmc v1.16b, v1.16b //AES block 4k+5 - round 115790stp x23, x24, [x2], #16 //AES block 4k+3 - store result57915792aese v2.16b, v28.16b5793aesmc v2.16b, v2.16b 
//AES block 4k+6 - round 105794eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid57955796aese v3.16b, v27.16b5797aesmc v3.16b, v3.16b //AES block 4k+7 - round 95798ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext57995800aese v1.16b, v30.16b5801aesmc v1.16b, v1.16b //AES block 4k+5 - round 125802ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext58035804aese v2.16b, v29.16b5805aesmc v2.16b, v2.16b //AES block 4k+6 - round 115806mov x7, v0.d[1] //AES block 4k+4 - mov high58075808aese v3.16b, v28.16b5809aesmc v3.16b, v3.16b //AES block 4k+7 - round 105810eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid58115812aese v1.16b, v31.16b //AES block 4k+5 - round 135813mov x6, v0.d[0] //AES block 4k+4 - mov low58145815aese v2.16b, v30.16b5816aesmc v2.16b, v2.16b //AES block 4k+6 - round 125817fmov d0, x10 //CTR block 4k+858185819aese v3.16b, v29.16b5820aesmc v3.16b, v3.16b //AES block 4k+7 - round 115821fmov v0.d[1], x9 //CTR block 4k+858225823pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low5824eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result5825rev w9, w12 //CTR block 4k+958265827aese v2.16b, v31.16b //AES block 4k+6 - round 135828orr x9, x11, x9, lsl #32 //CTR block 4k+95829cmp x0, x5 //.LOOP CONTROL58305831add w12, w12, #1 //CTR block 4k+958325833eor x6, x6, x13 //AES block 4k+4 - round 14 low5834#ifdef __AARCH64EB__5835rev x6, x65836#endif5837eor x7, x7, x14 //AES block 4k+4 - round 14 high5838#ifdef __AARCH64EB__5839rev x7, x75840#endif5841mov x20, v1.d[1] //AES block 4k+5 - mov high5842eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result5843eor v11.16b, v11.16b, v8.16b //MODULO - fold into low58445845aese v3.16b, v30.16b5846aesmc v3.16b, v3.16b //AES block 4k+7 - round 125847mov x19, v1.d[0] //AES block 4k+5 - mov low58485849fmov d1, x10 //CTR block 4k+95850ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment58515852fmov v1.d[1], x9 //CTR block 4k+95853rev w9, w12 //CTR block 4k+105854add w12, w12, #1 //CTR block 
4k+1058555856aese v3.16b, v31.16b //AES block 4k+7 - round 135857orr x9, x11, x9, lsl #32 //CTR block 4k+1058585859rev64 v5.16b, v5.16b //GHASH block 4k+55860eor x20, x20, x14 //AES block 4k+5 - round 14 high5861#ifdef __AARCH64EB__5862rev x20, x205863#endif5864stp x6, x7, [x2], #16 //AES block 4k+4 - store result58655866eor x19, x19, x13 //AES block 4k+5 - round 14 low5867#ifdef __AARCH64EB__5868rev x19, x195869#endif5870stp x19, x20, [x2], #16 //AES block 4k+5 - store result58715872rev64 v4.16b, v4.16b //GHASH block 4k+45873eor v11.16b, v11.16b, v10.16b //MODULO - fold into low5874b.lt .L256_dec_main_loop587558765877.L256_dec_prepretail: //PREPRETAIL5878ext v11.16b, v11.16b, v11.16b, #8 //PRE 05879mov x21, v2.d[0] //AES block 4k+2 - mov low5880eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result58815882aese v0.16b, v18.16b5883aesmc v0.16b, v0.16b //AES block 4k+4 - round 05884mov x22, v2.d[1] //AES block 4k+2 - mov high58855886aese v1.16b, v18.16b5887aesmc v1.16b, v1.16b //AES block 4k+5 - round 05888fmov d2, x10 //CTR block 4k+658895890fmov v2.d[1], x9 //CTR block 4k+65891rev w9, w12 //CTR block 4k+75892eor v4.16b, v4.16b, v11.16b //PRE 158935894rev64 v6.16b, v6.16b //GHASH block 4k+25895orr x9, x11, x9, lsl #32 //CTR block 4k+75896mov x23, v3.d[0] //AES block 4k+3 - mov low58975898aese v1.16b, v19.16b5899aesmc v1.16b, v1.16b //AES block 4k+5 - round 15900mov x24, v3.d[1] //AES block 4k+3 - mov high59015902pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low5903mov d8, v4.d[1] //GHASH block 4k - mid5904fmov d3, x10 //CTR block 4k+759055906pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high5907fmov v3.d[1], x9 //CTR block 4k+759085909aese v2.16b, v18.16b5910aesmc v2.16b, v2.16b //AES block 4k+6 - round 05911mov d10, v17.d[1] //GHASH block 4k - mid59125913aese v0.16b, v19.16b5914aesmc v0.16b, v0.16b //AES block 4k+4 - round 15915eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid59165917pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high59185919aese v2.16b, 
v19.16b5920aesmc v2.16b, v2.16b //AES block 4k+6 - round 15921rev64 v7.16b, v7.16b //GHASH block 4k+359225923aese v3.16b, v18.16b5924aesmc v3.16b, v3.16b //AES block 4k+7 - round 059255926pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid5927eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high59285929pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low59305931aese v3.16b, v19.16b5932aesmc v3.16b, v3.16b //AES block 4k+7 - round 15933mov d4, v5.d[1] //GHASH block 4k+1 - mid59345935aese v0.16b, v20.16b5936aesmc v0.16b, v0.16b //AES block 4k+4 - round 259375938aese v1.16b, v20.16b5939aesmc v1.16b, v1.16b //AES block 4k+5 - round 25940eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low59415942aese v2.16b, v20.16b5943aesmc v2.16b, v2.16b //AES block 4k+6 - round 259445945aese v0.16b, v21.16b5946aesmc v0.16b, v0.16b //AES block 4k+4 - round 35947mov d8, v6.d[1] //GHASH block 4k+2 - mid59485949aese v3.16b, v20.16b5950aesmc v3.16b, v3.16b //AES block 4k+7 - round 25951eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid59525953pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low59545955aese v0.16b, v22.16b5956aesmc v0.16b, v0.16b //AES block 4k+4 - round 459575958aese v3.16b, v21.16b5959aesmc v3.16b, v3.16b //AES block 4k+7 - round 35960eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid59615962pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid59635964aese v0.16b, v23.16b5965aesmc v0.16b, v0.16b //AES block 4k+4 - round 55966eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low59675968aese v3.16b, v22.16b5969aesmc v3.16b, v3.16b //AES block 4k+7 - round 459705971pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high5972eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid59735974pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high59755976aese v3.16b, v23.16b5977aesmc v3.16b, v3.16b //AES block 4k+7 - round 55978ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid59795980aese v2.16b, v21.16b5981aesmc v2.16b, v2.16b //AES block 4k+6 - round 359825983aese v1.16b, v21.16b5984aesmc v1.16b, 
v1.16b //AES block 4k+5 - round 35985eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high59865987pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low59885989aese v2.16b, v22.16b5990aesmc v2.16b, v2.16b //AES block 4k+6 - round 45991mov d6, v7.d[1] //GHASH block 4k+3 - mid59925993aese v1.16b, v22.16b5994aesmc v1.16b, v1.16b //AES block 4k+5 - round 459955996pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid59975998aese v2.16b, v23.16b5999aesmc v2.16b, v2.16b //AES block 4k+6 - round 56000eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid60016002aese v1.16b, v23.16b6003aesmc v1.16b, v1.16b //AES block 4k+5 - round 560046005aese v3.16b, v24.16b6006aesmc v3.16b, v3.16b //AES block 4k+7 - round 66007eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid60086009aese v2.16b, v24.16b6010aesmc v2.16b, v2.16b //AES block 4k+6 - round 660116012aese v0.16b, v24.16b6013aesmc v0.16b, v0.16b //AES block 4k+4 - round 66014movi v8.8b, #0xc260156016aese v1.16b, v24.16b6017aesmc v1.16b, v1.16b //AES block 4k+5 - round 66018eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low60196020pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid60216022aese v3.16b, v25.16b6023aesmc v3.16b, v3.16b //AES block 4k+7 - round 76024eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high60256026aese v1.16b, v25.16b6027aesmc v1.16b, v1.16b //AES block 4k+5 - round 760286029aese v0.16b, v25.16b6030aesmc v0.16b, v0.16b //AES block 4k+4 - round 76031eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid60326033aese v3.16b, v26.16b6034aesmc v3.16b, v3.16b //AES block 4k+7 - round 860356036aese v2.16b, v25.16b6037aesmc v2.16b, v2.16b //AES block 4k+6 - round 76038eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up60396040aese v1.16b, v26.16b6041aesmc v1.16b, v1.16b //AES block 4k+5 - round 860426043aese v0.16b, v26.16b6044aesmc v0.16b, v0.16b //AES block 4k+4 - round 86045shl d8, d8, #56 //mod_constant60466047aese v2.16b, v26.16b6048aesmc v2.16b, v2.16b //AES block 4k+6 - round 860496050aese v1.16b, 
v27.16b6051aesmc v1.16b, v1.16b //AES block 4k+5 - round 96052eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up60536054pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid60556056aese v2.16b, v27.16b6057aesmc v2.16b, v2.16b //AES block 4k+6 - round 96058ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment60596060aese v3.16b, v27.16b6061aesmc v3.16b, v3.16b //AES block 4k+7 - round 960626063aese v0.16b, v27.16b6064aesmc v0.16b, v0.16b //AES block 4k+4 - round 96065eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid60666067aese v2.16b, v28.16b6068aesmc v2.16b, v2.16b //AES block 4k+6 - round 1060696070aese v3.16b, v28.16b6071aesmc v3.16b, v3.16b //AES block 4k+7 - round 1060726073aese v0.16b, v28.16b6074aesmc v0.16b, v0.16b //AES block 4k+4 - round 106075eor x22, x22, x14 //AES block 4k+2 - round 14 high6076#ifdef __AARCH64EB__6077rev x22, x226078#endif6079aese v1.16b, v28.16b6080aesmc v1.16b, v1.16b //AES block 4k+5 - round 106081eor x23, x23, x13 //AES block 4k+3 - round 14 low6082#ifdef __AARCH64EB__6083rev x23, x236084#endif6085aese v2.16b, v29.16b6086aesmc v2.16b, v2.16b //AES block 4k+6 - round 116087eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid60886089aese v0.16b, v29.16b6090aesmc v0.16b, v0.16b //AES block 4k+4 - round 116091add w12, w12, #1 //CTR block 4k+760926093aese v1.16b, v29.16b6094aesmc v1.16b, v1.16b //AES block 4k+5 - round 116095eor x21, x21, x13 //AES block 4k+2 - round 14 low6096#ifdef __AARCH64EB__6097rev x21, x216098#endif60996100aese v2.16b, v30.16b6101aesmc v2.16b, v2.16b //AES block 4k+6 - round 1261026103pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low6104eor x24, x24, x14 //AES block 4k+3 - round 14 high6105#ifdef __AARCH64EB__6106rev x24, x246107#endif61086109aese v3.16b, v29.16b6110aesmc v3.16b, v3.16b //AES block 4k+7 - round 116111stp x21, x22, [x2], #16 //AES block 4k+2 - store result61126113aese v1.16b, v30.16b6114aesmc v1.16b, v1.16b //AES block 4k+5 - round 126115ext v10.16b, v10.16b, 
v10.16b, #8 //MODULO - other mid alignment61166117aese v0.16b, v30.16b6118aesmc v0.16b, v0.16b //AES block 4k+4 - round 126119stp x23, x24, [x2], #16 //AES block 4k+3 - store result61206121aese v3.16b, v30.16b6122aesmc v3.16b, v3.16b //AES block 4k+7 - round 126123eor v11.16b, v11.16b, v8.16b //MODULO - fold into low61246125aese v1.16b, v31.16b //AES block 4k+5 - round 1361266127aese v0.16b, v31.16b //AES block 4k+4 - round 1361286129aese v3.16b, v31.16b //AES block 4k+7 - round 1361306131aese v2.16b, v31.16b //AES block 4k+6 - round 136132eor v11.16b, v11.16b, v10.16b //MODULO - fold into low6133.L256_dec_tail: //TAIL61346135sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process6136ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext61376138eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result61396140mov x6, v0.d[0] //AES block 4k+4 - mov low61416142mov x7, v0.d[1] //AES block 4k+4 - mov high6143ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag61446145cmp x5, #4861466147eor x6, x6, x13 //AES block 4k+4 - round 14 low6148#ifdef __AARCH64EB__6149rev x6, x66150#endif61516152eor x7, x7, x14 //AES block 4k+4 - round 14 high6153#ifdef __AARCH64EB__6154rev x7, x76155#endif6156b.gt .L256_dec_blocks_more_than_361576158sub w12, w12, #16159mov v3.16b, v2.16b6160movi v10.8b, #061616162movi v11.8b, #06163cmp x5, #3261646165movi v9.8b, #06166mov v2.16b, v1.16b6167b.gt .L256_dec_blocks_more_than_261686169sub w12, w12, #161706171mov v3.16b, v1.16b6172cmp x5, #166173b.gt .L256_dec_blocks_more_than_161746175sub w12, w12, #16176b .L256_dec_blocks_less_than_16177.L256_dec_blocks_more_than_3: //blocks left > 36178rev64 v4.16b, v5.16b //GHASH final-3 block6179ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext61806181stp x6, x7, [x2], #16 //AES final-3 block - store result61826183mov d10, v17.d[1] //GHASH final-3 block - mid61846185eor v4.16b, v4.16b, v8.16b //feed in partial tag61866187eor v0.16b, v5.16b, v1.16b //AES final-2 block - 
result61886189mov d22, v4.d[1] //GHASH final-3 block - mid61906191mov x6, v0.d[0] //AES final-2 block - mov low61926193mov x7, v0.d[1] //AES final-2 block - mov high61946195eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid61966197movi v8.8b, #0 //suppress further partial tag feed in61986199pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high62006201pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid6202eor x6, x6, x13 //AES final-2 block - round 14 low6203#ifdef __AARCH64EB__6204rev x6, x66205#endif62066207pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low6208eor x7, x7, x14 //AES final-2 block - round 14 high6209#ifdef __AARCH64EB__6210rev x7, x76211#endif6212.L256_dec_blocks_more_than_2: //blocks left > 262136214rev64 v4.16b, v5.16b //GHASH final-2 block6215ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext62166217eor v4.16b, v4.16b, v8.16b //feed in partial tag6218stp x6, x7, [x2], #16 //AES final-2 block - store result62196220eor v0.16b, v5.16b, v2.16b //AES final-1 block - result62216222mov d22, v4.d[1] //GHASH final-2 block - mid62236224pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low62256226pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high62276228eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid6229mov x6, v0.d[0] //AES final-1 block - mov low62306231mov x7, v0.d[1] //AES final-1 block - mov high6232eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low6233movi v8.8b, #0 //suppress further partial tag feed in62346235pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid62366237eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high6238eor x6, x6, x13 //AES final-1 block - round 14 low6239#ifdef __AARCH64EB__6240rev x6, x66241#endif62426243eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid6244eor x7, x7, x14 //AES final-1 block - round 14 high6245#ifdef __AARCH64EB__6246rev x7, x76247#endif6248.L256_dec_blocks_more_than_1: //blocks left > 162496250stp x6, x7, [x2], #16 //AES final-1 block - store 
result6251rev64 v4.16b, v5.16b //GHASH final-1 block62526253ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext62546255eor v4.16b, v4.16b, v8.16b //feed in partial tag6256movi v8.8b, #0 //suppress further partial tag feed in62576258mov d22, v4.d[1] //GHASH final-1 block - mid62596260eor v0.16b, v5.16b, v3.16b //AES final block - result62616262pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high62636264eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid62656266pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low6267mov x6, v0.d[0] //AES final block - mov low62686269ins v22.d[1], v22.d[0] //GHASH final-1 block - mid62706271mov x7, v0.d[1] //AES final block - mov high62726273pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid6274eor x6, x6, x13 //AES final block - round 14 low6275#ifdef __AARCH64EB__6276rev x6, x66277#endif6278eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low62796280eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high62816282eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid6283eor x7, x7, x14 //AES final block - round 14 high6284#ifdef __AARCH64EB__6285rev x7, x76286#endif6287.L256_dec_blocks_less_than_1: //blocks left <= 162886289and x1, x1, #127 //bit_length %= 1286290mvn x14, xzr //rk14_h = 0xffffffffffffffff62916292sub x1, x1, #128 //bit_length -= 1286293mvn x13, xzr //rk14_l = 0xffffffffffffffff62946295ldp x4, x5, [x2] //load existing bytes we need to not overwrite6296neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])62976298and x1, x1, #127 //bit_length %= 12862996300lsr x14, x14, x1 //rk14_h is mask for top 64b of last block6301cmp x1, #6463026303csel x9, x13, x14, lt6304csel x10, x14, xzr, lt63056306fmov d0, x9 //ctr0b is mask for last block6307and x6, x6, x963086309mov v0.d[1], x106310bic x4, x4, x9 //mask out low existing bytes63116312#ifndef __AARCH64EB__6313rev w9, w126314#else6315mov w9, w126316#endif63176318bic x5, x5, x10 //mask out high existing bytes63196320orr x6, x6, 
x463216322and x7, x7, x1063236324orr x7, x7, x563256326and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits63276328rev64 v4.16b, v5.16b //GHASH final block63296330eor v4.16b, v4.16b, v8.16b //feed in partial tag63316332pmull v21.1q, v4.1d, v12.1d //GHASH final block - low63336334mov d8, v4.d[1] //GHASH final block - mid63356336eor v8.8b, v8.8b, v4.8b //GHASH final block - mid63376338pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high63396340pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid63416342eor v9.16b, v9.16b, v20.16b //GHASH final block - high63436344eor v11.16b, v11.16b, v21.16b //GHASH final block - low63456346eor v10.16b, v10.16b, v8.16b //GHASH final block - mid6347movi v8.8b, #0xc263486349eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up63506351shl d8, d8, #56 //mod_constant63526353eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up63546355pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid63566357ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment63586359eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid63606361eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid63626363pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low63646365ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment63666367eor v11.16b, v11.16b, v8.16b //MODULO - fold into low63686369stp x6, x7, [x2]63706371str w9, [x16, #12] //store the updated counter63726373eor v11.16b, v11.16b, v10.16b //MODULO - fold into low6374ext v11.16b, v11.16b, v11.16b, #86375rev64 v11.16b, v11.16b6376mov x0, x156377st1 { v11.16b }, [x3]63786379ldp x21, x22, [sp, #16]6380ldp x23, x24, [sp, #32]6381ldp d8, d9, [sp, #48]6382ldp d10, d11, [sp, #64]6383ldp d12, d13, [sp, #80]6384ldp d14, d15, [sp, #96]6385ldp x19, x20, [sp], #1126386ret63876388.L256_dec_ret:6389mov w0, #0x06390ret6391.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel6392.section .rodata6393.byte 
71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,06394.align 26395.align 26396#endif639763986399