Path: blob/main/sys/crypto/openssl/aarch64/vpaes-armv8.S
/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
#include "arm_arch.h"

.section .rodata

.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt: // decryption input transform
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo: // decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9: // decryption sbox output *9*u, *9*t
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd: // decryption sbox output *D*u, *D*t
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb: // decryption sbox output *B*u, *B*t
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe: // decryption sbox output *E*u, *E*t
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6

.text

//
// _aes_preheat
//
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
//
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
adrp x10, .Lk_inv
add x10, x10, #:lo12:.Lk_inv
movi v17.16b, #0x0f
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
// _aes_encrypt_core
//
// AES-encrypt %xmm0.
//
// Inputs:
// %xmm0 = input
// %xmm9-%xmm15 as in _vpaes_preheat
// (%rdx) = scheduled keys
//
// Output in %xmm0
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
// Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
mov x9, x2
ldr w8, [x2,#240] // pull rounds
adrp x11, .Lk_mc_forward+16
add x11, x11, #:lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
b .Lenc_entry

.align 4
.Lenc_loop:
// middle of middle round
add x10, x11, #0x40
tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
sub w8, w8, #1 // nr--

.Lenc_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
cbnz w8, .Lenc_loop

// middle of last round
add x10, x11, #0x80
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
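
//
// Illustrative note: the core above never consults a 256-byte S-box
// table.  Each byte is split into its low nibble (the #0x0f mask kept
// in v17) and its high nibble (ushr #4), and every nonlinear step
// becomes a pair of 16-entry tbl lookups combined with eor.  A rough
// scalar C model of one such step (a sketch only; lo_tab, hi_tab, rk,
// in and out are stand-ins, not symbols defined in this file):
//
//	for (int i = 0; i < 16; i++) {
//		unsigned lo = in[i] & 0x0f;	/* vpand %xmm9  / and v17   */
//		unsigned hi = in[i] >> 4;	/* vpsrlb $4    / ushr #4   */
//		out[i] = lo_tab[lo] ^ hi_tab[hi] ^ rk[i];	/* two tbl + eor */
//	}
//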
.globl vpaes_encrypt
.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1 {v7.16b}, [x0]
bl _vpaes_encrypt_preheat
bl _vpaes_encrypt_core
st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_encrypt,.-vpaes_encrypt
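
//
// Typical single-block use from C.  The prototypes below follow the
// AES_encrypt-style convention these entry points are given elsewhere
// in the tree; they are shown only as an illustrative sketch and are
// not declared by this file:
//
//	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//	                           AES_KEY *key);
//	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//	                   const AES_KEY *key);
//
//	AES_KEY ks;
//	vpaes_set_encrypt_key(key_bytes, 128, &ks);
//	vpaes_encrypt(plaintext, ciphertext, &ks);	// exactly one 16-byte block
//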
.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
mov x9, x2
ldr w8, [x2,#240] // pull rounds
adrp x11, .Lk_mc_forward+16
add x11, x11, #:lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
and v9.16b, v15.16b, v17.16b
ushr v8.16b, v15.16b, #4
tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
tbl v9.16b, {v20.16b}, v9.16b
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
tbl v10.16b, {v21.16b}, v8.16b
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v8.16b, v9.16b, v16.16b
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
eor v8.16b, v8.16b, v10.16b
b .Lenc_2x_entry

.align 4
.Lenc_2x_loop:
// middle of middle round
add x10, x11, #0x40
tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
tbl v12.16b, {v25.16b}, v10.16b
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
tbl v8.16b, {v24.16b}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
tbl v13.16b, {v27.16b}, v10.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
tbl v10.16b, {v26.16b}, v11.16b
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
tbl v11.16b, {v8.16b}, v1.16b
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
eor v10.16b, v10.16b, v13.16b
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
tbl v8.16b, {v8.16b}, v4.16b
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
eor v11.16b, v11.16b, v10.16b
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
tbl v12.16b, {v11.16b},v1.16b
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
eor v8.16b, v8.16b, v11.16b
and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
eor v8.16b, v8.16b, v12.16b
sub w8, w8, #1 // nr--

.Lenc_2x_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
and v9.16b, v8.16b, v17.16b
ushr v8.16b, v8.16b, #4
tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
tbl v13.16b, {v19.16b},v9.16b
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
eor v9.16b, v9.16b, v8.16b
tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v11.16b, {v18.16b},v8.16b
tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
tbl v12.16b, {v18.16b},v9.16b
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v11.16b, v11.16b, v13.16b
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
eor v12.16b, v12.16b, v13.16b
tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v10.16b, {v18.16b},v11.16b
tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
tbl v11.16b, {v18.16b},v12.16b
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v10.16b, v10.16b, v9.16b
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
eor v11.16b, v11.16b, v8.16b
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
cbnz w8, .Lenc_2x_loop

// middle of last round
add x10, x11, #0x80
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
tbl v12.16b, {v22.16b}, v10.16b
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
tbl v8.16b, {v23.16b}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
tbl v1.16b, {v8.16b},v1.16b
ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type _vpaes_decrypt_preheat,%function
.align 4
_vpaes_decrypt_preheat:
adrp x10, .Lk_inv
add x10, x10, #:lo12:.Lk_inv
movi v17.16b, #0x0f
adrp x11, .Lk_dipt
add x11, x11, #:lo12:.Lk_dipt
ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
ret
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
// Decryption core
//
// Same API as encryption core.
//
.type _vpaes_decrypt_core,%function
.align 4
_vpaes_decrypt_core:
mov x9, x2
ldr w8, [x2,#240] // pull rounds

// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
adrp x10, .Lk_sr
add x10, x10, #:lo12:.Lk_sr
and x11, x11, #0x30 // and $0x30, %r11
add x11, x11, x10
adrp x10, .Lk_mc_forward+48
add x10, x10, #:lo12:.Lk_mc_forward+48

ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
b .Ldec_entry

.align 4
.Ldec_loop:
//
// Inverse mix columns
//
// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
sub w8, w8, #1 // sub $1,%rax # nr--

.Ldec_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
cbnz w8, .Ldec_loop

// middle of last round
// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
ret
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl vpaes_decrypt
.type vpaes_decrypt,%function
.align 4
vpaes_decrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0

ld1 {v7.16b}, [x0]
bl _vpaes_decrypt_preheat
bl _vpaes_decrypt_core
st1 {v0.16b}, [x1]

ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type _vpaes_decrypt_2x,%function
.align 4
_vpaes_decrypt_2x:
mov x9, x2
ldr w8, [x2,#240] // pull rounds

// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
eor x11, x11, #0x30 // xor $0x30, %r11
adrp x10, .Lk_sr
add x10, x10, #:lo12:.Lk_sr
and x11, x11, #0x30 // and $0x30, %r11
add x11, x11, x10
adrp x10, .Lk_mc_forward+48
add x10, x10, #:lo12:.Lk_mc_forward+48

ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
and v9.16b, v15.16b, v17.16b
ushr v8.16b, v15.16b, #4
tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
tbl v10.16b, {v20.16b},v9.16b
ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
tbl v8.16b, {v21.16b},v8.16b
eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
eor v10.16b, v10.16b, v16.16b
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
eor v8.16b, v8.16b, v10.16b
b .Ldec_2x_entry

.align 4
.Ldec_2x_loop:
//
// Inverse mix columns
//
// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
tbl v12.16b, {v24.16b}, v10.16b
tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
tbl v9.16b, {v25.16b}, v11.16b
eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
eor v8.16b, v12.16b, v16.16b
// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
tbl v12.16b, {v26.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
tbl v9.16b, {v27.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
tbl v12.16b, {v28.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
tbl v9.16b, {v29.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
tbl v12.16b, {v30.16b}, v10.16b
tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
tbl v8.16b, {v8.16b},v5.16b
tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
tbl v9.16b, {v31.16b}, v11.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
eor v8.16b, v8.16b, v12.16b
ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
eor v8.16b, v8.16b, v9.16b
sub w8, w8, #1 // sub $1,%rax # nr--

.Ldec_2x_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
and v9.16b, v8.16b, v17.16b
ushr v8.16b, v8.16b, #4
tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
tbl v10.16b, {v19.16b},v9.16b
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
eor v9.16b, v9.16b, v8.16b
tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v11.16b, {v18.16b},v8.16b
tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
tbl v12.16b, {v18.16b},v9.16b
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v11.16b, v11.16b, v10.16b
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
eor v12.16b, v12.16b, v10.16b
tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v10.16b, {v18.16b},v11.16b
tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
tbl v11.16b, {v18.16b},v12.16b
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v10.16b, v10.16b, v9.16b
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
eor v11.16b, v11.16b, v8.16b
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
cbnz w8, .Ldec_2x_loop

// middle of last round
// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
tbl v12.16b, {v22.16b}, v10.16b
// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
tbl v9.16b, {v23.16b}, v11.16b
ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
eor v8.16b, v9.16b, v12.16b
tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
tbl v1.16b, {v8.16b},v2.16b
ret
.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x

////////////////////////////////////////////////////////
//                                                      //
//                  AES key schedule                    //
//                                                      //
////////////////////////////////////////////////////////
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
adrp x10, .Lk_inv
add x10, x10, #:lo12:.Lk_inv
movi v16.16b, #0x5b // .Lk_s63
adrp x11, .Lk_sb1
add x11, x11, #:lo12:.Lk_sb1
movi v17.16b, #0x0f // .Lk_s0F
ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
adrp x10, .Lk_dksd
add x10, x10, #:lo12:.Lk_dksd
ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
adrp x11, .Lk_mc_forward
add x11, x11, #:lo12:.Lk_mc_forward
ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
ld1 {v8.2d}, [x10] // .Lk_rcon
ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp,#-16]!
add x29,sp,#0

bl _vpaes_key_preheat // load the tables

ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)

// input transform
mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
bl _vpaes_schedule_transform
mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7

adrp x10, .Lk_sr
add x10, x10, #:lo12:.Lk_sr
add x8, x8, x10
cbnz w3, .Lschedule_am_decrypting

// encrypting, output zeroth round key after transform
st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
b .Lschedule_go

.Lschedule_am_decrypting:
// decrypting, output zeroth round key after shiftrows
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
eor x8, x8, #0x30 // xor $0x30, %r8

.Lschedule_go:
cmp w1, #192 // cmp $192, %esi
b.hi .Lschedule_256
b.eq .Lschedule_192
// 128: fall through

//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
mov x0, #10 // mov $10, %esi

.Loop_schedule_128:
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // write output
b .Loop_schedule_128

//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
.align 4
.Lschedule_192:
sub x0, x0, #8
ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
bl _vpaes_schedule_transform // input transform
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
mov x0, #4 // mov $4, %esi

.Loop_schedule_192:
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_round
ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
bl _vpaes_schedule_mangle // save key n
bl _vpaes_schedule_192_smear
bl _vpaes_schedule_mangle // save key n+1
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // save key n+2
bl _vpaes_schedule_192_smear
b .Loop_schedule_192

//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
.align 4
.Lschedule_256:
ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov x0, #7 // mov $7, %esi

.Loop_schedule_256:
sub x0, x0, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

// high round
bl _vpaes_schedule_round
cbz x0, .Lschedule_mangle_last
bl _vpaes_schedule_mangle

// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
movi v4.16b, #0
mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7

b .Loop_schedule_256

//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.align 4
.Lschedule_mangle_last:
// schedule last round key from xmm0
adrp x11, .Lk_deskew
add x11, x11, #:lo12:.Lk_deskew
cbnz w3, .Lschedule_mangle_last_dec

// encrypting
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
adrp x11, .Lk_opt
add x11, x11, #:lo12:.Lk_opt
add x2, x2, #32 // add $32, %rdx
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
ld1 {v20.2d,v21.2d}, [x11] // reload constants
sub x2, x2, #16 // add $-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key

// cleanup
eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
ldp x29, x30, [sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
// %xmm7: high side, b a x y
// %xmm6: low side, d c 0 0
// %xmm13: 0
//
// Outputs:
// %xmm6: b+c+d b+c 0 0
// %xmm0: b+c+d b+c b a
//
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
movi v1.16b, #0
dup v0.4s, v7.s[3]
ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
// extract rcon from xmm8
movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7

// rotate
dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0

// fall through...

// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
// smear xmm7
ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4

// subbytes
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

// add in smeared stuff
eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
// vmovdqa (%r11), %xmm2 # lo
tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
// vmovdqa 16(%r11), %xmm1 # hi
tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
// xor with 0x63
// multiply by circulant 0,1,1,1
// apply shiftrows transform
//
// On decrypt,
// xor with 0x63
// multiply by "inverse mixcolumns" circulant E,B,D,9
// deskew
// apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
// vmovdqa .Lk_mc_forward(%rip),%xmm5
cbnz w3, .Lschedule_mangle_dec

// encrypting
eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
add x2, x2, #16 // add $16, %rdx
tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3

b .Lschedule_mangle_both
.align 4
.Lschedule_mangle_dec:
// inverse mix columns
// lea .Lk_dksd(%rip),%r11
ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo

// vmovdqa 0x00(%r11), %xmm2
tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
// vmovdqa 0x10(%r11), %xmm3
tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

// vmovdqa 0x20(%r11), %xmm2
tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
// vmovdqa 0x30(%r11), %xmm3
tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

// vmovdqa 0x40(%r11), %xmm2
tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
// vmovdqa 0x50(%r11), %xmm3
tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3

// vmovdqa 0x60(%r11), %xmm2
tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
// vmovdqa 0x70(%r11), %xmm4
tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3

sub x2, x2, #16 // add $-16, %rdx

.Lschedule_mangle_both:
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
add x8, x8, #64-16 // add $-16, %r8
and x8, x8, #~(1<<6) // and $0x30, %r8
st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

lsr w9, w1, #5 // shr $5,%eax
add w9, w9, #5 // $5,%eax
str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

mov w3, #0 // mov $0,%ecx
mov x8, #0x30 // mov $0x30,%r8d
bl _vpaes_schedule_core
eor x0, x0, x0

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
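
//
// Worked example of the round-count arithmetic above (the shr #5 then
// add #5): 128-bit keys store 128/32 + 5 = 9, 192-bit store 11 and
// 256-bit store 13 at offset 240.  That count is the number of middle
// rounds _vpaes_encrypt_core runs before its final round.
//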

.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,%function
.align 4
vpaes_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so

lsr w9, w1, #5 // shr $5,%eax
add w9, w9, #5 // $5,%eax
str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
lsl w9, w9, #4 // shl $4,%eax
add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
add x2, x2, x9

mov w3, #1 // mov $1,%ecx
lsr w8, w1, #1 // shr $1,%r8d
and x8, x8, #32 // and $32,%r8d
eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
bl _vpaes_schedule_core

ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,%function
.align 4
vpaes_cbc_encrypt:
AARCH64_SIGN_LINK_REGISTER
cbz x2, .Lcbc_abort
cmp w5, #0 // check direction
b.eq vpaes_cbc_decrypt

stp x29,x30,[sp,#-16]!
add x29,sp,#0

mov x17, x2 // reassign
mov x2, x3 // reassign

ld1 {v0.16b}, [x4] // load ivec
bl _vpaes_encrypt_preheat
b .Lcbc_enc_loop

.align 4
.Lcbc_enc_loop:
ld1 {v7.16b}, [x0],#16 // load input
eor v7.16b, v7.16b, v0.16b // xor with ivec
bl _vpaes_encrypt_core
st1 {v0.16b}, [x1],#16 // save output
subs x17, x17, #16
b.hi .Lcbc_enc_loop

st1 {v0.16b}, [x4] // write ivec

ldp x29,x30,[sp],#16
.Lcbc_abort:
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type vpaes_cbc_decrypt,%function
.align 4
vpaes_cbc_decrypt:
// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
// only from vpaes_cbc_encrypt which has already signed the return address.
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!

mov x17, x2 // reassign
mov x2, x3 // reassign
ld1 {v6.16b}, [x4] // load ivec
bl _vpaes_decrypt_preheat
tst x17, #16
b.eq .Lcbc_dec_loop2x

ld1 {v7.16b}, [x0], #16 // load input
bl _vpaes_decrypt_core
eor v0.16b, v0.16b, v6.16b // xor with ivec
orr v6.16b, v7.16b, v7.16b // next ivec value
st1 {v0.16b}, [x1], #16
subs x17, x17, #16
b.ls .Lcbc_dec_done

.align 4
.Lcbc_dec_loop2x:
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_decrypt_2x
eor v0.16b, v0.16b, v6.16b // xor with ivec
eor v1.16b, v1.16b, v14.16b
orr v6.16b, v15.16b, v15.16b
st1 {v0.16b,v1.16b}, [x1], #32
subs x17, x17, #32
b.hi .Lcbc_dec_loop2x

.Lcbc_dec_done:
st1 {v6.16b}, [x4]

ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
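
//
// vpaes_cbc_encrypt above handles both directions: a zero final
// argument branches to vpaes_cbc_decrypt, anything else encrypts, and
// the IV buffer is updated in place.  Sketch of a caller, assuming the
// AES_cbc_encrypt-style prototype (not declared in this file):
//
//	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//	                       size_t length, const AES_KEY *key,
//	                       unsigned char *ivec, const int enc);
//
//	vpaes_set_encrypt_key(key_bytes, 256, &ks);
//	vpaes_cbc_encrypt(pt, ct, len, &ks, iv, 1);	// encrypt
//	vpaes_set_decrypt_key(key_bytes, 256, &ks);
//	vpaes_cbc_encrypt(ct, pt, len, &ks, iv, 0);	// decrypt
//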

.globl vpaes_ecb_encrypt
.type vpaes_ecb_encrypt,%function
.align 4
vpaes_ecb_encrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!

mov x17, x2
mov x2, x3
bl _vpaes_encrypt_preheat
tst x17, #16
b.eq .Lecb_enc_loop

ld1 {v7.16b}, [x0],#16
bl _vpaes_encrypt_core
st1 {v0.16b}, [x1],#16
subs x17, x17, #16
b.ls .Lecb_enc_done

.align 4
.Lecb_enc_loop:
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_encrypt_2x
st1 {v0.16b,v1.16b}, [x1], #32
subs x17, x17, #32
b.hi .Lecb_enc_loop

.Lecb_enc_done:
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl vpaes_ecb_decrypt
.type vpaes_ecb_decrypt,%function
.align 4
vpaes_ecb_decrypt:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!

mov x17, x2
mov x2, x3
bl _vpaes_decrypt_preheat
tst x17, #16
b.eq .Lecb_dec_loop

ld1 {v7.16b}, [x0],#16
bl _vpaes_decrypt_core
st1 {v0.16b}, [x1],#16
subs x17, x17, #16
b.ls .Lecb_dec_done

.align 4
.Lecb_dec_loop:
ld1 {v14.16b,v15.16b}, [x0], #32
bl _vpaes_decrypt_2x
st1 {v0.16b,v1.16b}, [x1], #32
subs x17, x17, #32
b.hi .Lecb_dec_loop

.Lecb_dec_done:
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt