Path: blob/master/arch/x86/crypto/aesni-intel_asm.S
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <[email protected]>
 *            Vinodh Gopal <[email protected]>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk ([email protected])
 *             Aidan O'Mahony ([email protected])
 *             Adrian Hoban <[email protected]>
 *             James Guilford ([email protected])
 *             Gabriele Paoloni <[email protected]>
 *             Tadeusz Struk ([email protected])
 *             Wajdi Feghali ([email protected])
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#ifdef __x86_64__
.data
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif


#define STATE1  %xmm0
#define STATE2  %xmm4
#define STATE3  %xmm5
#define STATE4  %xmm6
#define STATE   STATE1
#define IN1     %xmm1
#define IN2     %xmm7
#define IN3     %xmm8
#define IN4     %xmm9
#define IN      IN1
#define KEY     %xmm2
#define IV      %xmm3

#define BSWAP_MASK %xmm10
#define CTR        %xmm11
#define INC        %xmm12

#ifdef __x86_64__
#define AREG     %rax
#define KEYP     %rdi
#define OUTP     %rsi
#define UKEYP    OUTP
#define INP      %rdx
#define LEN      %rcx
#define IVP      %r8
#define KLEN     %r9d
#define T1       %r10
#define TKEYP    T1
#define T2       %r11
#define TCTR_LOW T2
#else
#define AREG  %eax
#define KEYP  %edi
#define OUTP  AREG
#define UKEYP OUTP
#define INP   %edx
#define LEN   %esi
#define IVP   %ebp
#define KLEN  %ebx
#define T1    %ecx
#define TKEYP T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 *
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        movdqa    \GH, \TMP1
        pshufd    $78, \GH, \TMP2
        pshufd    $78, \HK, \TMP3
        pxor      \GH, \TMP2            # TMP2 = a1+a0
        pxor      \HK, \TMP3            # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a0+a1)*(b1+b0)
        pxor      \GH, \TMP2
        pxor      \TMP1, \TMP2          # TMP2 = a0*b1 + a1*b0 (middle term)
        movdqa    \TMP2, \TMP3
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP3, \GH
        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction

        movdqa    \GH, \TMP2
        movdqa    \GH, \TMP3
        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        pslld     $31, \TMP2            # packed left shift <<31
        pslld     $30, \TMP3            # packed left shift <<30
        pslld     $25, \TMP4            # packed left shift <<25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP4, \TMP2
        movdqa    \TMP2, \TMP5
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs
        pxor      \TMP2, \GH

        # second phase of the reduction

        movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform
                                        # independent shifts
        movdqa    \GH,\TMP3
        movdqa    \GH,\TMP4
        psrld     $1,\TMP2              # packed right shift >>1
        psrld     $2,\TMP3              # packed right shift >>2
        psrld     $7,\TMP4              # packed right shift >>7
        pxor      \TMP3,\TMP2           # xor the shifted versions
        pxor      \TMP4,\TMP2
        pxor      \TMP5, \TMP2
        pxor      \TMP2, \GH
        pxor      \TMP1, \GH            # result is in GH
.endm
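
/*
 * For reference, a minimal C-style sketch of the Karatsuba split used by
 * GHASH_MUL above (illustrative only; clmul64() stands in for a single
 * 64x64 PCLMULQDQ carry-less multiply and is not a real kernel helper):
 *
 *      // a = a1*x^64 + a0, b = b1*x^64 + b0, arithmetic in GF(2)[x]
 *      hi  = clmul64(a1, b1);                  // PCLMULQDQ 0x11
 *      lo  = clmul64(a0, b0);                  // PCLMULQDQ 0x00
 *      mid = clmul64(a1 ^ a0, b1 ^ b0);        // PCLMULQDQ on XORed halves
 *      mid ^= hi ^ lo;                         // mid = a1*b0 + a0*b1
 *      // the 256-bit product is hi:lo with mid XORed in at bit offset
 *      // 64, then reduced mod x^128 + x^127 + x^126 + x^121 + 1 by the
 *      // two-phase shift sequence shown in the macro.
 */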

/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        mov        arg7, %r10           # %r10 = AAD
        mov        arg8, %r12           # %r12 = aadLen
        mov        %r12, %r11
        pxor       %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
        movd       (%r10), \TMP1
        pslldq     $12, \TMP1
        psrldq     $4, %xmm\i
        pxor       \TMP1, %xmm\i
        add        $4, %r10
        sub        $4, %r12
        jne        _get_AAD_loop\num_initial_blocks\operation
        cmp        $16, %r11
        je         _get_AAD_loop2_done\num_initial_blocks\operation
        mov        $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
        psrldq     $4, %xmm\i
        sub        $4, %r12
        cmp        %r11, %r12
        jne        _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\i       # byte-reflect the AAD data

        xor        %r11, %r11           # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

        mov        %arg5, %rax          # %rax = *Y0
        movdqu     (%rax), \XMM0        # XMM0 = Y0
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\index   # perform a 16 byte swap

.endr
.irpc index, \i_seq
        pxor       16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
        movaps     0x10(%rdi), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 1
.endr
.irpc index, \i_seq
        movaps     0x20(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 2
.endr
.irpc index, \i_seq
        movaps     0x30(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 3
.endr
.irpc index, \i_seq
        movaps     0x40(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 4
.endr
.irpc index, \i_seq
        movaps     0x50(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 5
.endr
.irpc index, \i_seq
        movaps     0x60(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 6
.endr
.irpc index, \i_seq
        movaps     0x70(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 7
.endr
.irpc index, \i_seq
        movaps     0x80(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 8
.endr
.irpc index, \i_seq
        movaps     0x90(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 9
.endr
.irpc index, \i_seq
        movaps     0xa0(%arg1), \TMP1
        AESENCLAST \TMP1, %xmm\index    # Round 10
.endr
.irpc index, \i_seq
        movdqu     (%arg3, %r11, 1), \TMP1
        pxor       \TMP1, %xmm\index
        movdqu     %xmm\index, (%arg2, %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add        $16, %r11

        movdqa     \TMP1, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\index

        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
        GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp        $64, %r13
        jl         _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 */
        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM1
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM2
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM3
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM4
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

        pxor       16*0(%arg1), \XMM1
        pxor       16*0(%arg1), \XMM2
        pxor       16*0(%arg1), \XMM3
        pxor       16*0(%arg1), \XMM4
        movdqa     \TMP3, \TMP5
        pshufd     $78, \TMP3, \TMP1
        pxor       \TMP3, \TMP1
        movdqa     \TMP1, HashKey_k(%rsp)
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^2<<1 (mod poly)
        movdqa     \TMP5, HashKey_2(%rsp)
        # HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps     0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^3<<1 (mod poly)
        movdqa     \TMP5, HashKey_3(%rsp)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps     0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^4<<1 (mod poly)
        movdqa     \TMP5, HashKey_4(%rsp)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_4_k(%rsp)
        movaps     0xa0(%arg1), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu     16*0(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM1
        movdqu     \XMM1, 16*0(%arg2, %r11, 1)
        movdqa     \TMP1, \XMM1
        movdqu     16*1(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM2
        movdqu     \XMM2, 16*1(%arg2, %r11, 1)
        movdqa     \TMP1, \XMM2
        movdqu     16*2(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM3
        movdqu     \XMM3, 16*2(%arg2, %r11, 1)
        movdqa     \TMP1, \XMM3
        movdqu     16*3(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM4
        movdqu     \XMM4, 16*3(%arg2, %r11, 1)
        movdqa     \TMP1, \XMM4
        add        $64, %r11
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        pxor       \XMMDst, \XMM1
        # combine GHASHed value with the corresponding ciphertext
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

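/*
 * Worked example for the block-count split above (illustrative only):
 * for a = 100 bytes of ciphertext, b = floor(100/16) = 6 full blocks,
 * so num_initial_blocks = 6 mod 4 = 2; two blocks are handled by this
 * macro and the remaining four by the 4-way parallel loop. In C:
 *
 *      unsigned long full_blocks = len / 16;           // b
 *      unsigned int  initial     = full_blocks % 4;    // num_initial_blocks
 */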

/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        mov        arg7, %r10           # %r10 = AAD
        mov        arg8, %r12           # %r12 = aadLen
        mov        %r12, %r11
        pxor       %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
        movd       (%r10), \TMP1
        pslldq     $12, \TMP1
        psrldq     $4, %xmm\i
        pxor       \TMP1, %xmm\i
        add        $4, %r10
        sub        $4, %r12
        jne        _get_AAD_loop\num_initial_blocks\operation
        cmp        $16, %r11
        je         _get_AAD_loop2_done\num_initial_blocks\operation
        mov        $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
        psrldq     $4, %xmm\i
        sub        $4, %r12
        cmp        %r11, %r12
        jne        _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\i       # byte-reflect the AAD data

        xor        %r11, %r11           # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

        mov        %arg5, %rax          # %rax = *Y0
        movdqu     (%rax), \XMM0        # XMM0 = Y0
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, %xmm\index
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\index   # perform a 16 byte swap

.endr
.irpc index, \i_seq
        pxor       16*0(%arg1), %xmm\index
.endr
.irpc index, \i_seq
        movaps     0x10(%rdi), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 1
.endr
.irpc index, \i_seq
        movaps     0x20(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 2
.endr
.irpc index, \i_seq
        movaps     0x30(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 3
.endr
.irpc index, \i_seq
        movaps     0x40(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 4
.endr
.irpc index, \i_seq
        movaps     0x50(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 5
.endr
.irpc index, \i_seq
        movaps     0x60(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 6
.endr
.irpc index, \i_seq
        movaps     0x70(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 7
.endr
.irpc index, \i_seq
        movaps     0x80(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 8
.endr
.irpc index, \i_seq
        movaps     0x90(%arg1), \TMP1
        AESENC     \TMP1, %xmm\index    # Round 9
.endr
.irpc index, \i_seq
        movaps     0xa0(%arg1), \TMP1
        AESENCLAST \TMP1, %xmm\index    # Round 10
.endr
.irpc index, \i_seq
        movdqu     (%arg3, %r11, 1), \TMP1
        pxor       \TMP1, %xmm\index
        movdqu     %xmm\index, (%arg2, %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add        $16, %r11

        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, %xmm\index

        # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks

.if \i == 5
        pxor       %xmm5, %xmm6
        GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
        pxor       %xmm6, %xmm7
        GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
        pxor       %xmm7, %xmm8
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
        cmp        $64, %r13
        jl         _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 *
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 */
        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM1
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM2
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM3
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap

        paddd      ONE(%rip), \XMM0     # INCR Y0
        movdqa     \XMM0, \XMM4
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

        pxor       16*0(%arg1), \XMM1
        pxor       16*0(%arg1), \XMM2
        pxor       16*0(%arg1), \XMM3
        pxor       16*0(%arg1), \XMM4
        movdqa     \TMP3, \TMP5
        pshufd     $78, \TMP3, \TMP1
        pxor       \TMP3, \TMP1
        movdqa     \TMP1, HashKey_k(%rsp)
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^2<<1 (mod poly)
        movdqa     \TMP5, HashKey_2(%rsp)
        # HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps     0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^3<<1 (mod poly)
        movdqa     \TMP5, HashKey_3(%rsp)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps     0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
.endr
        GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
        # TMP5 = HashKey^4<<1 (mod poly)
        movdqa     \TMP5, HashKey_4(%rsp)
        pshufd     $78, \TMP5, \TMP1
        pxor       \TMP5, \TMP1
        movdqa     \TMP1, HashKey_4_k(%rsp)
        movaps     0xa0(%arg1), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu     16*0(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM1
        movdqu     16*1(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM2
        movdqu     16*2(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM3
        movdqu     16*3(%arg3, %r11, 1), \TMP1
        pxor       \TMP1, \XMM4
        movdqu     \XMM1, 16*0(%arg2, %r11, 1)
        movdqu     \XMM2, 16*1(%arg2, %r11, 1)
        movdqu     \XMM3, 16*2(%arg2, %r11, 1)
        movdqu     \XMM4, 16*3(%arg2, %r11, 1)

        add        $64, %r11
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM1        # perform a 16 byte swap
        pxor       \XMMDst, \XMM1
        # combine GHASHed value with the corresponding ciphertext
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM2        # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
        movdqa     SHUF_MASK(%rip), %xmm14
        PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa     \XMM1, \XMM5
        movdqa     \XMM2, \XMM6
        movdqa     \XMM3, \XMM7
        movdqa     \XMM4, \XMM8

        movdqa     SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa     \XMM5, \TMP4
        pshufd     $78, \XMM5, \TMP6
        pxor       \XMM5, \TMP6
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     HashKey_4(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa     \XMM0, \XMM1
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM2
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM3
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ  0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor       (%arg1), \XMM1
        pxor       (%arg1), \XMM2
        pxor       (%arg1), \XMM3
        pxor       (%arg1), \XMM4
        movdqa     HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps     0x10(%arg1), \TMP1
        AESENC     \TMP1, \XMM1         # Round 1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
        movaps     0x20(%arg1), \TMP1
        AESENC     \TMP1, \XMM1         # Round 2
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
        movdqa     \XMM6, \TMP1
        pshufd     $78, \XMM6, \TMP2
        pxor       \XMM6, \TMP2
        movdqa     HashKey_3(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps     0x30(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 3
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps     0x40(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 4
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        movdqa     HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps     0x50(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 5
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        pxor       \TMP1, \TMP4
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor       \XMM6, \XMM5
        pxor       \TMP2, \TMP6
        movdqa     \XMM7, \TMP1
        pshufd     $78, \XMM7, \TMP2
        pxor       \XMM7, \TMP2
        movdqa     HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps     0x60(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 6
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps     0x70(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 7
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        movdqa     HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps     0x80(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 8
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        pxor       \TMP1, \TMP4
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor       \XMM7, \XMM5
        pxor       \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa     \XMM8, \TMP1
        pshufd     $78, \XMM8, \TMP2
        pxor       \XMM8, \TMP2
        movdqa     HashKey(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps     0x90(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 9
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        movaps     0xa0(%arg1), \TMP3
        AESENCLAST \TMP3, \XMM1         # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa     HashKey_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu     (%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM1         # Ciphertext/Plaintext XOR EK
        movdqu     16(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM2         # Ciphertext/Plaintext XOR EK
        movdqu     32(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM3         # Ciphertext/Plaintext XOR EK
        movdqu     48(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM4         # Ciphertext/Plaintext XOR EK
        movdqu     \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
        movdqu     \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
        movdqu     \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
        movdqu     \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor       \TMP4, \TMP1
        pxor       \XMM8, \XMM5
        pxor       \TMP6, \TMP2
        pxor       \TMP1, \TMP2
        pxor       \XMM5, \TMP2
        movdqa     \TMP2, \TMP3
        pslldq     $8, \TMP3            # left shift TMP3 2 DWs
        psrldq     $8, \TMP2            # right shift TMP2 2 DWs
        pxor       \TMP3, \XMM5
        pxor       \TMP2, \TMP1         # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa     \XMM5, \TMP2
        movdqa     \XMM5, \TMP3
        movdqa     \XMM5, \TMP4
        # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld      $31, \TMP2           # packed left shift << 31
        pslld      $30, \TMP3           # packed left shift << 30
        pslld      $25, \TMP4           # packed left shift << 25
        pxor       \TMP3, \TMP2         # xor the shifted versions
        pxor       \TMP4, \TMP2
        movdqa     \TMP2, \TMP5
        psrldq     $4, \TMP5            # right shift T5 1 DW
        pslldq     $12, \TMP2           # left shift T2 3 DWs
        pxor       \TMP2, \XMM5

        # second phase of reduction

        movdqa     \XMM5,\TMP2          # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa     \XMM5,\TMP3
        movdqa     \XMM5,\TMP4
        psrld      $1, \TMP2            # packed right shift >>1
        psrld      $2, \TMP3            # packed right shift >>2
        psrld      $7, \TMP4            # packed right shift >>7
        pxor       \TMP3,\TMP2          # xor the shifted versions
        pxor       \TMP4,\TMP2
        pxor       \TMP5, \TMP2
        pxor       \TMP2, \XMM5
        pxor       \TMP1, \XMM5         # result is in XMM5

        pxor       \XMM5, \XMM1
.endm

/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

        movdqa     \XMM1, \XMM5
        movdqa     \XMM2, \XMM6
        movdqa     \XMM3, \XMM7
        movdqa     \XMM4, \XMM8

        movdqa     SHUF_MASK(%rip), %xmm15
        # multiply TMP5 * HashKey using karatsuba

        movdqa     \XMM5, \TMP4
        pshufd     $78, \XMM5, \TMP6
        pxor       \XMM5, \TMP6
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     HashKey_4(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP4   # TMP4 = a1*b1
        movdqa     \XMM0, \XMM1
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM2
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM3
        paddd      ONE(%rip), \XMM0     # INCR CNT
        movdqa     \XMM0, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PCLMULQDQ  0x00, \TMP5, \XMM5   # XMM5 = a0*b0
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor       (%arg1), \XMM1
        pxor       (%arg1), \XMM2
        pxor       (%arg1), \XMM3
        pxor       (%arg1), \XMM4
        movdqa     HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP6   # TMP6 = (a1+a0)*(b1+b0)
        movaps     0x10(%arg1), \TMP1
        AESENC     \TMP1, \XMM1         # Round 1
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
        movaps     0x20(%arg1), \TMP1
        AESENC     \TMP1, \XMM1         # Round 2
        AESENC     \TMP1, \XMM2
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
        movdqa     \XMM6, \TMP1
        pshufd     $78, \XMM6, \TMP2
        pxor       \XMM6, \TMP2
        movdqa     HashKey_3(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1 * b1
        movaps     0x30(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 3
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM6   # XMM6 = a0*b0
        movaps     0x40(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 4
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        movdqa     HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps     0x50(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 5
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        pxor       \TMP1, \TMP4
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor       \XMM6, \XMM5
        pxor       \TMP2, \TMP6
        movdqa     \XMM7, \TMP1
        pshufd     $78, \XMM7, \TMP2
        pxor       \XMM7, \TMP2
        movdqa     HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using karatsuba

        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps     0x60(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 6
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM7   # XMM7 = a0*b0
        movaps     0x70(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 7
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        movdqa     HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movaps     0x80(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 8
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        pxor       \TMP1, \TMP4
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pxor       \XMM7, \XMM5
        pxor       \TMP2, \TMP6

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        movdqa     \XMM8, \TMP1
        pshufd     $78, \XMM8, \TMP2
        pxor       \XMM8, \TMP2
        movdqa     HashKey(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        movaps     0x90(%arg1), \TMP3
        AESENC     \TMP3, \XMM1         # Round 9
        AESENC     \TMP3, \XMM2
        AESENC     \TMP3, \XMM3
        AESENC     \TMP3, \XMM4
        PCLMULQDQ  0x00, \TMP5, \XMM8   # XMM8 = a0*b0
        movaps     0xa0(%arg1), \TMP3
        AESENCLAST \TMP3, \XMM1         # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa     HashKey_k(%rsp), \TMP5
        PCLMULQDQ  0x00, \TMP5, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqu     (%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM1         # Ciphertext/Plaintext XOR EK
        movdqu     \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
        movdqa     \TMP3, \XMM1
        movdqu     16(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM2         # Ciphertext/Plaintext XOR EK
        movdqu     \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa     \TMP3, \XMM2
        movdqu     32(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM3         # Ciphertext/Plaintext XOR EK
        movdqu     \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa     \TMP3, \XMM3
        movdqu     48(%arg3,%r11,1), \TMP3
        pxor       \TMP3, \XMM4         # Ciphertext/Plaintext XOR EK
        movdqu     \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
        movdqa     \TMP3, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap

        pxor       \TMP4, \TMP1
        pxor       \XMM8, \XMM5
        pxor       \TMP6, \TMP2
        pxor       \TMP1, \TMP2
        pxor       \XMM5, \TMP2
        movdqa     \TMP2, \TMP3
        pslldq     $8, \TMP3            # left shift TMP3 2 DWs
        psrldq     $8, \TMP2            # right shift TMP2 2 DWs
        pxor       \TMP3, \XMM5
        pxor       \TMP2, \TMP1         # accumulate the results in TMP1:XMM5

        # first phase of reduction

        movdqa     \XMM5, \TMP2
        movdqa     \XMM5, \TMP3
        movdqa     \XMM5, \TMP4
        # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld      $31, \TMP2           # packed left shift << 31
        pslld      $30, \TMP3           # packed left shift << 30
        pslld      $25, \TMP4           # packed left shift << 25
        pxor       \TMP3, \TMP2         # xor the shifted versions
        pxor       \TMP4, \TMP2
        movdqa     \TMP2, \TMP5
        psrldq     $4, \TMP5            # right shift T5 1 DW
        pslldq     $12, \TMP2           # left shift T2 3 DWs
        pxor       \TMP2, \XMM5

        # second phase of reduction

        movdqa     \XMM5,\TMP2          # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        movdqa     \XMM5,\TMP3
        movdqa     \XMM5,\TMP4
        psrld      $1, \TMP2            # packed right shift >>1
        psrld      $2, \TMP3            # packed right shift >>2
        psrld      $7, \TMP4            # packed right shift >>7
        pxor       \TMP3,\TMP2          # xor the shifted versions
        pxor       \TMP4,\TMP2
        pxor       \TMP5, \TMP2
        pxor       \TMP2, \XMM5
        pxor       \TMP1, \XMM5         # result is in XMM5

        pxor       \XMM5, \XMM1
.endm

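/*
 * Note: the _DEC variant above differs from the _ENC variant in one
 * structural way: after each XOR it saves the original ciphertext block
 * back into \XMM1..\XMM4 (the movdqa \TMP3, \XMMn instructions), because
 * GHASH must always be computed over the ciphertext, which on the
 * decrypt path is the input rather than the output.
 */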
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply TMP6 * HashKey (using Karatsuba)

        movdqa     \XMM1, \TMP6
        pshufd     $78, \XMM1, \TMP2
        pxor       \XMM1, \TMP2
        movdqa     HashKey_4(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP6   # TMP6 = a1*b1
        PCLMULQDQ  0x00, \TMP5, \XMM1   # XMM1 = a0*b0
        movdqa     HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ  0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        movdqa     \XMM1, \XMMDst
        movdqa     \TMP2, \XMM1         # result in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa     \XMM2, \TMP1
        pshufd     $78, \XMM2, \TMP2
        pxor       \XMM2, \TMP2
        movdqa     HashKey_3(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ  0x00, \TMP5, \XMM2   # XMM2 = a0*b0
        movdqa     HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ  0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor       \TMP1, \TMP6
        pxor       \XMM2, \XMMDst
        pxor       \TMP2, \XMM1
        # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)

        movdqa     \XMM3, \TMP1
        pshufd     $78, \XMM3, \TMP2
        pxor       \XMM3, \TMP2
        movdqa     HashKey_2(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ  0x00, \TMP5, \XMM3   # XMM3 = a0*b0
        movdqa     HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ  0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor       \TMP1, \TMP6
        pxor       \XMM3, \XMMDst
        pxor       \TMP2, \XMM1         # results accumulated in TMP6, XMMDst, XMM1

        # Multiply TMP1 * HashKey (using Karatsuba)
        movdqa     \XMM4, \TMP1
        pshufd     $78, \XMM4, \TMP2
        pxor       \XMM4, \TMP2
        movdqa     HashKey(%rsp), \TMP5
        PCLMULQDQ  0x11, \TMP5, \TMP1   # TMP1 = a1*b1
        PCLMULQDQ  0x00, \TMP5, \XMM4   # XMM4 = a0*b0
        movdqa     HashKey_k(%rsp), \TMP4
        PCLMULQDQ  0x00, \TMP4, \TMP2   # TMP2 = (a1+a0)*(b1+b0)
        pxor       \TMP1, \TMP6
        pxor       \XMM4, \XMMDst
        pxor       \XMM1, \TMP2
        pxor       \TMP6, \TMP2
        pxor       \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
        movdqa     \TMP2, \TMP4
        pslldq     $8, \TMP4            # left shift TMP4 2 DWs
        psrldq     $8, \TMP2            # right shift TMP2 2 DWs
        pxor       \TMP4, \XMMDst
        pxor       \TMP2, \TMP6
        # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
        # first phase of the reduction
        movdqa     \XMMDst, \TMP2
        movdqa     \XMMDst, \TMP3
        movdqa     \XMMDst, \TMP4
        # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld      $31, \TMP2           # packed left shifting << 31
        pslld      $30, \TMP3           # packed left shifting << 30
        pslld      $25, \TMP4           # packed left shifting << 25
        pxor       \TMP3, \TMP2         # xor the shifted versions
        pxor       \TMP4, \TMP2
        movdqa     \TMP2, \TMP7
        psrldq     $4, \TMP7            # right shift TMP7 1 DW
        pslldq     $12, \TMP2           # left shift TMP2 3 DWs
        pxor       \TMP2, \XMMDst

        # second phase of the reduction
        movdqa     \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa     \XMMDst, \TMP3
        movdqa     \XMMDst, \TMP4
        psrld      $1, \TMP2            # packed right shift >> 1
        psrld      $2, \TMP3            # packed right shift >> 2
        psrld      $7, \TMP4            # packed right shift >> 7
        pxor       \TMP3, \TMP2         # xor the shifted versions
        pxor       \TMP4, \TMP2
        pxor       \TMP7, \TMP2
        pxor       \TMP2, \XMMDst
        pxor       \TMP6, \XMMDst       # reduced result is in XMMDst
.endm

/* Encryption of a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

        pxor       (%arg1), \XMM0
        movaps     16(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     32(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     48(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     64(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     80(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     96(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     112(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     128(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     144(%arg1), \TMP1
        AESENC     \TMP1, \XMM0
        movaps     160(%arg1), \TMP1
        AESENCLAST \TMP1, \XMM0
.endm


/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Plaintext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
*                                        // given authentication tag and only return the plaintext if they match.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
*                                        // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/

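/*
 * Illustrative C-level call of this routine (a sketch under the
 * assumptions in the header above, not the kernel API; the glue code in
 * aesni-intel_glue.c is the real caller). aesni_gcm_dec only computes
 * the tag; the comparison against the received tag is left to the
 * caller, e.g.:
 *
 *      u8 computed_tag[16];
 *      aesni_gcm_dec(aes_ctx, out, in, ciphertext_len, iv,
 *                    hash_subkey, aad, aad_len, computed_tag, 16);
 *      if (crypto_memneq(computed_tag, received_tag, 16))
 *              return -EBADMSG;        // authentication failed
 */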
ENTRY(aesni_gcm_dec)
        push       %r12
        push       %r13
        push       %r14
        mov        %rsp, %r14
/*
 * states of %xmm registers %xmm6:%xmm15 not saved
 * all %xmm registers are clobbered
 */
        sub        $VARIABLE_OFFSET, %rsp
        and        $~63, %rsp           # align rsp to 64 bytes
        mov        %arg6, %r12
        movdqu     (%r12), %xmm13       # %xmm13 = HashKey
        movdqa     SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13


        # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

        movdqa     %xmm13, %xmm2
        psllq      $1, %xmm13
        psrlq      $63, %xmm2
        movdqa     %xmm2, %xmm1
        pslldq     $8, %xmm2
        psrldq     $8, %xmm1
        por        %xmm2, %xmm13

        # Reduction

        pshufd     $0x24, %xmm1, %xmm2
        pcmpeqd    TWOONE(%rip), %xmm2
        pand       POLY(%rip), %xmm2
        pxor       %xmm2, %xmm13        # %xmm13 holds the HashKey<<1 (mod poly)


        # Decrypt first few blocks

        movdqa     %xmm13, HashKey(%rsp)        # store HashKey<<1 (mod poly)
        mov        %arg4, %r13          # save the number of bytes of plaintext/ciphertext
        and        $-16, %r13           # %r13 = %r13 - (%r13 mod 16)
        mov        %r13, %r12
        and        $(3<<4), %r12
        jz         _initial_num_blocks_is_0_decrypt
        cmp        $(2<<4), %r12
        jb         _initial_num_blocks_is_1_decrypt
        je         _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        sub        $48, %r13
        jmp        _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        sub        $32, %r13
        jmp        _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        sub        $16, %r13
        jmp        _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        cmp        $0, %r13
        je         _zero_cipher_left_decrypt
        sub        $64, %r13
        je         _four_cipher_left_decrypt
_decrypt_by_4:
        GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
        add        $64, %r11
        sub        $64, %r13
        jne        _decrypt_by_4
_four_cipher_left_decrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        mov        %arg4, %r13
        and        $15, %r13            # %r13 = arg4 (mod 16)
        je         _multiple_of_16_bytes_decrypt

        # Handle the last <16 byte block separately

        paddd      ONE(%rip), %xmm0     # increment CNT to get Yn
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # E(K, Yn)
        sub        $16, %r11
        add        %r13, %r11
        movdqu     (%arg3,%r11,1), %xmm1        # receive the last <16 byte block
        lea        SHIFT_MASK+16(%rip), %r12
        sub        %r13, %r12
        # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu     (%r12), %xmm2        # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # right shift 16-%r13 bytes

        movdqa     %xmm1, %xmm2
        pxor       %xmm1, %xmm0         # Ciphertext XOR E(K, Yn)
        movdqu     ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand       %xmm1, %xmm0         # mask out top 16-%r13 bytes of %xmm0
        pand       %xmm1, %xmm2
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm2

        pxor       %xmm2, %xmm8
        GHASH_MUL  %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub        %r13, %r11
        add        $16, %r11

        # output %r13 bytes
        MOVQ_R64_XMM %xmm0, %rax
        cmp        $8, %r13
        jle        _less_than_8_bytes_left_decrypt
        mov        %rax, (%arg2, %r11, 1)
        add        $8, %r11
        psrldq     $8, %xmm0
        MOVQ_R64_XMM %xmm0, %rax
        sub        $8, %r13
_less_than_8_bytes_left_decrypt:
        mov        %al, (%arg2, %r11, 1)
        add        $1, %r11
        shr        $8, %rax
        sub        $1, %r13
        jne        _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov        arg8, %r12           # %r12 = aadLen (number of bytes)
        shl        $3, %r12             # convert into number of bits
        movd       %r12d, %xmm15        # len(A) in %xmm15
        shl        $3, %arg4            # len(C) in bits (*128)
        MOVQ_R64_XMM %arg4, %xmm1
        pslldq     $8, %xmm15           # %xmm15 = len(A)||0x0000000000000000
        pxor       %xmm1, %xmm15        # %xmm15 = len(A)||len(C)
        pxor       %xmm15, %xmm8
        GHASH_MUL  %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8

        mov        %arg5, %rax          # %rax = *Y0
        movdqu     (%rax), %xmm0        # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # E(K, Y0)
        pxor       %xmm8, %xmm0
_return_T_decrypt:
        mov        arg9, %r10           # %r10 = authTag
        mov        arg10, %r11          # %r11 = auth_tag_len
        cmp        $16, %r11
        je         _T_16_decrypt
        cmp        $12, %r11
        je         _T_12_decrypt
_T_8_decrypt:
        MOVQ_R64_XMM %xmm0, %rax
        mov        %rax, (%r10)
        jmp        _return_T_done_decrypt
_T_12_decrypt:
        MOVQ_R64_XMM %xmm0, %rax
        mov        %rax, (%r10)
        psrldq     $8, %xmm0
        movd       %xmm0, %eax
        mov        %eax, 8(%r10)
        jmp        _return_T_done_decrypt
_T_16_decrypt:
        movdqu     %xmm0, (%r10)
_return_T_done_decrypt:
        mov        %r14, %rsp
        pop        %r14
        pop        %r13
        pop        %r12
        ret


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                          Salt  (From the SA)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |        (This is the sequence number from IPSec header)        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A1)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                  32-bit Sequence Number (A0)                  |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                            SPI (A2)                           |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*       AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
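/*
 * Illustrative C-level call (a sketch mirroring the decrypt example
 * above; the real caller is the glue code in aesni-intel_glue.c). Here
 * the tag is an output rather than something to verify:
 *
 *      u8 auth_tag[16];
 *      aesni_gcm_enc(aes_ctx, out, in, plaintext_len, iv,
 *                    hash_subkey, aad, aad_len, auth_tag, 16);
 *      // out now holds the ciphertext and auth_tag the 16-byte tag
 */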
ENTRY(aesni_gcm_enc)
        push       %r12
        push       %r13
        push       %r14
        mov        %rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
        sub        $VARIABLE_OFFSET, %rsp
        and        $~63, %rsp
        mov        %arg6, %r12
        movdqu     (%r12), %xmm13
        movdqa     SHUF_MASK(%rip), %xmm2
        PSHUFB_XMM %xmm2, %xmm13


        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa     %xmm13, %xmm2
        psllq      $1, %xmm13
        psrlq      $63, %xmm2
        movdqa     %xmm2, %xmm1
        pslldq     $8, %xmm2
        psrldq     $8, %xmm1
        por        %xmm2, %xmm13

        # reduce HashKey<<1

        pshufd     $0x24, %xmm1, %xmm2
        pcmpeqd    TWOONE(%rip), %xmm2
        pand       POLY(%rip), %xmm2
        pxor       %xmm2, %xmm13
        movdqa     %xmm13, HashKey(%rsp)        # %xmm13 holds HashKey<<1 (mod poly)
        mov        %arg4, %r13
        and        $-16, %r13
        mov        %r13, %r12

        # Encrypt first few blocks

        and        $(3<<4), %r12
        jz         _initial_num_blocks_is_0_encrypt
        cmp        $(2<<4), %r12
        jb         _initial_num_blocks_is_1_encrypt
        je         _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
        INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        sub        $48, %r13
        jmp        _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
        INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        sub        $32, %r13
        jmp        _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
        INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        sub        $16, %r13
        jmp        _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
        INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

        cmp        $0, %r13
        je         _zero_cipher_left_encrypt
        sub        $64, %r13
        je         _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
        GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        add        $64, %r11
        sub        $64, %r13
        jne        _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
        mov        %arg4, %r13
        and        $15, %r13            # %r13 = arg4 (mod 16)
        je         _multiple_of_16_bytes_encrypt

        # Handle the last <16 Byte block separately
        paddd      ONE(%rip), %xmm0     # INCR CNT to get Yn
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0


        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1       # Encrypt(K, Yn)
        sub        $16, %r11
        add        %r13, %r11
        movdqu     (%arg3,%r11,1), %xmm1        # receive the last <16 byte blocks
        lea        SHIFT_MASK+16(%rip), %r12
        sub        %r13, %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu     (%r12), %xmm2        # get the appropriate shuffle mask
        PSHUFB_XMM %xmm2, %xmm1         # shift right 16-r13 bytes
        pxor       %xmm1, %xmm0         # Plaintext XOR Encrypt(K, Yn)
        movdqu     ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-r13 bytes of xmm0
        pand       %xmm1, %xmm0         # mask out top 16-r13 bytes of xmm0
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10,%xmm0

        pxor       %xmm0, %xmm8
        GHASH_MUL  %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        sub        %r13, %r11
        add        $16, %r11

        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm0

        # shuffle xmm0 back to output as ciphertext

        # Output %r13 bytes
        MOVQ_R64_XMM %xmm0, %rax
        cmp        $8, %r13
        jle        _less_than_8_bytes_left_encrypt
        mov        %rax, (%arg2, %r11, 1)
        add        $8, %r11
        psrldq     $8, %xmm0
        MOVQ_R64_XMM %xmm0, %rax
        sub        $8, %r13
_less_than_8_bytes_left_encrypt:
        mov        %al, (%arg2, %r11, 1)
        add        $1, %r11
        shr        $8, %rax
        sub        $1, %r13
        jne        _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
        mov        arg8, %r12           # %r12 = aadLen (number of bytes)
        shl        $3, %r12
        movd       %r12d, %xmm15        # len(A) in %xmm15
        shl        $3, %arg4            # len(C) in bits (*128)
        MOVQ_R64_XMM %arg4, %xmm1
        pslldq     $8, %xmm15           # %xmm15 = len(A)||0x0000000000000000
        pxor       %xmm1, %xmm15        # %xmm15 = len(A)||len(C)
        pxor       %xmm15, %xmm8
        GHASH_MUL  %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        movdqa     SHUF_MASK(%rip), %xmm10
        PSHUFB_XMM %xmm10, %xmm8        # perform a 16 byte swap

        mov        %arg5, %rax          # %rax = *Y0
        movdqu     (%rax), %xmm0        # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15      # Encrypt(K, Y0)
        pxor       %xmm8, %xmm0
_return_T_encrypt:
        mov        arg9, %r10           # %r10 = authTag
        mov        arg10, %r11          # %r11 = auth_tag_len
        cmp        $16, %r11
        je         _T_16_encrypt
        cmp        $12, %r11
        je         _T_12_encrypt
_T_8_encrypt:
        MOVQ_R64_XMM %xmm0, %rax
        mov        %rax, (%r10)
        jmp        _return_T_done_encrypt
_T_12_encrypt:
        MOVQ_R64_XMM %xmm0, %rax
        mov        %rax, (%r10)
        psrldq     $8, %xmm0
        movd       %xmm0, %eax
        mov        %eax, 8(%r10)
        jmp        _return_T_done_encrypt
_T_16_encrypt:
        movdqu     %xmm0, (%r10)
_return_T_done_encrypt:
        mov        %r14, %rsp
        pop        %r14
        pop        %r13
        pop        %r12
        ret

#endif


_key_expansion_128:
_key_expansion_256a:
        pshufd     $0b11111111, %xmm1, %xmm1
        shufps     $0b00010000, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        shufps     $0b10001100, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        pxor       %xmm1, %xmm0
        movaps     %xmm0, (TKEYP)
        add        $0x10, TKEYP
        ret

.align 4
_key_expansion_192a:
        pshufd     $0b01010101, %xmm1, %xmm1
        shufps     $0b00010000, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        shufps     $0b10001100, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        pxor       %xmm1, %xmm0

        movaps     %xmm2, %xmm5
        movaps     %xmm2, %xmm6
        pslldq     $4, %xmm5
        pshufd     $0b11111111, %xmm0, %xmm3
        pxor       %xmm3, %xmm2
        pxor       %xmm5, %xmm2

        movaps     %xmm0, %xmm1
        shufps     $0b01000100, %xmm0, %xmm6
        movaps     %xmm6, (TKEYP)
        shufps     $0b01001110, %xmm2, %xmm1
        movaps     %xmm1, 0x10(TKEYP)
        add        $0x20, TKEYP
        ret

.align 4
_key_expansion_192b:
        pshufd     $0b01010101, %xmm1, %xmm1
        shufps     $0b00010000, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        shufps     $0b10001100, %xmm0, %xmm4
        pxor       %xmm4, %xmm0
        pxor       %xmm1, %xmm0

        movaps     %xmm2, %xmm5
        pslldq     $4, %xmm5
        pshufd     $0b11111111, %xmm0, %xmm3
        pxor       %xmm3, %xmm2
        pxor       %xmm5, %xmm2

        movaps     %xmm0, (TKEYP)
        add        $0x10, TKEYP
        ret

.align 4
_key_expansion_256b:
        pshufd     $0b10101010, %xmm1, %xmm1
        shufps     $0b00010000, %xmm2, %xmm4
        pxor       %xmm4, %xmm2
        shufps     $0b10001100, %xmm2, %xmm4
        pxor       %xmm4, %xmm2
        pxor       %xmm1, %xmm2
        movaps     %xmm2, (TKEYP)
        add        $0x10, TKEYP
        ret

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
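/*
 * A sketch of the crypto_aes_ctx layout this code relies on (offsets
 * inferred from the constants used below; the authoritative definition
 * lives in the kernel's AES headers):
 *
 *      struct crypto_aes_ctx {
 *              u32 key_enc[60];        // expanded encryption keys, offset 0
 *              u32 key_dec[60];        // expanded decryption keys, offset 240
 *              u32 key_length;         // in bytes (16/24/32), offset 480
 *      };
 *
 * aesni_set_key() expands the user key into key_enc, then derives
 * key_dec from it with AESIMC in .Ldec_key below.
 */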
ENTRY(aesni_set_key)
#ifndef __x86_64__
        pushl      KEYP
        movl       8(%esp), KEYP        # ctx
        movl       12(%esp), UKEYP      # in_key
        movl       16(%esp), %edx       # key_len
#endif
        movups     (UKEYP), %xmm0       # user key (first 16 bytes)
        movaps     %xmm0, (KEYP)
        lea        0x10(KEYP), TKEYP    # key addr
        movl       %edx, 480(KEYP)
        pxor       %xmm4, %xmm4         # xmm4 is assumed 0 in _key_expansion_x
        cmp        $24, %dl
        jb         .Lenc_key128
        je         .Lenc_key192
        movups     0x10(UKEYP), %xmm2   # other user key
        movaps     %xmm2, (TKEYP)
        add        $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call       _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call       _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call       _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call       _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call       _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call       _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call       _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call       _key_expansion_256a
        jmp        .Ldec_key
.Lenc_key192:
        movq       0x10(UKEYP), %xmm2   # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call       _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call       _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call       _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call       _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call       _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call       _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call       _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call       _key_expansion_192b
        jmp        .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call       _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call       _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call       _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call       _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call       _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call       _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call       _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call       _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call       _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call       _key_expansion_128
.Ldec_key:
        sub        $0x10, TKEYP
        movaps     (KEYP), %xmm0
        movaps     (TKEYP), %xmm1
        movaps     %xmm0, 240(TKEYP)
        movaps     %xmm1, 240(KEYP)
        add        $0x10, KEYP
        lea        240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps     (KEYP), %xmm0
        AESIMC     %xmm0 %xmm1
        movaps     %xmm1, (UKEYP)
        add        $0x10, KEYP
        sub        $0x10, UKEYP
        cmp        TKEYP, KEYP
        jb         .Ldec_key_loop
        xor        AREG, AREG
#ifndef __x86_64__
        popl       KEYP
#endif
        ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
        pushl      KEYP
        pushl      KLEN
        movl       12(%esp), KEYP
        movl       16(%esp), OUTP
        movl       20(%esp), INP
#endif
        movl       480(KEYP), KLEN      # key length
        movups     (INP), STATE         # input
        call       _aesni_enc1
        movups     STATE, (OUTP)        # output
#ifndef __x86_64__
        popl       KLEN
        popl       KEYP
#endif
        ret

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      KLEN:   key length
 *      STATE:  initial state (input)
 * output:
 *      STATE:  final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps     (KEYP), KEY          # key
        mov        KEYP, TKEYP
        pxor       KEY, STATE           # round 0
        add        $0x30, TKEYP
        cmp        $24, KLEN
        jb         .Lenc128
        lea        0x20(TKEYP), TKEYP
        je         .Lenc192
        add        $0x20, TKEYP
        movaps     -0x60(TKEYP), KEY
        AESENC     KEY STATE
        movaps     -0x50(TKEYP), KEY
        AESENC     KEY STATE
.align 4
.Lenc192:
        movaps     -0x40(TKEYP), KEY
        AESENC     KEY STATE
        movaps     -0x30(TKEYP), KEY
        AESENC     KEY STATE
.align 4
.Lenc128:
        movaps     -0x20(TKEYP), KEY
        AESENC     KEY STATE
        movaps     -0x10(TKEYP), KEY
        AESENC     KEY STATE
        movaps     (TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x10(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x20(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x30(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x40(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x50(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x60(TKEYP), KEY
        AESENC     KEY STATE
        movaps     0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret

/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:   key struct pointer
 *      KLEN:   key length
 *      STATE1: initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1: final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
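/*
 * The 4-block variants below keep four independent AES states in flight
 * so the pipelined AESENC/AESDEC units are not stalled waiting on the
 * previous round of a single block; the control flow otherwise mirrors
 * _aesni_enc1/_aesni_dec1.
 */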
/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (in bytes)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_enc4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	mov 480(KEYP), KLEN		# key length
	add $240, KEYP
	movups (INP), STATE		# input
	call _aesni_dec1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret
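
/*
 * Round-trip sketch (illustrative only; ct/pt are placeholder buffers,
 * ctx was prepared by aesni_set_key(), and FPU context is held):
 *
 *	u8 ct[16], pt[16];
 *	aesni_enc(&ctx, ct, src);	// encrypt one 16-byte block
 *	aesni_dec(&ctx, pt, ct);	// pt now equals the original src
 *
 * Note that aesni_dec adds 240 to KEYP first: the inverse key schedule
 * built by .Ldec_key_loop lives at byte offset 240 of crypto_aes_ctx.
 */
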
/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (in bytes)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec1:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (in bytes)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
.align 4
_aesni_dec4:
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
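
/*
 * _aesni_enc4/_aesni_dec4 above interleave four independent AES states
 * per round-key load: AESENC/AESDEC are pipelined, so keeping four
 * blocks in flight hides most of the instruction latency.  The bulk
 * loops below exploit this by handling 64 bytes per iteration while
 * enough data remains, then falling back to one block at a time.
 */
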
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
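
/*
 * CBC encryption is inherently serial: each plaintext block is XORed
 * with the previous ciphertext block before it is encrypted, so
 * aesni_cbc_enc above cannot use the 4-block helper.  CBC decryption
 * depends only on the previous *ciphertext* block, which is already in
 * hand, so aesni_cbc_dec below decrypts four blocks in parallel and
 * applies the XOR chaining afterwards.
 */
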
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor (INP), STATE2
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

#ifdef __x86_64__
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK:	endian swapping mask
 */
.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret

/*
 * _aesni_inc: internal ABI
 * Increase IV by 1; the IV is kept in big endian
 * input:
 *	IV
 *	CTR:		== IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1, in little endian
 *	BSWAP_MASK:	endian swapping mask
 * output:
 *	IV:		incremented by 1
 * changed:
 *	CTR:		== output IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 */
.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif
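
/*
 * CTR usage sketch (illustrative only; buffer names are placeholders,
 * only whole 16-byte blocks are processed here, and the caller is
 * assumed to hold FPU context):
 *
 *	u8 ctrblk[16];		// big-endian counter block
 *	aesni_ctr_enc(&ctx, dst, src, len, ctrblk);
 *
 * On return the counter block has been advanced once per processed
 * block, so a streaming caller can pass it straight back in for the
 * next chunk; internally _aesni_inc keeps the counter little endian in
 * CTR and byte-swaps it into IV before each block encryption.
 */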