Path: blob/main/sys/crypto/openssl/aarch64/ghashv8-armx.S
39536 views
/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

/*
 * GHASH (GF(2^128) multiply-accumulate) for AArch64 using PMULL/PMULL2.
 *
 * Syntax: GNU as, AArch64, preprocessed by cpp (.S).
 * Entry points: gcm_init_v8, gcm_gmult_v8, gcm_ghash_v8 (global) and
 * gcm_ghash_v8_4x (file-local, reached by a branch from gcm_ghash_v8).
 *
 * Register roles below are read off the code in this file.
 * NOTE(review): the C prototypes live outside this file - confirm the
 * argument meanings against the callers.
 *
 * Only v0-v7 and v16-v31 are written by these routines; v8-v15 are never
 * touched and no stack is used.
 *
 * Common constant: v19 = 0xe1 in every byte, shifted left by 57 within each
 * 64-bit lane (the "0xc2.0" constant of the original comments); it is the
 * multiplier used in both reduction phases.
 *
 * On builds where __AARCH64EB__ is not defined, data blocks are
 * byte-reversed within each 64-bit lane (rev64) after loading and before
 * storing Xi.
 */

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text

/*
 * gcm_init_v8 - expand the hash key into a table of powers.
 *
 * In:   x0 = table to fill; 12 entries of 16 bytes are stored
 *       x1 = 16-byte hash key H
 * Out:  table[0]  = twisted H
 *       table[1]  = packed Karatsuba pre-processed halves (H, H^2)
 *       table[2]  = H^2
 *       table[3]  = H^3
 *       table[4]  = packed Karatsuba pre-processed halves (H^3, H^4)
 *       table[5]  = H^4
 *       table[6]  = H^5
 *       table[7]  = packed Karatsuba pre-processed halves (H^5, H^6)
 *       table[8]  = H^6
 *       table[9]  = H^7
 *       table[10] = packed Karatsuba pre-processed halves (H^7, H^8)
 *       table[11] = H^8
 * Uses: v0-v7, v16-v31; x0 is advanced by the post-indexed stores.
 */
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET	// macro supplied by arm_arch.h
	ld1	{v17.2d},[x1]	//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	// two products are computed in parallel: H*H^2 in v0-v2, H^2*H^2 in v5-v7
	pmull	v0.1q,v20.1d,v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d,v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v23.16b,v0.16b,v18.16b	//H^3
	eor	v25.16b,v5.16b,v4.16b	//H^4

	ext	v16.16b,v23.16b,v23.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v25.16b,v25.16b,#8
	ext	v18.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v23.16b
	eor	v17.16b,v17.16b,v25.16b
	eor	v18.16b,v18.16b,v22.16b
	ext	v24.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v23.2d,v24.2d,v25.2d},[x0],#48	//store Htable[3..5]

	//calculate H^5 and H^6
	// H^2*H^3 in v0-v2, H^3*H^3 in v5-v7
	pmull	v0.1q,v22.1d,v23.1d
	pmull	v5.1q,v23.1d,v23.1d
	pmull2	v2.1q,v22.2d,v23.2d
	pmull2	v7.1q,v23.2d,v23.2d
	pmull	v1.1q,v16.1d,v18.1d
	pmull	v6.1q,v16.1d,v16.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v26.16b,v0.16b,v18.16b	//H^5
	eor	v28.16b,v5.16b,v4.16b	//H^6

	ext	v16.16b,v26.16b,v26.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v28.16b,v28.16b,#8
	ext	v18.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v26.16b
	eor	v17.16b,v17.16b,v28.16b
	eor	v18.16b,v18.16b,v22.16b
	ext	v27.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v26.2d,v27.2d,v28.2d},[x0],#48	//store Htable[6..8]

	//calculate H^7 and H^8
	// H^2*H^5 in v0-v2, H^2*H^6 in v5-v7
	pmull	v0.1q,v22.1d,v26.1d
	pmull	v5.1q,v22.1d,v28.1d
	pmull2	v2.1q,v22.2d,v26.2d
	pmull2	v7.1q,v22.2d,v28.2d
	pmull	v1.1q,v16.1d,v18.1d
	pmull	v6.1q,v17.1d,v18.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v29.16b,v0.16b,v18.16b	//H^7
	eor	v31.16b,v5.16b,v4.16b	//H^8

	ext	v16.16b,v29.16b,v29.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v31.16b,v31.16b,#8
	eor	v16.16b,v16.16b,v29.16b
	eor	v17.16b,v17.16b,v31.16b
	ext	v30.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v29.2d,v30.2d,v31.2d},[x0]	//store Htable[9..11]
	ret
.size	gcm_init_v8,.-gcm_init_v8

/*
 * gcm_gmult_v8 - multiply the 16-byte accumulator Xi by H, in place.
 *
 * In:   x0 = Xi (16 bytes, read and then overwritten with the product)
 *       x1 = key table; only the first two entries (32 bytes) are read
 * Uses: v0-v3, v17-v21. No general-purpose register is modified.
 */
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET	// macro supplied by arm_arch.h
	ld1	{v17.2d},[x0]	//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]	//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

/*
 * gcm_ghash_v8 - fold a run of 16-byte blocks into the accumulator Xi.
 *
 * In:   x0 = Xi (16 bytes, read at entry and overwritten at exit)
 *       x1 = key table (twisted H, packed term, H^2 are read here)
 *       x2 = input pointer
 *       x3 = input length in bytes
 * Flow: x3 >= 64 branches to .Lgcm_ghash_v8_4x (four blocks per iteration).
 *       Otherwise .Loop_mod2x_v8 consumes 32 bytes per iteration and
 *       .Lodd_tail_v8 handles a single remaining block.
 * Uses: v0-v7, v16-v22, x12, condition flags; x1, x2 and x3 are modified.
 * NOTE(review): nothing here validates x3; the code presumably expects a
 * non-zero multiple of 16 - confirm with the callers.
 */
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET	// macro supplied by arm_arch.h
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]	//load [rotated] Xi
				//"[rotated]" means that
				//loaded value would have
				//to be rotated in order to
				//make it appear as in
				//algorithm specification
	subs	x3,x3,#32	//see if x3 is 32 or larger
	mov	x12,#16	//x12 is used as post-
				//increment for input pointer;
				//as loop is modulo-scheduled
				//x12 is zeroed just in time
				//to preclude overstepping
				//inp[len], which means that
				//last block[s] are actually
				//loaded twice, but last
				//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq	//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8	//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32	//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo	//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq	//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8	//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32	//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8	//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]	//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8

/*
 * gcm_ghash_v8_4x - four-blocks-per-iteration variant of gcm_ghash_v8.
 *
 * Not exported (no .globl). Within this file it is entered only through
 * the b.hs at the top of gcm_ghash_v8, i.e. with x3 >= 64, and takes the
 * same register arguments (x0 = Xi, x1 = key table, x2 = input, x3 = bytes).
 *
 * Six table entries are loaded: v20-v22 (twisted H, packed term, H^2) and
 * v26-v28 (H^3, packed term, H^4).
 *
 * Flow: .Loop4x consumes 64 bytes per iteration. .Ltail4x finishes the
 *       block group already loaded, then dispatches on what is left:
 *       nothing -> .Ldone4x, one / two / three further blocks ->
 *       .Lone / .Ltwo / .Lthree, all of which join at .Ldone4x for the
 *       final reduction and the store of Xi.
 * Uses: v0-v7, v16-v31, condition flags; x1, x2 and x3 are modified.
 */
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]	//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	// v29 / v31 / v30 accumulate the low / high / middle Karatsuba
	// products of the three trailing blocks of each group
	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d	//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128
	b.lo	.Ltail4x

	b	.Loop4x

.align	4
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d	//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d	//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d	//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64	// undo the loop bias; x3 = bytes still unread
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
.Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d	//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d	//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d	//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d	//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d	//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]	//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x

/* ASCII, NUL-terminated: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>" */
.section	.rodata
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif