Path: blob/master/lib/crypto/powerpc/poly1305-p10le_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated poly1305 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <[email protected]>
#
# Poly1305 - this version mainly using vector/VSX/Scalar
#  - 26 bits limbs
#  - Handles multiple 64 byte blocks.
#
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (r + a) % p
# a += s
#
# Improve performance by breaking down the polynomial into the sum of products with
#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
#
#  07/22/21 - this revision is based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
#             to 9 vectors for multiplications.
#
# setup r^4, r^3, r^2, r vectors
#    vs    [r^1, r^3, r^2, r^4]
#    vs0 = [r0,.....]
#    vs1 = [r1,.....]
#    vs2 = [r2,.....]
#    vs3 = [r3,.....]
#    vs4 = [r4,.....]
#    vs5 = [r1*5,...]
#    vs6 = [r2*5,...]
#    vs7 = [r3*5,...]
#    vs8 = [r4*5,...]
#
# Each word in a vector consists of a member of a "r/s" in [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"

.text
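
# SAVE_REGS/RESTORE_REGS below keep the non-volatile state this code touches:
# the 752-byte stack frame holds GPRs r14-r31 at offsets 112-248, and a vector
# save area starting at offset 256 (addressed through r9) holds v20-v31
# followed by vs14-vs31.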

.macro SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm

.macro SAVE_REGS
	mflr	0
	std	0, 16(1)
	stdu	1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS

#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
#    [r^2, r^3, r^1, r^4]
#    [m3,  m2,  m4,  m1]
#
# multiply odd and even words
.macro mul_odd
	vmulouw	14, 4, 26
	vmulouw	10, 5, 3
	vmulouw	11, 6, 2
	vmulouw	12, 7, 1
	vmulouw	13, 8, 0
	vmulouw	15, 4, 27
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vmulouw	10, 5, 26
	vmulouw	11, 6, 3
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vmulouw	12, 7, 2
	vmulouw	13, 8, 1
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1
	vmulouw	16, 4, 28
	vmulouw	10, 5, 27
	vmulouw	11, 6, 26
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vmulouw	12, 7, 3
	vmulouw	13, 8, 2
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2
	vmulouw	17, 4, 29
	vmulouw	10, 5, 28
	vmulouw	11, 6, 27
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vmulouw	12, 7, 26
	vmulouw	13, 8, 3
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3
	vmulouw	18, 4, 30
	vmulouw	10, 5, 29
	vmulouw	11, 6, 28
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vmulouw	12, 7, 27
	vmulouw	13, 8, 26
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm

.macro mul_even
	vmuleuw	9, 4, 26
	vmuleuw	10, 5, 3
	vmuleuw	11, 6, 2
	vmuleuw	12, 7, 1
	vmuleuw	13, 8, 0
	vaddudm	14, 14, 9
	vaddudm	14, 14, 10
	vaddudm	14, 14, 11
	vaddudm	14, 14, 12
	vaddudm	14, 14, 13	# x0

	vmuleuw	9, 4, 27
	vmuleuw	10, 5, 26
	vmuleuw	11, 6, 3
	vmuleuw	12, 7, 2
	vmuleuw	13, 8, 1
	vaddudm	15, 15, 9
	vaddudm	15, 15, 10
	vaddudm	15, 15, 11
	vaddudm	15, 15, 12
	vaddudm	15, 15, 13	# x1

	vmuleuw	9, 4, 28
	vmuleuw	10, 5, 27
	vmuleuw	11, 6, 26
	vmuleuw	12, 7, 3
	vmuleuw	13, 8, 2
	vaddudm	16, 16, 9
	vaddudm	16, 16, 10
	vaddudm	16, 16, 11
	vaddudm	16, 16, 12
	vaddudm	16, 16, 13	# x2

	vmuleuw	9, 4, 29
	vmuleuw	10, 5, 28
	vmuleuw	11, 6, 27
	vmuleuw	12, 7, 26
	vmuleuw	13, 8, 3
	vaddudm	17, 17, 9
	vaddudm	17, 17, 10
	vaddudm	17, 17, 11
	vaddudm	17, 17, 12
	vaddudm	17, 17, 13	# x3

	vmuleuw	9, 4, 30
	vmuleuw	10, 5, 29
	vmuleuw	11, 6, 28
	vmuleuw	12, 7, 27
	vmuleuw	13, 8, 26
	vaddudm	18, 18, 9
	vaddudm	18, 18, 10
	vaddudm	18, 18, 11
	vaddudm	18, 18, 12
	vaddudm	18, 18, 13	# x4
.endm
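
# mul_odd/mul_even accumulate the five p[] terms listed above for the
# interleaved message words: vmulouw multiplies the odd 32-bit word of each
# (a, r/s) pair and vmuleuw the even word, and the 64-bit partial products are
# summed into v14-v18 (x0-x4), one vector per limb position.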

#
# poly1305_setup_r
#
# setup r^4, r^3, r^2, r vectors
#    [r, r^3, r^2, r^4]
#    vs0 = [r0,...]
#    vs1 = [r1,...]
#    vs2 = [r2,...]
#    vs3 = [r3,...]
#    vs4 = [r4,...]
#    vs5 = [r4*5,...]
#    vs6 = [r3*5,...]
#    vs7 = [r2*5,...]
#    vs8 = [r1*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
.macro poly1305_setup_r

	# save r
	xxlor	26, 58, 58
	xxlor	27, 59, 59
	xxlor	28, 60, 60
	xxlor	29, 61, 61
	xxlor	30, 62, 62

	xxlxor	31, 31, 31

#    [r, r^3, r^2, r^4]
	# compute r^2
	vmr	4, 26
	vmr	5, 27
	vmr	6, 28
	vmr	7, 29
	vmr	8, 30
	bl	do_mul		# r^2 r^1
	xxpermdi 58, 58, 36, 0x3		# r0
	xxpermdi 59, 59, 37, 0x3		# r1
	xxpermdi 60, 60, 38, 0x3		# r2
	xxpermdi 61, 61, 39, 0x3		# r3
	xxpermdi 62, 62, 40, 0x3		# r4
	xxpermdi 36, 36, 36, 0x3
	xxpermdi 37, 37, 37, 0x3
	xxpermdi 38, 38, 38, 0x3
	xxpermdi 39, 39, 39, 0x3
	xxpermdi 40, 40, 40, 0x3
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	bl	do_mul		# r^4 r^3
	vmrgow	26, 26, 4
	vmrgow	27, 27, 5
	vmrgow	28, 28, 6
	vmrgow	29, 29, 7
	vmrgow	30, 30, 8
	vspltisb 13, 2
	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30

	# r^2 r^4
	xxlor	0, 58, 58
	xxlor	1, 59, 59
	xxlor	2, 60, 60
	xxlor	3, 61, 61
	xxlor	4, 62, 62
	xxlor	5, 32, 32
	xxlor	6, 33, 33
	xxlor	7, 34, 34
	xxlor	8, 35, 35

	vspltw	9, 26, 3
	vspltw	10, 26, 2
	vmrgow	26, 10, 9
	vspltw	9, 27, 3
	vspltw	10, 27, 2
	vmrgow	27, 10, 9
	vspltw	9, 28, 3
	vspltw	10, 28, 2
	vmrgow	28, 10, 9
	vspltw	9, 29, 3
	vspltw	10, 29, 2
	vmrgow	29, 10, 9
	vspltw	9, 30, 3
	vspltw	10, 30, 2
	vmrgow	30, 10, 9

	vsld	9, 27, 13
	vsld	10, 28, 13
	vsld	11, 29, 13
	vsld	12, 30, 13
	vaddudm	0, 9, 27
	vaddudm	1, 10, 28
	vaddudm	2, 11, 29
	vaddudm	3, 12, 30
.endm

SYM_FUNC_START_LOCAL(do_mul)
	mul_odd

	# do reduction ( h %= p )
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11
	blr
SYM_FUNC_END(do_mul)

#
# init key
#
.macro do_poly1305_init
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l

	ld	11, 0(10)
	ld	12, 8(10)

	li	14, 16
	li	15, 32
	addis	10, 2, cnum@toc@ha
	addi	10, 10, cnum@toc@l
	lvx	25, 0, 10	# v25 - mask
	lvx	31, 14, 10	# v31 = 1a
	lvx	19, 15, 10	# v19 = 1 << 24
	lxv	24, 48(10)	# vs24
	lxv	25, 64(10)	# vs25

	# initialize
	# load key from r3 to vectors
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11
	and.	10, 10, 12

	# break 26 bits
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	58, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	59, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	60, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	61, 0, 17
	mtvsrdd	62, 0, 18

	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
	li	9, 5
	mtvsrdd	36, 0, 9
	vmulouw	0, 27, 4	# v0 = rr0
	vmulouw	1, 28, 4	# v1 = rr1
	vmulouw	2, 29, 4	# v2 = rr2
	vmulouw	3, 30, 4	# v3 = rr3
.endm
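
# do_poly1305_init clamps the two key doublewords with rmask and splits the
# 128-bit r into five 26-bit limbs r0-r4 (extrdi/insrdi), one limb per vector
# in vs58-vs62; limbs r1-r4 are also premultiplied by 5 (v0-v3), which folds
# the 2^130 == 5 (mod p) reduction into the multiply.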

#
# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
#  k = 32 bytes key
#  r3 = k (r, s)
#  r4 = mlen
#  r5 = m
#
SYM_FUNC_START(poly1305_p10le_4blocks)
.align 5
	cmpdi	5, 64
	blt	Out_no_poly1305

	SAVE_REGS

	do_poly1305_init

	li	21, 0	# counter to message

	poly1305_setup_r

	# load previous H state
	# break/convert it to 26 bits
	ld	9, 0(3)
	ld	10, 8(3)
	ld	19, 16(3)
	sldi	19, 19, 24
	mtvsrdd	41, 0, 19
	extrdi	14, 9, 26, 38
	extrdi	15, 9, 26, 12
	extrdi	16, 9, 12, 0
	mtvsrdd	36, 0, 14
	insrdi	16, 10, 14, 38
	mtvsrdd	37, 0, 15
	extrdi	17, 10, 26, 24
	mtvsrdd	38, 0, 16
	extrdi	18, 10, 24, 0
	mtvsrdd	39, 0, 17
	mtvsrdd	40, 0, 18
	vor	8, 8, 9

	# input m1 m2
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	20, 4, 9
	vaddudm	21, 5, 10
	vaddudm	22, 6, 11
	vaddudm	23, 7, 12
	vaddudm	24, 8, 13

	# m3 m4
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	vand	9, 14, 25	# a0
	vsrd	10, 14, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left
	vand	10, 10, 25	# a1
	vspltisb 13, 12
	vand	16, 15, 25
	vsld	12, 16, 13
	vspltisb 13, 14
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vsrd	12, 15, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 20
	vmrgow	5, 10, 21
	vmrgow	6, 11, 22
	vmrgow	7, 12, 23
	vmrgow	8, 13, 24
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	li	9, 64
	divdu	31, 5, 9

	cmpdi	31, 0
	ble	Skip_block_loop

	mtctr	31

# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
# Rewrite the polynomial sum of products as follows,
# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  -->  (h0 + m1) r^4 + m3 r^2,  (h0 + m2) r^4 + m4 r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  -->  m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
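# Each pass through loop_4blocks consumes 64 bytes (4 blocks): multiply the
# interleaved accumulators by the precomputed powers of r (mul_odd/mul_even),
# carry-reduce the 26-bit limbs, then load and add the next four message
# blocks with the pad bit (v19, 1 << 24 in the top limb) applied.
#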
loop_4blocks:

	# Multiply odd words and even words
	mul_odd
	mul_even
	# carry reduction
	vspltisb 9, 2
	vsrd	10, 14, 31
	vsrd	11, 17, 31
	vand	7, 17, 25
	vand	4, 14, 25
	vaddudm	18, 18, 11
	vsrd	12, 18, 31
	vaddudm	15, 15, 10

	vsrd	11, 15, 31
	vand	8, 18, 25
	vand	5, 15, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 16, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vaddudm	8, 8, 11

	# input m1 m2 m3 m4
	add	20, 4, 21
	xxlor	49, 24, 24
	xxlor	50, 25, 25
	lxvw4x	43, 0, 20
	addi	17, 20, 16
	lxvw4x	44, 0, 17
	vperm	14, 11, 12, 17
	vperm	15, 11, 12, 18
	addi	17, 17, 16
	lxvw4x	43, 0, 17
	addi	17, 17, 16
	lxvw4x	44, 0, 17
	vperm	17, 11, 12, 17
	vperm	18, 11, 12, 18

	vand	20, 14, 25	# a0
	vand	9, 17, 25	# a0
	vsrd	21, 14, 31	# >> 26
	vsrd	22, 21, 31	# 12 bits left
	vsrd	10, 17, 31	# >> 26
	vsrd	11, 10, 31	# 12 bits left

	vand	21, 21, 25	# a1
	vand	10, 10, 25	# a1

	vspltisb 13, 12
	vand	16, 15, 25
	vsld	23, 16, 13
	vor	22, 22, 23
	vand	22, 22, 25	# a2
	vand	16, 18, 25
	vsld	12, 16, 13
	vor	11, 11, 12
	vand	11, 11, 25	# a2
	vspltisb 13, 14
	vsrd	23, 15, 13	# >> 14
	vsrd	24, 23, 31	# >> 26, a4
	vand	23, 23, 25	# a3
	vsrd	12, 18, 13	# >> 14
	vsrd	13, 12, 31	# >> 26, a4
	vand	12, 12, 25	# a3

	vaddudm	4, 4, 20
	vaddudm	5, 5, 21
	vaddudm	6, 6, 22
	vaddudm	7, 7, 23
	vaddudm	8, 8, 24

	# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
	vmrgow	4, 9, 4
	vmrgow	5, 10, 5
	vmrgow	6, 11, 6
	vmrgow	7, 12, 7
	vmrgow	8, 13, 8
	vaddudm	8, 8, 19

	addi	5, 5, -64	# len -= 64
	addi	21, 21, 64	# offset += 64

	bdnz	loop_4blocks

Skip_block_loop:
	xxlor	58, 0, 0
	xxlor	59, 1, 1
	xxlor	60, 2, 2
	xxlor	61, 3, 3
	xxlor	62, 4, 4
	xxlor	32, 5, 5
	xxlor	33, 6, 6
	xxlor	34, 7, 7
	xxlor	35, 8, 8

	# Multiply odd words and even words
	mul_odd
	mul_even

	# Sum the products.
	xxpermdi 41, 31, 46, 0
	xxpermdi 42, 31, 47, 0
	vaddudm	4, 14, 9
	xxpermdi 36, 31, 36, 3
	vaddudm	5, 15, 10
	xxpermdi 37, 31, 37, 3
	xxpermdi 43, 31, 48, 0
	vaddudm	6, 16, 11
	xxpermdi 38, 31, 38, 3
	xxpermdi 44, 31, 49, 0
	vaddudm	7, 17, 12
	xxpermdi 39, 31, 39, 3
	xxpermdi 45, 31, 50, 0
	vaddudm	8, 18, 13
	xxpermdi 40, 31, 40, 3

	# carry reduction
	vspltisb 9, 2
	vsrd	10, 4, 31
	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	8, 8, 11
	vsrd	12, 8, 31
	vaddudm	5, 5, 10

	vsrd	11, 5, 31
	vand	8, 8, 25
	vand	5, 5, 25
	vaddudm	4, 4, 12
	vsld	10, 12, 9
	vaddudm	6, 6, 11

	vsrd	13, 6, 31
	vand	6, 6, 25
	vaddudm	4, 4, 10
	vsrd	10, 4, 31
	vaddudm	7, 7, 13

	vsrd	11, 7, 31
	vand	7, 7, 25
	vand	4, 4, 25
	vaddudm	5, 5, 10
	vsrd	10, 5, 31
	vand	5, 5, 25
	vaddudm	6, 6, 10
	vaddudm	8, 8, 11

	b	do_final_update
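
# do_final_update recombines the five 26-bit limbs in v4-v8 back into the
# 130-bit accumulator and stores it to the state at r3: the low 128 bits go to
# 0(3)/8(3) and the remaining high bits of h2 go to 16(3).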
do_final_update:
	# combine 26 bit limbs
	# v4, v5, v6, v7 and v8 are 26 bit vectors
	vsld	5, 5, 31
	vor	20, 4, 5
	vspltisb 11, 12
	vsrd	12, 6, 11
	vsld	6, 6, 31
	vsld	6, 6, 31
	vor	20, 20, 6
	vspltisb 11, 14
	vsld	7, 7, 11
	vor	21, 7, 12
	mfvsrld	16, 40		# save last 2 bytes
	vsld	8, 8, 11
	vsld	8, 8, 31
	vor	21, 21, 8
	mfvsrld	17, 52
	mfvsrld	19, 53
	srdi	16, 16, 24

	std	17, 0(3)
	std	19, 8(3)
	stw	16, 16(3)

Out_loop:
	li	3, 0

	RESTORE_REGS

	blr

Out_no_poly1305:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_p10le_4blocks)

#
# =======================================================================
# The following functions implement Poly1305 with 64 x 64 bit multiplication.
#
SYM_FUNC_START_LOCAL(Poly1305_init_64)
	# mask 0x0FFFFFFC0FFFFFFC
	# mask 0x0FFFFFFC0FFFFFFF
	addis	10, 2, rmask@toc@ha
	addi	10, 10, rmask@toc@l
	ld	11, 0(10)
	ld	12, 8(10)

	# initialize
	# load key from r3
	ld	9, 24(3)
	ld	10, 32(3)
	and.	9, 9, 11	# clamp mask r0
	and.	10, 10, 12	# clamp mask r1

	srdi	21, 10, 2
	add	19, 21, 10	# s1: r19 = (r1 >> 2) * 5

	# setup r and s
	li	25, 0
	mtvsrdd	32+0, 9, 19	# r0, s1
	mtvsrdd	32+1, 10, 9	# r1, r0
	mtvsrdd	32+2, 19, 25	# s1
	mtvsrdd	32+3, 9, 25	# r0

	blr
SYM_FUNC_END(Poly1305_init_64)

#
# Poly1305_mult
# v6 = (h0, h1), v8 = h2
# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
#
# Output: v7, v10, v11
#
SYM_FUNC_START_LOCAL(Poly1305_mult)
	#
	# d0 = h0 * r0 + h1 * s1
	vmsumudm	7, 6, 0, 9	# h0 * r0, h1 * s1

	# d1 = h0 * r1 + h1 * r0 + h2 * s1
	vmsumudm	11, 6, 1, 9	# h0 * r1, h1 * r0
	vmsumudm	10, 8, 2, 11	# d1 += h2 * s1

	# d2 = h2 * r0
	vmsumudm	11, 8, 3, 9	# d2 = h2 * r0
	blr
SYM_FUNC_END(Poly1305_mult)

#
# carry reduction
# h %= p
#
# Input: v7, v10, v11
# Output: r27, r28, r29
#
SYM_FUNC_START_LOCAL(Carry_reduction)
	mfvsrld	27, 32+7
	mfvsrld	28, 32+10
	mfvsrld	29, 32+11
	mfvsrd	20, 32+7	# h0.h
	mfvsrd	21, 32+10	# h1.h

	addc	28, 28, 20
	adde	29, 29, 21
	srdi	22, 29, 0x2
	sldi	23, 22, 0x2
	add	23, 23, 22	# (h2 & 3) * 5
	addc	27, 27, 23	# h0
	addze	28, 28		# h1
	andi.	29, 29, 0x3	# h2
	blr
SYM_FUNC_END(Carry_reduction)
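
# Poly1305_mult relies on vmsumudm, which multiplies the two unsigned
# doublewords of each source pair and adds the 128-bit products, so each of
# d0/d1/d2 above takes one or two instructions; Carry_reduction then folds the
# bits above 2^130 back into h0 as (carry * 5), since 2^130 == 5 mod p.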

#
# poly1305 multiplication
# h *= r, h %= p
#	d0 = h0 * r0 + h1 * s1
#	d1 = h0 * r1 + h1 * r0 + h2 * s1
#	d2 = h2 * r0
#
#
# unsigned int poly1305_64s( unsigned char *state, const byte *src, size_t len, highbit)
#   - no highbit if final leftover block (highbit = 0)
#
SYM_FUNC_START(poly1305_64s)
	cmpdi	5, 0
	ble	Out_no_poly1305_64

	mflr	0
	std	0, 16(1)
	stdu	1,-400(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	# Init poly1305
	bl	Poly1305_init_64

	li	25, 0		# offset to inp and outp

	add	11, 25, 4

	# load h
	# h0, h1, h2
	ld	27, 0(3)
	ld	28, 8(3)
	lwz	29, 16(3)

	li	30, 16
	divdu	31, 5, 30

	mtctr	31

	mr	24, 6		# highbit

Loop_block_64:
	vxor	9, 9, 9

	ld	20, 0(11)
	ld	21, 8(11)
	addi	11, 11, 16

	addc	27, 27, 20
	adde	28, 28, 21
	adde	29, 29, 24

	li	22, 0
	mtvsrdd	32+6, 27, 28	# h0, h1
	mtvsrdd	32+8, 29, 22	# h2

	bl	Poly1305_mult

	bl	Carry_reduction

	bdnz	Loop_block_64

	std	27, 0(3)
	std	28, 8(3)
	stw	29, 16(3)

	li	3, 0

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi	1, 1, 400
	ld	0, 16(1)
	mtlr	0

	blr

Out_no_poly1305_64:
	li	3, 0
	blr
SYM_FUNC_END(poly1305_64s)

#
# Input: r3 = h, r4 = s, r5 = mac
# mac = h + s
#
SYM_FUNC_START(poly1305_emit_64)
	ld	10, 0(3)
	ld	11, 8(3)
	ld	12, 16(3)

	# compare modulus
	# h + 5 + (-p)
	mr	6, 10
	mr	7, 11
	mr	8, 12
	addic.	6, 6, 5
	addze	7, 7
	addze	8, 8
	srdi	9, 8, 2		# overflow?
	cmpdi	9, 0
	beq	Skip_h64
	mr	10, 6
	mr	11, 7
	mr	12, 8

Skip_h64:
	ld	6, 0(4)
	ld	7, 8(4)
	addc	10, 10, 6
	adde	11, 11, 7
	addze	12, 12

	std	10, 0(5)
	std	11, 8(5)
	blr
SYM_FUNC_END(poly1305_emit_64)

SYM_DATA_START_LOCAL(RMASK)
.align 5
rmask:
.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long	0x1a, 0x00, 0x1a, 0x00
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
SYM_DATA_END(RMASK)
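
# The constant block above supplies: rmask, the 128-bit clamp mask for r;
# and cnum, the vector constants loaded by do_poly1305_init - the 26-bit limb
# mask (v25), the shift count 26 (v31), the 1 << 24 pad bit added to the top
# limb (v19), and the two vperm patterns (vs24/vs25) used to gather the
# message words into the limb layout.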