Path: blob/aarch64-shenandoah-jdk8u272-b10/hotspot/src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp
// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
#include "runtime/stubRoutines.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch,  g,   f, e);
  vxor       (maj, a,   b);
  vshasigmaw (bse, e,   1, 0xf);
  vadduwm    (vt2, ch,  kpw);
  vadduwm    (vt1, h,   bse);
  vsel       (maj, b,   c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a,   1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d,   d,   vt3);
  vadduwm    (h,   vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}
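
// Note: sha256_round above is the vector form of the FIPS 180-4 SHA-256
// round. Mapping the temporaries to the standard equations (roughly):
//
//   T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t]   // vt3 (kpw = K[t] + W[t])
//   T2 = Sigma0(a) + Maj(a, b, c)                    // vt0
//   d  = d + T1
//   h  = T1 + T2
//
// vshasigmaw is the Power8 SHA-256 sigma instruction: its third operand
// selects the lower-case sigma functions (0, message schedule) or the
// upper-case Sigma functions (1, rounds), and the last operand is a
// per-word mask choosing the "0" (bit clear) or "1" (bit set) variant.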

void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned;

  andi_ (tmp, hptr, 0xf);
  lvx   (a, hptr);
  addi  (tmp, hptr, 16);
  lvx   (e, tmp);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  load_perm(vRb, hptr);
  addi (tmp, hptr, 32);
  vec_perm(a, e, vRb);

  lvx  (vt0, tmp);
  vec_perm(e, vt0, vRb);

  // aligned accesses
  bind(sha256_aligned);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];

    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (vt0, tmp);
  vec_perm(ws[total_ws-1], vt0, vRb);
  b    (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  lvx(ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Loading k, which is always aligned to 16-bytes
  lvx (kpws[0], k);
  for (int n = 1; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    addi (tmp, k, 16 * n);
    lvx  (kpw, tmp);
  }

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm (kpw, kpw, w);
  }
}
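
// Note: sha256_calc_4w below extends the message schedule four words per
// call. The FIPS 180-4 recurrence it implements (s0/s1 in the comments are
// the lower-case sigma functions) is, roughly:
//
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
//
// Because w[j] and w[j+1] are themselves inputs to w[j+2] and w[j+3], the
// helper first produces the two well-defined lanes, then re-applies s1 to
// them to complete the remaining two lanes.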

void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister  vt0   = VR0;
  const VectorRegister  vt1   = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2   = VR2;
  const VectorRegister  vt3   = VR3;
  const VectorSRegister vst3  = vt3->to_vsr();
  const VectorRegister  vt4   = VR4;

  // load to k[j]
  lvx (vt0, j, k);

  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w1, w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w3, w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w0, w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w2, w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi (vt3, w3, w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld (vst3, vsrt1, vst3);
#else
  xxmrghd (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to v9 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm (kpw0, vt2, vt0);

  vsldoi (kpw1, kpw0, kpw0, 12);
  vsldoi (kpw2, kpw0, kpw0, 8);
  vsldoi (kpw3, kpw0, kpw0, 4);
#else
  vadduwm (kpw3, vt2, vt0);

  vsldoi (kpw2, kpw3, kpw3, 12);
  vsldoi (kpw1, kpw3, kpw3, 8);
  vsldoi (kpw0, kpw3, kpw3, 4);
#endif
}
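
// Note: per FIPS 180-4, each block's compression ends by adding the working
// variables into the previous intermediate hash (H[i] += a..h).
// sha256_update_sha_state below gathers the live lanes of the working
// registers, performs that addition, and writes the eight updated words back
// to the memory pointed to by hptr.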

void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  lvx   (vt0, hptr);
  lvx   (vt5, of16, hptr);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li (of32, 32);
  load_perm(vRb, hptr);

  vec_perm(vt0, vt5, vRb);        // vt0 = hptr[0]..hptr[3]

  lvx (vt1, hptr, of32);
  vec_perm(vt5, vt1, vRb);        // vt5 = hptr[4]..hptr[7]

  // aligned accesses
  bind(state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);   // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);    // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);    // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);    // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1);  // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3);  // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);   // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);    // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);    // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);    // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d, vt0, vt1);  // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h, vt5, vt3);  // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}

static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
static const uint32_t *sha256_round_consts = sha256_round_table;

// R3_ARG1   - byte[]  Input string with padding but in Big Endian
// R4_ARG2   - int[]   SHA.state (at first, the root of primes)
// R5_ARG3   - int     offset
// R6_ARG4   - int     limit
//
// Internal Register usage:
// R7        - k
// R8        - tmp | j | of16
// R9        - of32
// VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
// VR9-VR16  - a-h
// VR17-VR20 - w0-w3
// VR21-VR23 - vRb | vaux0-vaux2
// VR24-VR27 - kpw0-kpw3
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t buf_size = 64;
  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
    sha256_round_consts = (const uint32_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha256_round_consts, R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
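  // The state was loaded as two packed vectors ({a, b, c, d} and {e, f, g, h}).
  // Roughly: the deque calls below spread byte-rotated copies over the other
  // six working registers so that every register holds its own variable in a
  // common lane; sha256_round then works lane-wise and only that lane stays
  // meaningful until sha256_update_sha_state gathers the live lanes again.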
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load 16 elements from w out of the loop.
  // Order of the int values is Endianness specific.
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop the 16th to the 64th iteration by 8 steps
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = R8;
  li (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}


/**********************************************************************
 * SHA 512
 *********************************************************************/
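
// The SHA-512 code below mirrors the SHA-256 implementation above with the
// FIPS 180-4 parameter changes: 64-bit words, 128-byte blocks, 80 rounds and
// different rotation/shift amounts inside the sigma functions. Each 128-bit
// vector therefore holds only two words, and the Power8 vshasigmad
// instruction replaces vshasigmaw.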

void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];
    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (aux, tmp);
  vec_perm(ws[total_ws-1], aux, vRb);
  b    (after_alignment);

  bind(is_aligned);
  lvx (ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_alignment);
}

// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr = R7;
  Register tmp  = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    load_perm(vRb, state);
    lvx  (ini_a, state);
    addi (addr, state, 16);

    lvx  (ini_c, addr);
    addi (addr, state, 32);
    vec_perm(ini_a, ini_c, vRb);

    lvx  (ini_e, addr);
    addi (addr, state, 48);
    vec_perm(ini_c, ini_e, vRb);

    lvx  (ini_g, addr);
    addi (addr, state, 64);
    vec_perm(ini_e, ini_g, vRb);

    lvx  (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std (tmp, 8*n + 8, state);
#else
      std (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur  = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur  = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      if (n/2 == 0) {
        lvx(ini_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        lvx(ini_cur, addr);
      }
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      if (n/2 == 0) {
        stvx(h_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        stvx(h_cur, addr);
      }
    }
  }

  bind(after_state_save_aligned);
}
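
// Since each 128-bit vector holds a pair of 64-bit values, the SHA-512
// schedule helper (sha512_calc_2w, below) produces two k+w values per call,
// and the main loop in sha512() consumes them as two consecutive rounds
// (kplusw0, then kplusw1). sha512_round itself has the same dataflow as
// sha256_round, using vaddudm/vshasigmad on doublewords.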

// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel      (Ch,   g,    f,   e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,   0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,   Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,   0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx  (VR_a, j, k);
  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w1, w0, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w5, w4, vRb);
#else
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w0, w1, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w4, w5, vRb);
#endif

  // v6 = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // v5 = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //      s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, w4);
  vmr (w4, w5);
  vmr (w5, w6);
  vmr (w6, w7);
  vmr (w7, VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi  (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw1, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi  (kpw0, kpw1, kpw1, 8);
#endif
}
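
// sha512_load_h_vec below reads the eight 64-bit state words as four 16-byte
// vectors, each holding one of the (a,b), (c,d), (e,f), (g,h) pairs. Which
// register of a pair anchors the loaded vector differs between endiannesses;
// that is what the start_idx selection inside these helpers expresses.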

void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];
  int start_idx = 0;
#else
  VectorRegister a = hs[1];
  VectorRegister g = hs[7];
  int start_idx = 1;
#endif

  Register addr = R7;
  VectorRegister vRb = VR8;
  Register tmp = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx(hs[start_idx], state);
  load_perm(vRb, state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
    vec_perm(h_prev2, h_cur, vRb);
  }
  addi(addr, state, (total_hs/2) * 16);
  lvx (aux, addr);
  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
  b   (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  lvx(hs[start_idx], state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
  }

  bind(after_state_aligned);
}

static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
  0x428a2f98d728ae22, 0x7137449123ef65cd,
  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
  0x3956c25bf348b538, 0x59f111f1b605d019,
  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
  0xd807aa98a3030242, 0x12835b0145706fbe,
  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
  0x9bdc06a725c71235, 0xc19bf174cf692694,
  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
  0x983e5152ee66dfab, 0xa831c66d2db43210,
  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
  0x06ca6351e003826f, 0x142929670a0e6e70,
  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
  0x650a73548baf63de, 0x766a0abb3c77b2a8,
  0x81c2c92e47edaee6, 0x92722c851482353b,
  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
  0xc24b8b70d0f89791, 0xc76c51a30654be30,
  0xd192e819d6ef5218, 0xd69906245565a910,
  0xf40e35855771202a, 0x106aa07032bbd1b8,
  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
  0x748f82ee5defb2fc, 0x78a5636f43172f60,
  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
  0x90befffa23631e28, 0xa4506cebde82bde9,
  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
  0xca273eceea26619c, 0xd186b8c721c0c207,
  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
  0x113f9804bef90dae, 0x1b710b35131c471b,
  0x28db77f523047d84, 0x32caab7b40c72493,
  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
static const uint64_t *sha512_round_consts = sha512_round_table;
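
// lvx requires the round-constant tables to be 16-byte aligned. On AIX the
// aligned attribute on the static tables may not be honored, so sha256() and
// sha512() check the pointer at stub-generation time and, if needed, copy
// the table into malloc'd storage (which malloc aligns to 16 bytes) before
// baking its address into the stub.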

// R3_ARG1   - byte[]  Input string with padding but in Big Endian
// R4_ARG2   - int[]   SHA.state (at first, the root of primes)
// R5_ARG3   - int     offset
// R6_ARG4   - int     limit
//
// Internal Register usage:
// R7 R8 R9  - volatile temporaries
// VR0-VR7   - a-h
// VR8       - vRb
// VR9       - aux (highly volatile, use with care)
// VR10-VR17 - w0-w7 | ini_a-ini_h
// VR18      - vsp16 | kplusw0
// VR19      - vsp32 | kplusw1
// VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t buf_size = 128;
  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
    sha512_round_consts = (const uint64_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  Register k = R9;
  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha512_round_consts, R0);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur  = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  // Load 16 elements from w out of the loop.
  // Order of the long values is Endianness specific.
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16,    8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16,    vsp16, shiftarg);
  vsl     (vsp32,    vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8,     8);
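
  // The three rotate passes below amount to a byte reversal within each
  // 64-bit word: vrlh swaps the bytes of every halfword, vrlw then swaps the
  // halfwords of every word, and vrld finally swaps the words of every
  // doubleword, turning the big-endian message words into the lane order
  // used on little-endian machines.
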
  // Convert input from Big Endian to Little Endian
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld (w, w, vsp32);
  }
#endif

  Register Rb = R10;
  VectorRegister vRb = VR8;
  li (Rb, 8);
  load_perm(vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    if (n == 0) {
      lvx  (kplusw0, k);
    } else {
      addi (addr, k, n * 16);
      lvx  (kplusw0, addr);
    }
#if defined(VM_LITTLE_ENDIAN)
    vaddudm(kplusw0, kplusw0, w);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm(kplusw1, kplusw0, w);
    vsldoi (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = tmp;
  li (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}