GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/hotspot/src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp
// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
26
#include "runtime/stubRoutines.hpp"
27
28
/**********************************************************************
29
* SHA 256
30
*********************************************************************/
31
32
void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

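// sha256_round performs one round of the SHA-256 compression function
// (FIPS 180-4), with kpw already holding k[t] + w[t]:
//   Ch(e,f,g)  = (e & f) ^ (~e & g)            -- vsel(ch, g, f, e)
//   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)   -- vxor + vsel on b/c
//   T1 = h + Sigma1(e) + Ch(e,f,g) + kpw
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d += T1;  h = T1 + T2
// Sigma0/Sigma1 come from vshasigmaw with st = 1.  Instead of moving the
// eight working variables around, h_cnt advances so the role played by each
// register in hs[] rotates from one round to the next.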
void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch, g, f, e);
  vxor       (maj, a, b);
  vshasigmaw (bse, e, 1, 0xf);
  vadduwm    (vt2, ch, kpw);
  vadduwm    (vt1, h, bse);
  vsel       (maj, b, c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a, 1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d, d, vt3);
  vadduwm    (h, vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

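// sha256_load_h_vec loads the eight 32-bit state words at hptr into the two
// vector registers passed as a and e (state[0..3] and state[4..7]).  When
// hptr is not 16-byte aligned, an extra 16-byte load plus the permute vector
// derived from the address (load_perm / vec_perm) realigns the data in
// registers.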
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned;

  andi_ (tmp, hptr, 0xf);
  lvx   (a, hptr);
  addi  (tmp, hptr, 16);
  lvx   (e, tmp);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  load_perm(vRb, hptr);
  addi (tmp, hptr, 32);
  vec_perm(a, e, vRb);

  lvx  (vt0, tmp);
  vec_perm(e, vt0, vRb);

  // aligned accesses
  bind(sha256_aligned);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (vt0, tmp);
  vec_perm(ws[total_ws-1], vt0, vRb);
  b    (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  lvx(ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Loading k, which is always aligned to 16-bytes
  lvx    (kpws[0], k);
  for (int n = 1; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    addi (tmp, k, 16 * n);
    lvx  (kpw, tmp);
  }

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w = ws[n];

    vadduwm (kpw, kpw, w);
  }
}

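// sha256_calc_4w extends the message schedule four words at a time:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// (lowercase sigma functions, vshasigmaw with st = 0).  The last sixteen
// schedule words live in w0..w3; on return kpw0..kpw3 hold rotations of the
// vector k[j..j+3] + w[j..j+3], one per upcoming call to sha256_round.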
void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister vt0 = VR0;
  const VectorRegister vt1 = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister vt2 = VR2;
  const VectorRegister vt3 = VR3;
  const VectorSRegister vst3 = vt3->to_vsr();
  const VectorRegister vt4 = VR4;

  // load to k[j]
  lvx (vt0, j, k);

  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w1, w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w3, w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w0, w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w2, w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi (vt3, w3, w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld (vst3, vsrt1, vst3);
#else
  xxmrghd (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to v9 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm (kpw0, vt2, vt0);

  vsldoi (kpw1, kpw0, kpw0, 12);
  vsldoi (kpw2, kpw0, kpw0, 8);
  vsldoi (kpw3, kpw0, kpw0, 4);
#else
  vadduwm (kpw3, vt2, vt0);

  vsldoi (kpw2, kpw3, kpw3, 12);
  vsldoi (kpw1, kpw3, kpw3, 8);
  vsldoi (kpw0, kpw3, kpw3, 4);
#endif
}

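// sha256_update_sha_state packs the working variables back into two vectors
// ({a,b,c,d} and {e,f,g,h}), adds them to the previous hash value read from
// hptr (with the same unaligned-load handling as above), and stores the
// updated eight words back with stxvd2x, which tolerates any alignment.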
void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  lvx   (vt0, hptr);
  lvx   (vt5, of16, hptr);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li        (of32, 32);
  load_perm (vRb, hptr);

  vec_perm  (vt0, vt5, vRb);        // vt0 = hptr[0]..hptr[3]

  lvx       (vt1, hptr, of32);
  vec_perm  (vt5, vt1, vRb);        // vt5 = hptr[4]..hptr[7]

  // aligned accesses
  bind(state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);             // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);              // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);              // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);              // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1);            // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3);            // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);             // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);              // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);              // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);              // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d, vt0, vt1);            // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h, vt5, vt3);            // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}

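// SHA-256 round constants k[0..63]: the first 32 bits of the fractional
// parts of the cube roots of the first 64 primes (FIPS 180-4, section 4.2.2),
// kept 16-byte aligned so they can be fetched with plain lvx.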
static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
static const uint32_t *sha256_round_consts = sha256_round_table;

// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - int[] SHA.state (at first, the root of primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 - k
// R8 - tmp | j | of16
// R9 - of32
// VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
// VR9-VR16 - a-h
// VR17-VR20 - w0-w3
// VR21-VR23 - vRb | vaux0-vaux2
// VR24-VR27 - kpw0-kpw3
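//
// Overall flow of the generated stub: save the non-volatile VRs it uses,
// load the hash state, then for each 64-byte block load w[0..15] together
// with k, run the first 16 rounds, run the remaining 48 rounds in the
// counted loop while extending the schedule with sha256_calc_4w, and fold
// the result back into SHA.state.  With multi_block set, it keeps consuming
// blocks until ofs > limit and returns the updated offset in R3_RET.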
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t buf_size = 64;
  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
    sha256_round_consts = (const uint32_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha256_round_consts, R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load 16 elements from w out of the loop.
  // Order of the int values is Endianness specific.
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop the 16th to the 64th iteration by 8 steps
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = R8;
  li (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}

/**********************************************************************
 * SHA 512
 *********************************************************************/

void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];
    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (aux, tmp);
  vec_perm(ws[total_ws-1], aux, vRb);
  b    (after_alignment);

  bind(is_aligned);
  lvx (ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_alignment);
}

// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    load_perm(vRb, state);
    lvx  (ini_a, state);
    addi (addr, state, 16);

    lvx  (ini_c, addr);
    addi (addr, state, 32);
    vec_perm(ini_a, ini_c, vRb);

    lvx  (ini_e, addr);
    addi (addr, state, 48);
    vec_perm(ini_c, ini_e, vRb);

    lvx  (ini_g, addr);
    addi (addr, state, 64);
    vec_perm(ini_e, ini_g, vRb);

    lvx  (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std    (tmp, 8*n + 8, state);
#else
      std    (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b      (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur  = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur  = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      if (n/2 == 0) {
        lvx(ini_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        lvx(ini_cur, addr);
      }
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      if (n/2 == 0) {
        stvx(h_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        stvx(h_cur, addr);
      }
    }
  }

  bind(after_state_save_aligned);
}

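// The SHA-512 code below mirrors the SHA-256 code above: the round function
// and message schedule have the same shape but operate on 64-bit words, so
// the doubleword forms (vaddudm, vshasigmad, xxmrgld) are used and each
// vector register carries two state/schedule words instead of four.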
// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel       (Ch, g, f, e);
  vxor       (Maj, a, b);
  vshasigmad (bse, e, 1, 0xf);
  vaddudm    (tmp2, Ch, kpw);
  vaddudm    (tmp1, h, bse);
  vsel       (Maj, b, c, Maj);
  vaddudm    (tmp1, tmp1, tmp2);
  vshasigmad (bsa, a, 1, 0);
  vaddudm    (tmp2, bsa, Maj);
  vaddudm    (d, d, tmp1);
  vaddudm    (h, tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

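// sha512_calc_2w extends the SHA-512 message schedule two doublewords at a
// time:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// with the last sixteen schedule words held in w0..w7.  On return kpw0 and
// kpw1 carry the two k[j] + w[j] sums consumed by the next two calls to
// sha512_round.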
void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx  (VR_a, j, k);
  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w1, w0, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w5, w4, vRb);
#else
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w0, w1, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w4, w5, vRb);
#endif

  // v6 = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // v5 = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm (VR_b, VR_b, VR_c);
  // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm (VR_d, VR_d, w0);
  // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //      s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, w4);
  vmr (w4, w5);
  vmr (w5, w6);
  vmr (w6, w7);
  vmr (w7, VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw1, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi (kpw0, kpw1, kpw1, 8);
#endif
}

void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];
  int start_idx = 0;
#else
  VectorRegister a = hs[1];
  VectorRegister g = hs[7];
  int start_idx = 1;
#endif

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx(hs[start_idx], state);
  load_perm(vRb, state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
    vec_perm(h_prev2, h_cur, vRb);
  }
  addi(addr, state, (total_hs/2) * 16);
  lvx (aux, addr);
  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
  b   (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  lvx(hs[start_idx], state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
  }

  bind(after_state_aligned);
}

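// SHA-512 round constants k[0..79]: the first 64 bits of the fractional
// parts of the cube roots of the first 80 primes (FIPS 180-4, section 4.2.3),
// kept 16-byte aligned for lvx.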
static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
  0x428a2f98d728ae22, 0x7137449123ef65cd,
  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
  0x3956c25bf348b538, 0x59f111f1b605d019,
  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
  0xd807aa98a3030242, 0x12835b0145706fbe,
  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
  0x9bdc06a725c71235, 0xc19bf174cf692694,
  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
  0x983e5152ee66dfab, 0xa831c66d2db43210,
  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
  0x06ca6351e003826f, 0x142929670a0e6e70,
  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
  0x650a73548baf63de, 0x766a0abb3c77b2a8,
  0x81c2c92e47edaee6, 0x92722c851482353b,
  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
  0xc24b8b70d0f89791, 0xc76c51a30654be30,
  0xd192e819d6ef5218, 0xd69906245565a910,
  0xf40e35855771202a, 0x106aa07032bbd1b8,
  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
  0x748f82ee5defb2fc, 0x78a5636f43172f60,
  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
  0x90befffa23631e28, 0xa4506cebde82bde9,
  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
  0xca273eceea26619c, 0xd186b8c721c0c207,
  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
  0x113f9804bef90dae, 0x1b710b35131c471b,
  0x28db77f523047d84, 0x32caab7b40c72493,
  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
static const uint64_t *sha512_round_consts = sha512_round_table;

// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - int[] SHA.state (at first, the root of primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 R8 R9 - volatile temporaries
// VR0-VR7 - a-h
// VR8 - vRb
// VR9 - aux (highly volatile, use with care)
// VR10-VR17 - w0-w7 | ini_a-ini_h
// VR18 - vsp16 | kplusw0
// VR19 - vsp32 | kplusw1
// VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t buf_size = 128;
  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
    sha512_round_consts = (const uint64_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  Register k = R9;
  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha512_round_consts, R0);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur  = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  // Load 16 elements from w out of the loop.
  // Order of the long values is Endianness specific.
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16    = VR18;
  VectorRegister vsp32    = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16, 8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16, vsp16, shiftarg);
  vsl     (vsp32, vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8, 8);

  // Convert input from Big Endian to Little Endian
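  // (done with three rotates: rotating each halfword left by 8, each word by
  // 16 and each doubleword by 32 reverses the byte order within every 64-bit
  // lane, i.e. performs the required 8-byte byte swap)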
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld (w, w, vsp32);
  }
#endif

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li        (Rb, 8);
  load_perm (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    if (n == 0) {
      lvx  (kplusw0, k);
    } else {
      addi (addr, k, n * 16);
      lvx  (kplusw0, addr);
    }
#if defined(VM_LITTLE_ENDIAN)
    vaddudm(kplusw0, kplusw0, w);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm(kplusw1, kplusw0, w);
    vsldoi (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = tmp;
  li (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}