GitHub Repository: PojavLauncherTeam/openjdk-multiarch-jdk8u
Path: blob/aarch64-shenandoah-jdk8u272-b10/hotspot/src/cpu/ppc/vm/macroAssembler_ppc_sha.cpp
// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
26
#include "runtime/stubRoutines.hpp"
27
28
/**********************************************************************
29
* SHA 256
30
*********************************************************************/
31
32
void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

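// sha256_round performs one round of the SHA-256 compression function
// (FIPS 180-4), with kpw already holding k[t] + w[t]:
//   Ch(e,f,g)  = (e & f) ^ (~e & g)            -- vsel(ch, g, f, e)
//   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)   -- vxor + vsel on b/c
//   T1 = h + Sigma1(e) + Ch(e,f,g) + kpw
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d += T1;  h = T1 + T2
// Sigma0/Sigma1 come from vshasigmaw with st = 1.  Instead of moving the
// eight working variables around, h_cnt advances so the role played by each
// register in hs[] rotates from one round to the next.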
void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch, g, f, e);
  vxor       (maj, a, b);
  vshasigmaw (bse, e, 1, 0xf);
  vadduwm    (vt2, ch, kpw);
  vadduwm    (vt1, h, bse);
  vsel       (maj, b, c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a, 1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d, d, vt3);
  vadduwm    (h, vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

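// sha256_load_h_vec loads the eight 32-bit state words at hptr into the two
// vector registers passed as a and e (state[0..3] and state[4..7]).  When
// hptr is not 16-byte aligned, an extra 16-byte load plus the permute vector
// derived from the address (load_perm / vec_perm) realigns the data in
// registers.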
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned;

  andi_ (tmp, hptr, 0xf);
  lvx   (a, hptr);
  addi  (tmp, hptr, 16);
  lvx   (e, tmp);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  load_perm(vRb, hptr);
  addi (tmp, hptr, 32);
  vec_perm(a, e, vRb);

  lvx  (vt0, tmp);
  vec_perm(e, vt0, vRb);

  // aligned accesses
  bind(sha256_aligned);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (vt0, tmp);
  vec_perm(ws[total_ws-1], vt0, vRb);
  b    (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  lvx(ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Loading k, which is always aligned to 16-bytes
  lvx    (kpws[0], k);
  for (int n = 1; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    addi (tmp, k, 16 * n);
    lvx  (kpw, tmp);
  }

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w = ws[n];

    vadduwm (kpw, kpw, w);
  }
}

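// sha256_calc_4w extends the message schedule four words at a time:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// (lowercase sigma functions, vshasigmaw with st = 0).  The last sixteen
// schedule words live in w0..w3; on return kpw0..kpw3 hold rotations of the
// vector k[j..j+3] + w[j..j+3], one per upcoming call to sha256_round.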
void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister vt0 = VR0;
  const VectorRegister vt1 = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister vt2 = VR2;
  const VectorRegister vt3 = VR3;
  const VectorSRegister vst3 = vt3->to_vsr();
  const VectorRegister vt4 = VR4;

  // load to k[j]
  lvx (vt0, j, k);

  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w1, w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w3, w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w0, w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w2, w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi (vt3, w3, w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld (vst3, vsrt1, vst3);
#else
  xxmrghd (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to v9 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm (kpw0, vt2, vt0);

  vsldoi (kpw1, kpw0, kpw0, 12);
  vsldoi (kpw2, kpw0, kpw0, 8);
  vsldoi (kpw3, kpw0, kpw0, 4);
#else
  vadduwm (kpw3, vt2, vt0);

  vsldoi (kpw2, kpw3, kpw3, 12);
  vsldoi (kpw1, kpw3, kpw3, 8);
  vsldoi (kpw0, kpw3, kpw3, 4);
#endif
}

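// sha256_update_sha_state packs the working variables back into two vectors
// ({a,b,c,d} and {e,f,g,h}), adds them to the previous hash value read from
// hptr (with the same unaligned-load handling as above), and stores the
// updated eight words back with stxvd2x, which tolerates any alignment.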
void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  lvx   (vt0, hptr);
  lvx   (vt5, of16, hptr);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li        (of32, 32);
  load_perm (vRb, hptr);

  vec_perm  (vt0, vt5, vRb);        // vt0 = hptr[0]..hptr[3]

  lvx       (vt1, hptr, of32);
  vec_perm  (vt5, vt1, vRb);        // vt5 = hptr[4]..hptr[7]

  // aligned accesses
  bind(state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);             // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);              // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);              // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);              // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1);            // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3);            // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);             // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);              // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);              // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);              // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d, vt0, vt1);            // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h, vt5, vt3);            // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}

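// SHA-256 round constants k[0..63]: the first 32 bits of the fractional
// parts of the cube roots of the first 64 primes (FIPS 180-4, section 4.2.2),
// kept 16-byte aligned so they can be fetched with plain lvx.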
static const uint32_t sha256_round_table[64] __attribute((aligned(16))) = {
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
static const uint32_t *sha256_round_consts = sha256_round_table;

// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - int[] SHA.state (at first, the root of primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 - k
// R8 - tmp | j | of16
// R9 - of32
// VR0-VR8 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
// VR9-VR16 - a-h
// VR17-VR20 - w0-w3
// VR21-VR23 - vRb | vaux0-vaux2
// VR24-VR27 - kpw0-kpw3
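//
// Overall flow of the generated stub: save the non-volatile VRs it uses,
// load the hash state, then for each 64-byte block load w[0..15] together
// with k, run the first 16 rounds, run the remaining 48 rounds in the
// counted loop while extending the schedule with sha256_calc_4w, and fold
// the result back into SHA.state.  With multi_block set, it keeps consuming
// blocks until ofs > limit and returns the updated offset in R3_RET.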
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t buf_size = 64;
  static const uint8_t w_size = sizeof(sha256_round_table)/sizeof(uint32_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha256_round_consts & 0xF) != 0) {
    uint32_t *new_round_consts = (uint32_t*)malloc(sizeof(sha256_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha256_round_consts, sizeof(sha256_round_table));
    sha256_round_consts = (const uint32_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  assert(((uintptr_t)sha256_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha256_round_consts, R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load 16 elements from w out of the loop.
  // Order of the int values is Endianness specific.
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop the 16th to the 64th iteration by 8 steps
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = R8;
  li (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}

/**********************************************************************
 * SHA 512
 *********************************************************************/

void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx (ws[0], buf_in);
  load_perm(vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];
    addi (tmp, buf_in, n * 16);
    lvx  (w_cur, tmp);
    vec_perm(w_prev, w_cur, vRb);
  }
  addi (tmp, buf_in, total_ws * 16);
  lvx  (aux, tmp);
  vec_perm(ws[total_ws-1], aux, vRb);
  b    (after_alignment);

  bind(is_aligned);
  lvx (ws[0], buf_in);
  for (int n = 1; n < total_ws; n++) {
    VectorRegister w = ws[n];
    addi (tmp, buf_in, n * 16);
    lvx  (w, tmp);
  }

  bind(after_alignment);
}

// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    load_perm(vRb, state);
    lvx  (ini_a, state);
    addi (addr, state, 16);

    lvx  (ini_c, addr);
    addi (addr, state, 32);
    vec_perm(ini_a, ini_c, vRb);

    lvx  (ini_e, addr);
    addi (addr, state, 48);
    vec_perm(ini_c, ini_e, vRb);

    lvx  (ini_g, addr);
    addi (addr, state, 64);
    vec_perm(ini_e, ini_g, vRb);

    lvx  (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std    (tmp, 8*n + 8, state);
#else
      std    (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b      (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur  = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur  = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      if (n/2 == 0) {
        lvx(ini_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        lvx(ini_cur, addr);
      }
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      if (n/2 == 0) {
        stvx(h_cur, state);
      } else {
        addi(addr, state, (n/2) * 16);
        stvx(h_cur, addr);
      }
    }
  }

  bind(after_state_save_aligned);
}

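// The SHA-512 code below mirrors the SHA-256 code above: the round function
// and message schedule have the same shape but operate on 64-bit words, so
// the doubleword forms (vaddudm, vshasigmad, xxmrgld) are used and each
// vector register carries two state/schedule words instead of four.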
// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel       (Ch, g, f, e);
  vxor       (Maj, a, b);
  vshasigmad (bse, e, 1, 0xf);
  vaddudm    (tmp2, Ch, kpw);
  vaddudm    (tmp1, h, bse);
  vsel       (Maj, b, c, Maj);
  vaddudm    (tmp1, tmp1, tmp2);
  vshasigmad (bsa, a, 1, 0);
  vaddudm    (tmp2, bsa, Maj);
  vaddudm    (d, d, tmp1);
  vaddudm    (h, tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

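// sha512_calc_2w extends the SHA-512 message schedule two doublewords at a
// time:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// with the last sixteen schedule words held in w0..w7.  On return kpw0 and
// kpw1 carry the two k[j] + w[j] sums consumed by the next two calls to
// sha512_round.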
void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx  (VR_a, j, k);
  // advance j
  addi (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w1, w0, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w5, w4, vRb);
#else
  // v6 = w[j-15], w[j-14]
  vperm (VR_b, w0, w1, vRb);
  // v12 = w[j-7], w[j-6]
  vperm (VR_c, w4, w5, vRb);
#endif

  // v6 = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // v5 = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm (VR_b, VR_b, VR_c);
  // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm (VR_d, VR_d, w0);
  // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //      s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, w4);
  vmr (w4, w5);
  vmr (w5, w6);
  vmr (w6, w7);
  vmr (w7, VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw1, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi (kpw0, kpw1, kpw1, 8);
#endif
}

void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];
  int start_idx = 0;
#else
  VectorRegister a = hs[1];
  VectorRegister g = hs[7];
  int start_idx = 1;
#endif

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx(hs[start_idx], state);
  load_perm(vRb, state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
    vec_perm(h_prev2, h_cur, vRb);
  }
  addi(addr, state, (total_hs/2) * 16);
  lvx (aux, addr);
  vec_perm(hs[total_hs - 2 + start_idx], aux, vRb);
  b   (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  lvx(hs[start_idx], state);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];
    addi(addr, state, (n/2) * 16);
    lvx(h_cur, addr);
  }

  bind(after_state_aligned);
}

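// SHA-512 round constants k[0..79]: the first 64 bits of the fractional
// parts of the cube roots of the first 80 primes (FIPS 180-4, section 4.2.3),
// kept 16-byte aligned for lvx.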
static const uint64_t sha512_round_table[80] __attribute((aligned(16))) = {
  0x428a2f98d728ae22, 0x7137449123ef65cd,
  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
  0x3956c25bf348b538, 0x59f111f1b605d019,
  0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
  0xd807aa98a3030242, 0x12835b0145706fbe,
  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
  0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
  0x9bdc06a725c71235, 0xc19bf174cf692694,
  0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
  0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
  0x983e5152ee66dfab, 0xa831c66d2db43210,
  0xb00327c898fb213f, 0xbf597fc7beef0ee4,
  0xc6e00bf33da88fc2, 0xd5a79147930aa725,
  0x06ca6351e003826f, 0x142929670a0e6e70,
  0x27b70a8546d22ffc, 0x2e1b21385c26c926,
  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
  0x650a73548baf63de, 0x766a0abb3c77b2a8,
  0x81c2c92e47edaee6, 0x92722c851482353b,
  0xa2bfe8a14cf10364, 0xa81a664bbc423001,
  0xc24b8b70d0f89791, 0xc76c51a30654be30,
  0xd192e819d6ef5218, 0xd69906245565a910,
  0xf40e35855771202a, 0x106aa07032bbd1b8,
  0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
  0x748f82ee5defb2fc, 0x78a5636f43172f60,
  0x84c87814a1f0ab72, 0x8cc702081a6439ec,
  0x90befffa23631e28, 0xa4506cebde82bde9,
  0xbef9a3f7b2c67915, 0xc67178f2e372532b,
  0xca273eceea26619c, 0xd186b8c721c0c207,
  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
  0x06f067aa72176fba, 0x0a637dc5a2c898a6,
  0x113f9804bef90dae, 0x1b710b35131c471b,
  0x28db77f523047d84, 0x32caab7b40c72493,
  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
  0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
static const uint64_t *sha512_round_consts = sha512_round_table;

// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - int[] SHA.state (at first, the root of primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 R8 R9 - volatile temporaries
// VR0-VR7 - a-h
// VR8 - vRb
// VR9 - aux (highly volatile, use with care)
// VR10-VR17 - w0-w7 | ini_a-ini_h
// VR18 - vsp16 | kplusw0
// VR19 - vsp32 | kplusw1
// VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t buf_size = 128;
  static const uint8_t w_size = sizeof(sha512_round_table)/sizeof(uint64_t);
#ifdef AIX
  // malloc provides 16 byte alignment
  if (((uintptr_t)sha512_round_consts & 0xF) != 0) {
    uint64_t *new_round_consts = (uint64_t*)malloc(sizeof(sha512_round_table));
    guarantee(new_round_consts, "oom");
    memcpy(new_round_consts, sha512_round_consts, sizeof(sha512_round_table));
    sha512_round_consts = (const uint64_t*)new_round_consts;
  }
#endif

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  Register k = R9;
  assert(((uintptr_t)sha512_round_consts & 0xF) == 0, "k alignment");
  load_const_optimized(k, (address)sha512_round_consts, R0);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur  = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  // Load 16 elements from w out of the loop.
  // Order of the long values is Endianness specific.
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16    = VR18;
  VectorRegister vsp32    = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16, 8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16, vsp16, shiftarg);
  vsl     (vsp32, vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8, 8);

  // Convert input from Big Endian to Little Endian
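  // (done with three rotates: rotating each halfword left by 8, each word by
  // 16 and each doubleword by 32 reverses the byte order within every 64-bit
  // lane, i.e. performs the required 8-byte byte swap)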
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld (w, w, vsp32);
  }
#endif

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li        (Rb, 8);
  load_perm (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    if (n == 0) {
      lvx  (kplusw0, k);
    } else {
      addi (addr, k, n * 16);
      lvx  (kplusw0, addr);
    }
#if defined(VM_LITTLE_ENDIAN)
    vaddudm(kplusw0, kplusw0, w);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm(kplusw1, kplusw0, w);
    vsldoi (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g.: when j is used in a function)
  Register j = tmp;
  li (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    addi(buf_in, buf_in, buf_size);
    addi(ofs, ofs, buf_size);
    cmplw(CCR0, ofs, limit);
    ble(CCR0, sha_loop);

    // return ofs
    mr(R3_RET, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}