GitHub Repository: PojavLauncherTeam/jdk17u
Path: blob/master/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
1
/*
2
* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
3
* Copyright (c) 2012, 2022 SAP SE. All rights reserved.
4
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
*
6
* This code is free software; you can redistribute it and/or modify it
7
* under the terms of the GNU General Public License version 2 only, as
8
* published by the Free Software Foundation.
9
*
10
* This code is distributed in the hope that it will be useful, but WITHOUT
11
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
* version 2 for more details (a copy is included in the LICENSE file that
14
* accompanied this code).
15
*
16
* You should have received a copy of the GNU General Public License version
17
* 2 along with this work; if not, write to the Free Software Foundation,
18
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
*
20
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
* or visit www.oracle.com if you need additional information or have any
22
* questions.
23
*
24
*/
25
26
#include "precompiled.hpp"
27
#include "asm/macroAssembler.inline.hpp"
28
#include "compiler/disassembler.hpp"
29
#include "gc/shared/collectedHeap.inline.hpp"
30
#include "gc/shared/barrierSet.hpp"
31
#include "gc/shared/barrierSetAssembler.hpp"
32
#include "interpreter/interpreter.hpp"
33
#include "memory/resourceArea.hpp"
34
#include "nativeInst_ppc.hpp"
35
#include "oops/klass.inline.hpp"
36
#include "oops/methodData.hpp"
37
#include "prims/methodHandles.hpp"
38
#include "runtime/biasedLocking.hpp"
39
#include "runtime/icache.hpp"
40
#include "runtime/interfaceSupport.inline.hpp"
41
#include "runtime/objectMonitor.hpp"
42
#include "runtime/os.hpp"
43
#include "runtime/safepoint.hpp"
44
#include "runtime/safepointMechanism.hpp"
45
#include "runtime/sharedRuntime.hpp"
46
#include "runtime/stubRoutines.hpp"
47
#include "runtime/vm_version.hpp"
48
#include "utilities/macros.hpp"
49
#include "utilities/powerOfTwo.hpp"
50
51
#ifdef PRODUCT
52
#define BLOCK_COMMENT(str) // nothing
53
#else
54
#define BLOCK_COMMENT(str) block_comment(str)
55
#endif
56
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58
#ifdef ASSERT
59
// On RISC, there's no benefit to verifying instruction boundaries.
60
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
61
#endif
62
63
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
64
assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
65
if (Assembler::is_simm(si31, 16)) {
66
ld(d, si31, a);
67
if (emit_filler_nop) nop();
68
} else {
69
const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
70
const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
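// The split satisfies (hi << 16) + (signed)lo == si31; hi is rounded up when
// lo's sign bit is set, e.g. si31 = 0x12348000 gives hi = 0x1235, lo = -0x8000.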
71
addis(d, a, hi);
72
ld(d, lo, d);
73
}
74
}
75
76
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
77
assert_different_registers(d, a);
78
ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
79
}
80
81
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
82
size_t size_in_bytes, bool is_signed) {
83
switch (size_in_bytes) {
84
case 8: ld(dst, offs, base); break;
85
case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
86
case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
87
case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
88
default: ShouldNotReachHere();
89
}
90
}
91
92
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
93
size_t size_in_bytes) {
94
switch (size_in_bytes) {
95
case 8: std(dst, offs, base); break;
96
case 4: stw(dst, offs, base); break;
97
case 2: sth(dst, offs, base); break;
98
case 1: stb(dst, offs, base); break;
99
default: ShouldNotReachHere();
100
}
101
}
102
103
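// Pad with nops until offset() % modulus == rem; emit nothing if more than
// max bytes of padding would be needed.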
void MacroAssembler::align(int modulus, int max, int rem) {
104
int padding = (rem + modulus - (offset() % modulus)) % modulus;
105
if (padding > max) return;
106
for (int c = (padding >> 2); c > 0; --c) { nop(); }
107
}
108
109
// Issue instructions that calculate given TOC from global TOC.
110
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
111
bool add_relocation, bool emit_dummy_addr) {
112
int offset = -1;
113
if (emit_dummy_addr) {
114
offset = -128; // dummy address
115
} else if (addr != (address)(intptr_t)-1) {
116
offset = MacroAssembler::offset_to_global_toc(addr);
117
}
118
119
if (hi16) {
120
addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
121
}
122
if (lo16) {
123
if (add_relocation) {
124
// Relocate at the addi to avoid confusion with a load from the method's TOC.
125
relocate(internal_word_Relocation::spec(addr));
126
}
127
addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
128
}
129
}
130
131
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
132
const int offset = MacroAssembler::offset_to_global_toc(addr);
133
134
const address inst2_addr = a;
135
const int inst2 = *(int *)inst2_addr;
136
137
// The relocation points to the second instruction, the addi,
138
// and the addi reads and writes the same register dst.
139
const int dst = inv_rt_field(inst2);
140
assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
141
142
// Now, find the preceding addis which writes to dst.
143
int inst1 = 0;
144
address inst1_addr = inst2_addr - BytesPerInstWord;
145
while (inst1_addr >= bound) {
146
inst1 = *(int *) inst1_addr;
147
if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
148
// Stop, found the addis which writes dst.
149
break;
150
}
151
inst1_addr -= BytesPerInstWord;
152
}
153
154
assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
155
set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
156
set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
157
return inst1_addr;
158
}
159
160
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
161
const address inst2_addr = a;
162
const int inst2 = *(int *)inst2_addr;
163
164
// The relocation points to the second instruction, the addi,
165
// and the addi reads and writes the same register dst.
166
const int dst = inv_rt_field(inst2);
167
assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
168
169
// Now, find the preceding addis which writes to dst.
170
int inst1 = 0;
171
address inst1_addr = inst2_addr - BytesPerInstWord;
172
while (inst1_addr >= bound) {
173
inst1 = *(int *) inst1_addr;
174
if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
175
// stop, found the addis which writes dst
176
break;
177
}
178
inst1_addr -= BytesPerInstWord;
179
}
180
181
assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
182
183
int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
184
// -1 is a special case
185
if (offset == -1) {
186
return (address)(intptr_t)-1;
187
} else {
188
return global_toc() + offset;
189
}
190
}
191
192
#ifdef _LP64
193
// Patch compressed oops or klass constants.
194
// Assembler sequence is
195
// 1) compressed oops:
196
// lis rx = const.hi
197
// ori rx = rx | const.lo
198
// 2) compressed klass:
199
// lis rx = const.hi
200
// clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
201
// ori rx = rx | const.lo
202
// The clrldi is skipped over when patching.
203
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
204
assert(UseCompressedOops, "Should only patch compressed oops");
205
206
const address inst2_addr = a;
207
const int inst2 = *(int *)inst2_addr;
208
209
// The relocation points to the second instruction, the ori,
210
// and the ori reads and writes the same register dst.
211
const int dst = inv_rta_field(inst2);
212
assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
213
// Now, find the preceding addis which writes to dst.
214
int inst1 = 0;
215
address inst1_addr = inst2_addr - BytesPerInstWord;
216
bool inst1_found = false;
217
while (inst1_addr >= bound) {
218
inst1 = *(int *)inst1_addr;
219
if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
220
inst1_addr -= BytesPerInstWord;
221
}
222
assert(inst1_found, "inst is not lis");
223
224
uint32_t data_value = CompressedOops::narrow_oop_value(data);
225
int xc = (data_value >> 16) & 0xffff;
226
int xd = (data_value >> 0) & 0xffff;
227
228
set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
229
set_imm((int *)inst2_addr, (xd)); // unsigned int
230
return inst1_addr;
231
}
232
233
// Get compressed oop or klass constant.
234
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
235
assert(UseCompressedOops, "Should only patch compressed oops");
236
237
const address inst2_addr = a;
238
const int inst2 = *(int *)inst2_addr;
239
240
// The relocation points to the second instruction, the ori,
241
// and the ori reads and writes the same register dst.
242
const int dst = inv_rta_field(inst2);
243
assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
244
// Now, find the preceding lis which writes to dst.
245
int inst1 = 0;
246
address inst1_addr = inst2_addr - BytesPerInstWord;
247
bool inst1_found = false;
248
249
while (inst1_addr >= bound) {
250
inst1 = *(int *) inst1_addr;
251
if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
252
inst1_addr -= BytesPerInstWord;
253
}
254
assert(inst1_found, "inst is not lis");
255
256
uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
257
uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
258
259
return CompressedOops::narrow_oop_cast(xl | xh);
260
}
261
#endif // _LP64
262
263
// Returns true if successful.
264
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
265
Register toc, bool fixed_size) {
266
int toc_offset = 0;
267
// Use RelocationHolder::none for the constant pool entry, otherwise
268
// we will end up with a failing NativeCall::verify(x) where x is
269
// the address of the constant pool entry.
270
// FIXME: We should insert relocation information for oops at the constant
271
// pool entries instead of inserting it at the loads; patching of a constant
272
// pool entry should be less expensive.
273
address const_address = address_constant((address)a.value(), RelocationHolder::none);
274
if (const_address == NULL) { return false; } // allocation failure
275
// Relocate at the pc of the load.
276
relocate(a.rspec());
277
toc_offset = (int)(const_address - code()->consts()->start());
278
ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
279
return true;
280
}
281
282
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
283
const address inst1_addr = a;
284
const int inst1 = *(int *)inst1_addr;
285
286
// The relocation points to the ld or the addis.
287
return (is_ld(inst1)) ||
288
(is_addis(inst1) && inv_ra_field(inst1) != 0);
289
}
290
291
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
292
assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
293
294
const address inst1_addr = a;
295
const int inst1 = *(int *)inst1_addr;
296
297
if (is_ld(inst1)) {
298
return inv_d1_field(inst1);
299
} else if (is_addis(inst1)) {
300
const int dst = inv_rt_field(inst1);
301
302
// Now, find the succeeding ld which reads and writes to dst.
303
address inst2_addr = inst1_addr + BytesPerInstWord;
304
int inst2 = 0;
305
while (true) {
306
inst2 = *(int *) inst2_addr;
307
if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
308
// Stop, found the ld which reads and writes dst.
309
break;
310
}
311
inst2_addr += BytesPerInstWord;
312
}
313
return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
314
}
315
ShouldNotReachHere();
316
return 0;
317
}
318
319
// Get the constant from a `load_const' sequence.
320
long MacroAssembler::get_const(address a) {
321
assert(is_load_const_at(a), "not a load of a constant");
322
const int *p = (const int*) a;
323
unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
324
if (is_ori(*(p+1))) {
325
x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
326
x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
327
x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
328
} else if (is_lis(*(p+1))) {
329
x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
330
x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
331
x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
332
} else {
333
ShouldNotReachHere();
334
return (long) 0;
335
}
336
return (long) x;
337
}
338
339
// Patch the 64 bit constant of a `load_const' sequence. This is a low
340
// level procedure. It neither flushes the instruction cache nor is it
341
// MT-safe.
342
void MacroAssembler::patch_const(address a, long x) {
343
assert(is_load_const_at(a), "not a load of a constant");
344
int *p = (int*) a;
345
if (is_ori(*(p+1))) {
346
set_imm(0 + p, (x >> 48) & 0xffff);
347
set_imm(1 + p, (x >> 32) & 0xffff);
348
set_imm(3 + p, (x >> 16) & 0xffff);
349
set_imm(4 + p, x & 0xffff);
350
} else if (is_lis(*(p+1))) {
351
set_imm(0 + p, (x >> 48) & 0xffff);
352
set_imm(2 + p, (x >> 32) & 0xffff);
353
set_imm(1 + p, (x >> 16) & 0xffff);
354
set_imm(3 + p, x & 0xffff);
355
} else {
356
ShouldNotReachHere();
357
}
358
}
359
360
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
361
assert(oop_recorder() != NULL, "this assembler needs a Recorder");
362
int index = oop_recorder()->allocate_metadata_index(obj);
363
RelocationHolder rspec = metadata_Relocation::spec(index);
364
return AddressLiteral((address)obj, rspec);
365
}
366
367
AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
368
assert(oop_recorder() != NULL, "this assembler needs a Recorder");
369
int index = oop_recorder()->find_index(obj);
370
RelocationHolder rspec = metadata_Relocation::spec(index);
371
return AddressLiteral((address)obj, rspec);
372
}
373
374
AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
375
assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
376
int oop_index = oop_recorder()->allocate_oop_index(obj);
377
return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
378
}
379
380
AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
381
assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
382
int oop_index = oop_recorder()->find_index(obj);
383
return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
384
}
385
386
#ifndef PRODUCT
387
void MacroAssembler::pd_print_patched_instruction(address branch) {
388
Unimplemented(); // TODO: PPC port
389
}
390
#endif // ndef PRODUCT
391
392
// Conditional far branch for destinations encodable in 24+2 bits.
393
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
394
395
// If requested by flag optimize, relocate the bc_far as a
396
// runtime_call and prepare for optimizing it when the code gets
397
// relocated.
398
if (optimize == bc_far_optimize_on_relocate) {
399
relocate(relocInfo::runtime_call_type);
400
}
401
402
// variant 2:
403
//
404
// b!cxx SKIP
405
// bxx DEST
406
// SKIP:
407
//
408
409
const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
410
opposite_bcond(inv_boint_bcond(boint)));
411
412
// We emit two branches.
413
// First, a conditional branch which jumps around the far branch.
414
const address not_taken_pc = pc() + 2 * BytesPerInstWord;
415
const address bc_pc = pc();
416
bc(opposite_boint, biint, not_taken_pc);
417
418
const int bc_instr = *(int*)bc_pc;
419
assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
420
assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
421
assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
422
opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
423
"postcondition");
424
assert(biint == inv_bi_field(bc_instr), "postcondition");
425
426
// Second, an unconditional far branch which jumps to dest.
427
// Note: target(dest) remembers the current pc (see CodeSection::target)
428
// and returns the current pc if the label is not bound yet; when
429
// the label gets bound, the unconditional far branch will be patched.
430
const address target_pc = target(dest);
431
const address b_pc = pc();
432
b(target_pc);
433
434
assert(not_taken_pc == pc(), "postcondition");
435
assert(dest.is_bound() || target_pc == b_pc, "postcondition");
436
}
437
438
// 1 or 2 instructions
439
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
440
if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
441
bc(boint, biint, dest);
442
} else {
443
bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
444
}
445
}
446
447
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
448
return is_bc_far_variant1_at(instruction_addr) ||
449
is_bc_far_variant2_at(instruction_addr) ||
450
is_bc_far_variant3_at(instruction_addr);
451
}
452
453
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
454
if (is_bc_far_variant1_at(instruction_addr)) {
455
const address instruction_1_addr = instruction_addr;
456
const int instruction_1 = *(int*)instruction_1_addr;
457
return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
458
} else if (is_bc_far_variant2_at(instruction_addr)) {
459
const address instruction_2_addr = instruction_addr + 4;
460
return bxx_destination(instruction_2_addr);
461
} else if (is_bc_far_variant3_at(instruction_addr)) {
462
return instruction_addr + 8;
463
}
464
// variant 4 ???
465
ShouldNotReachHere();
466
return NULL;
467
}
468
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
469
470
if (is_bc_far_variant3_at(instruction_addr)) {
471
// variant 3, far cond branch to the next instruction, already patched to nops:
472
//
473
// nop
474
// endgroup
475
// SKIP/DEST:
476
//
477
return;
478
}
479
480
// first, extract boint and biint from the current branch
481
int boint = 0;
482
int biint = 0;
483
484
ResourceMark rm;
485
const int code_size = 2 * BytesPerInstWord;
486
CodeBuffer buf(instruction_addr, code_size);
487
MacroAssembler masm(&buf);
488
if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
489
// Far branch to next instruction: Optimize it by patching nops (produce variant 3).
490
masm.nop();
491
masm.endgroup();
492
} else {
493
if (is_bc_far_variant1_at(instruction_addr)) {
494
// variant 1, the 1st instruction contains the destination address:
495
//
496
// bcxx DEST
497
// nop
498
//
499
const int instruction_1 = *(int*)(instruction_addr);
500
boint = inv_bo_field(instruction_1);
501
biint = inv_bi_field(instruction_1);
502
} else if (is_bc_far_variant2_at(instruction_addr)) {
503
// variant 2, the 2nd instruction contains the destination address:
504
//
505
// b!cxx SKIP
506
// bxx DEST
507
// SKIP:
508
//
509
const int instruction_1 = *(int*)(instruction_addr);
510
boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
511
opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
512
biint = inv_bi_field(instruction_1);
513
} else {
514
// variant 4???
515
ShouldNotReachHere();
516
}
517
518
// second, set the new branch destination and optimize the code
519
if (dest != instruction_addr + 4 && // the bc_far is still unbound!
520
masm.is_within_range_of_bcxx(dest, instruction_addr)) {
521
// variant 1:
522
//
523
// bcxx DEST
524
// nop
525
//
526
masm.bc(boint, biint, dest);
527
masm.nop();
528
} else {
529
// variant 2:
530
//
531
// b!cxx SKIP
532
// bxx DEST
533
// SKIP:
534
//
535
const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
536
opposite_bcond(inv_boint_bcond(boint)));
537
const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
538
masm.bc(opposite_boint, biint, not_taken_pc);
539
masm.b(dest);
540
}
541
}
542
ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
543
}
544
545
// Emit a patchable 64 bit absolute call/jump (NOT MT-safe).
546
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
547
// get current pc
548
uint64_t start_pc = (uint64_t) pc();
549
550
const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
551
const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
552
553
// relocate here
554
if (rt != relocInfo::none) {
555
relocate(rt);
556
}
557
558
if ( ReoptimizeCallSequences &&
559
(( link && is_within_range_of_b(dest, pc_of_bl)) ||
560
(!link && is_within_range_of_b(dest, pc_of_b)))) {
561
// variant 2:
562
// Emit an optimized, pc-relative call/jump.
563
564
if (link) {
565
// some padding
566
nop();
567
nop();
568
nop();
569
nop();
570
nop();
571
nop();
572
573
// do the call
574
assert(pc() == pc_of_bl, "just checking");
575
bl(dest, relocInfo::none);
576
} else {
577
// do the jump
578
assert(pc() == pc_of_b, "just checking");
579
b(dest, relocInfo::none);
580
581
// some padding
582
nop();
583
nop();
584
nop();
585
nop();
586
nop();
587
nop();
588
}
589
590
// Assert that we can identify the emitted call/jump.
591
assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
592
"can't identify emitted call");
593
} else {
594
// variant 1:
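// Layout (7 instructions): mr R0,R11; addis R11,R29_TOC,hi; addi R11,R11,lo;
// mtctr R11; mr R11,R0; nop; bctr[l] -- see is_bxx64_patchable_variant1b_at().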
595
mr(R0, R11); // spill R11 -> R0.
596
597
// Load the destination address into CTR,
598
// calculate destination relative to global toc.
599
calculate_address_from_global_toc(R11, dest, true, true, false);
600
601
mtctr(R11);
602
mr(R11, R0); // spill R11 <- R0.
603
nop();
604
605
// do the call/jump
606
if (link) {
607
bctrl();
608
} else {
609
bctr();
610
}
611
// Assert that we can identify the emitted call/jump.
612
assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
613
"can't identify emitted call");
614
}
615
616
// Assert that we can identify the emitted call/jump.
617
assert(is_bxx64_patchable_at((address)start_pc, link),
618
"can't identify emitted call");
619
assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
620
"wrong encoding of dest address");
621
}
622
623
// Identify a bxx64_patchable instruction.
624
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
625
return is_bxx64_patchable_variant1b_at(instruction_addr, link)
626
//|| is_bxx64_patchable_variant1_at(instruction_addr, link)
627
|| is_bxx64_patchable_variant2_at(instruction_addr, link);
628
}
629
630
// Does the call64_patchable instruction use a pc-relative encoding of
631
// the call destination?
632
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
633
// variant 2 is pc-relative
634
return is_bxx64_patchable_variant2_at(instruction_addr, link);
635
}
636
637
// Identify variant 1.
638
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
639
unsigned int* instr = (unsigned int*) instruction_addr;
640
return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
641
&& is_mtctr(instr[5]) // mtctr
642
&& is_load_const_at(instruction_addr);
643
}
644
645
// Identify variant 1b: load destination relative to global toc.
646
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
647
unsigned int* instr = (unsigned int*) instruction_addr;
648
return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
649
&& is_mtctr(instr[3]) // mtctr
650
&& is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
651
}
652
653
// Identify variant 2.
654
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
655
unsigned int* instr = (unsigned int*) instruction_addr;
656
if (link) {
657
return is_bl (instr[6]) // bl dest is last
658
&& is_nop(instr[0]) // nop
659
&& is_nop(instr[1]) // nop
660
&& is_nop(instr[2]) // nop
661
&& is_nop(instr[3]) // nop
662
&& is_nop(instr[4]) // nop
663
&& is_nop(instr[5]); // nop
664
} else {
665
return is_b (instr[0]) // b dest is first
666
&& is_nop(instr[1]) // nop
667
&& is_nop(instr[2]) // nop
668
&& is_nop(instr[3]) // nop
669
&& is_nop(instr[4]) // nop
670
&& is_nop(instr[5]) // nop
671
&& is_nop(instr[6]); // nop
672
}
673
}
674
675
// Set dest address of a bxx64_patchable instruction.
676
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
677
ResourceMark rm;
678
int code_size = MacroAssembler::bxx64_patchable_size;
679
CodeBuffer buf(instruction_addr, code_size);
680
MacroAssembler masm(&buf);
681
masm.bxx64_patchable(dest, relocInfo::none, link);
682
ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
683
}
684
685
// Get dest address of a bxx64_patchable instruction.
686
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
687
if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
688
return (address) (unsigned long) get_const(instruction_addr);
689
} else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
690
unsigned int* instr = (unsigned int*) instruction_addr;
691
if (link) {
692
const int instr_idx = 6; // bl is last
693
int branchoffset = branch_destination(instr[instr_idx], 0);
694
return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
695
} else {
696
const int instr_idx = 0; // b is first
697
int branchoffset = branch_destination(instr[instr_idx], 0);
698
return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
699
}
700
// Load dest relative to global toc.
701
} else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
702
return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
703
instruction_addr);
704
} else {
705
ShouldNotReachHere();
706
return NULL;
707
}
708
}
709
710
void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
711
const int magic_number = 0x42;
712
713
// Preserve the stack pointer register (R1_SP) and the system thread id register (R13),
714
// although they're technically volatile.
715
for (int i = 2; i < 13; i++) {
716
Register reg = as_Register(i);
717
if (reg == excluded_register) {
718
continue;
719
}
720
721
li(reg, magic_number);
722
}
723
}
724
725
void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
726
const int magic_number = 0x43;
727
728
li(tmp, magic_number);
729
for (int m = 0; m <= 7; m++) {
730
std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
731
}
732
}
733
734
// Uses ordering which corresponds to ABI:
735
// _savegpr0_14: std r14,-144(r1)
736
// _savegpr0_15: std r15,-136(r1)
737
// _savegpr0_16: std r16,-128(r1)
738
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
739
std(R14, offset, dst); offset += 8;
740
std(R15, offset, dst); offset += 8;
741
std(R16, offset, dst); offset += 8;
742
std(R17, offset, dst); offset += 8;
743
std(R18, offset, dst); offset += 8;
744
std(R19, offset, dst); offset += 8;
745
std(R20, offset, dst); offset += 8;
746
std(R21, offset, dst); offset += 8;
747
std(R22, offset, dst); offset += 8;
748
std(R23, offset, dst); offset += 8;
749
std(R24, offset, dst); offset += 8;
750
std(R25, offset, dst); offset += 8;
751
std(R26, offset, dst); offset += 8;
752
std(R27, offset, dst); offset += 8;
753
std(R28, offset, dst); offset += 8;
754
std(R29, offset, dst); offset += 8;
755
std(R30, offset, dst); offset += 8;
756
std(R31, offset, dst); offset += 8;
757
758
stfd(F14, offset, dst); offset += 8;
759
stfd(F15, offset, dst); offset += 8;
760
stfd(F16, offset, dst); offset += 8;
761
stfd(F17, offset, dst); offset += 8;
762
stfd(F18, offset, dst); offset += 8;
763
stfd(F19, offset, dst); offset += 8;
764
stfd(F20, offset, dst); offset += 8;
765
stfd(F21, offset, dst); offset += 8;
766
stfd(F22, offset, dst); offset += 8;
767
stfd(F23, offset, dst); offset += 8;
768
stfd(F24, offset, dst); offset += 8;
769
stfd(F25, offset, dst); offset += 8;
770
stfd(F26, offset, dst); offset += 8;
771
stfd(F27, offset, dst); offset += 8;
772
stfd(F28, offset, dst); offset += 8;
773
stfd(F29, offset, dst); offset += 8;
774
stfd(F30, offset, dst); offset += 8;
775
stfd(F31, offset, dst);
776
}
777
778
// Uses ordering which corresponds to ABI:
779
// _restgpr0_14: ld r14,-144(r1)
780
// _restgpr0_15: ld r15,-136(r1)
781
// _restgpr0_16: ld r16,-128(r1)
782
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
783
ld(R14, offset, src); offset += 8;
784
ld(R15, offset, src); offset += 8;
785
ld(R16, offset, src); offset += 8;
786
ld(R17, offset, src); offset += 8;
787
ld(R18, offset, src); offset += 8;
788
ld(R19, offset, src); offset += 8;
789
ld(R20, offset, src); offset += 8;
790
ld(R21, offset, src); offset += 8;
791
ld(R22, offset, src); offset += 8;
792
ld(R23, offset, src); offset += 8;
793
ld(R24, offset, src); offset += 8;
794
ld(R25, offset, src); offset += 8;
795
ld(R26, offset, src); offset += 8;
796
ld(R27, offset, src); offset += 8;
797
ld(R28, offset, src); offset += 8;
798
ld(R29, offset, src); offset += 8;
799
ld(R30, offset, src); offset += 8;
800
ld(R31, offset, src); offset += 8;
801
802
// FP registers
803
lfd(F14, offset, src); offset += 8;
804
lfd(F15, offset, src); offset += 8;
805
lfd(F16, offset, src); offset += 8;
806
lfd(F17, offset, src); offset += 8;
807
lfd(F18, offset, src); offset += 8;
808
lfd(F19, offset, src); offset += 8;
809
lfd(F20, offset, src); offset += 8;
810
lfd(F21, offset, src); offset += 8;
811
lfd(F22, offset, src); offset += 8;
812
lfd(F23, offset, src); offset += 8;
813
lfd(F24, offset, src); offset += 8;
814
lfd(F25, offset, src); offset += 8;
815
lfd(F26, offset, src); offset += 8;
816
lfd(F27, offset, src); offset += 8;
817
lfd(F28, offset, src); offset += 8;
818
lfd(F29, offset, src); offset += 8;
819
lfd(F30, offset, src); offset += 8;
820
lfd(F31, offset, src);
821
}
822
823
// For verify_oops.
824
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
825
std(R2, offset, dst); offset += 8;
826
if (include_R3_RET_reg) {
827
std(R3, offset, dst); offset += 8;
828
}
829
std(R4, offset, dst); offset += 8;
830
std(R5, offset, dst); offset += 8;
831
std(R6, offset, dst); offset += 8;
832
std(R7, offset, dst); offset += 8;
833
std(R8, offset, dst); offset += 8;
834
std(R9, offset, dst); offset += 8;
835
std(R10, offset, dst); offset += 8;
836
std(R11, offset, dst); offset += 8;
837
std(R12, offset, dst); offset += 8;
838
839
if (include_fp_regs) {
840
stfd(F0, offset, dst); offset += 8;
841
stfd(F1, offset, dst); offset += 8;
842
stfd(F2, offset, dst); offset += 8;
843
stfd(F3, offset, dst); offset += 8;
844
stfd(F4, offset, dst); offset += 8;
845
stfd(F5, offset, dst); offset += 8;
846
stfd(F6, offset, dst); offset += 8;
847
stfd(F7, offset, dst); offset += 8;
848
stfd(F8, offset, dst); offset += 8;
849
stfd(F9, offset, dst); offset += 8;
850
stfd(F10, offset, dst); offset += 8;
851
stfd(F11, offset, dst); offset += 8;
852
stfd(F12, offset, dst); offset += 8;
853
stfd(F13, offset, dst);
854
}
855
}
856
857
// For verify_oops.
858
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
859
ld(R2, offset, src); offset += 8;
860
if (include_R3_RET_reg) {
861
ld(R3, offset, src); offset += 8;
862
}
863
ld(R4, offset, src); offset += 8;
864
ld(R5, offset, src); offset += 8;
865
ld(R6, offset, src); offset += 8;
866
ld(R7, offset, src); offset += 8;
867
ld(R8, offset, src); offset += 8;
868
ld(R9, offset, src); offset += 8;
869
ld(R10, offset, src); offset += 8;
870
ld(R11, offset, src); offset += 8;
871
ld(R12, offset, src); offset += 8;
872
873
if (include_fp_regs) {
874
lfd(F0, offset, src); offset += 8;
875
lfd(F1, offset, src); offset += 8;
876
lfd(F2, offset, src); offset += 8;
877
lfd(F3, offset, src); offset += 8;
878
lfd(F4, offset, src); offset += 8;
879
lfd(F5, offset, src); offset += 8;
880
lfd(F6, offset, src); offset += 8;
881
lfd(F7, offset, src); offset += 8;
882
lfd(F8, offset, src); offset += 8;
883
lfd(F9, offset, src); offset += 8;
884
lfd(F10, offset, src); offset += 8;
885
lfd(F11, offset, src); offset += 8;
886
lfd(F12, offset, src); offset += 8;
887
lfd(F13, offset, src);
888
}
889
}
890
891
void MacroAssembler::save_LR_CR(Register tmp) {
892
mfcr(tmp);
893
std(tmp, _abi0(cr), R1_SP);
894
mflr(tmp);
895
std(tmp, _abi0(lr), R1_SP);
896
// Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
897
}
898
899
void MacroAssembler::restore_LR_CR(Register tmp) {
900
assert(tmp != R1_SP, "must be distinct");
901
ld(tmp, _abi0(lr), R1_SP);
902
mtlr(tmp);
903
ld(tmp, _abi0(cr), R1_SP);
904
mtcr(tmp);
905
}
906
907
address MacroAssembler::get_PC_trash_LR(Register result) {
908
Label L;
909
bl(L);
910
bind(L);
911
address lr_pc = pc();
912
mflr(result);
913
return lr_pc;
914
}
915
916
void MacroAssembler::resize_frame(Register offset, Register tmp) {
917
#ifdef ASSERT
918
assert_different_registers(offset, tmp, R1_SP);
919
andi_(tmp, offset, frame::alignment_in_bytes-1);
920
asm_assert_eq("resize_frame: unaligned");
921
#endif
922
923
// tmp <- *(SP)
924
ld(tmp, _abi0(callers_sp), R1_SP);
925
// addr <- SP + offset;
926
// *(addr) <- tmp;
927
// SP <- addr
928
stdux(tmp, R1_SP, offset);
929
}
930
931
void MacroAssembler::resize_frame(int offset, Register tmp) {
932
assert(is_simm(offset, 16), "too big an offset");
933
assert_different_registers(tmp, R1_SP);
934
assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
935
// tmp <- *(SP)
936
ld(tmp, _abi0(callers_sp), R1_SP);
937
// addr <- SP + offset;
938
// *(addr) <- tmp;
939
// SP <- addr
940
stdu(tmp, offset, R1_SP);
941
}
942
943
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
944
// (addr == tmp1) || (addr == tmp2) is allowed here!
945
assert(tmp1 != tmp2, "must be distinct");
946
947
// compute offset w.r.t. current stack pointer
948
// tmp_1 <- addr - SP (!)
949
subf(tmp1, R1_SP, addr);
950
951
// atomically update SP keeping back link.
952
resize_frame(tmp1/* offset */, tmp2/* tmp */);
953
}
954
955
void MacroAssembler::push_frame(Register bytes, Register tmp) {
956
#ifdef ASSERT
957
assert(bytes != R0, "r0 not allowed here");
958
andi_(R0, bytes, frame::alignment_in_bytes-1);
959
asm_assert_eq("push_frame(Reg, Reg): unaligned");
960
#endif
961
neg(tmp, bytes);
962
stdux(R1_SP, R1_SP, tmp);
963
}
964
965
// Push a frame of size `bytes'.
966
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
967
long offset = align_addr(bytes, frame::alignment_in_bytes);
968
if (is_simm(-offset, 16)) {
969
stdu(R1_SP, -offset, R1_SP);
970
} else {
971
load_const_optimized(tmp, -offset);
972
stdux(R1_SP, R1_SP, tmp);
973
}
974
}
975
976
// Push a frame of size `bytes' plus abi_reg_args on top.
977
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
978
push_frame(bytes + frame::abi_reg_args_size, tmp);
979
}
980
981
// Set up a new C frame with a spill area for non-volatile GPRs and
982
// additional space for local variables.
983
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
984
Register tmp) {
985
push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
986
}
987
988
// Pop current C frame.
989
void MacroAssembler::pop_frame() {
990
ld(R1_SP, _abi0(callers_sp), R1_SP);
991
}
992
993
#if defined(ABI_ELFv2)
994
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
995
// TODO(asmundak): make sure the caller uses R12 as function descriptor
996
// most of the time.
997
if (R12 != r_function_entry) {
998
mr(R12, r_function_entry);
999
}
1000
mtctr(R12);
1001
// Do a call or a branch.
1002
if (and_link) {
1003
bctrl();
1004
} else {
1005
bctr();
1006
}
1007
_last_calls_return_pc = pc();
1008
1009
return _last_calls_return_pc;
1010
}
1011
1012
// Call a C function via a function descriptor and use full C
1013
// calling conventions. Updates and returns _last_calls_return_pc.
1014
address MacroAssembler::call_c(Register r_function_entry) {
1015
return branch_to(r_function_entry, /*and_link=*/true);
1016
}
1017
1018
// For tail calls: only branch, don't link, so callee returns to caller of this function.
1019
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1020
return branch_to(r_function_entry, /*and_link=*/false);
1021
}
1022
1023
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1024
load_const(R12, function_entry, R0);
1025
return branch_to(R12, /*and_link=*/true);
1026
}
1027
1028
#else
1029
// Generic version of a call to C function via a function descriptor
1030
// with variable support for C calling conventions (TOC, ENV, etc.).
1031
// Updates and returns _last_calls_return_pc.
1032
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1033
bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1034
// we emit standard ptrgl glue code here
1035
assert((function_descriptor != R0), "function_descriptor cannot be R0");
1036
1037
// retrieve necessary entries from the function descriptor
1038
ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1039
mtctr(R0);
1040
1041
if (load_toc_of_callee) {
1042
ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1043
}
1044
if (load_env_of_callee) {
1045
ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1046
} else if (load_toc_of_callee) {
1047
li(R11, 0);
1048
}
1049
1050
// do a call or a branch
1051
if (and_link) {
1052
bctrl();
1053
} else {
1054
bctr();
1055
}
1056
_last_calls_return_pc = pc();
1057
1058
return _last_calls_return_pc;
1059
}
1060
1061
// Call a C function via a function descriptor and use full C calling
1062
// conventions.
1063
// We don't use the TOC in generated code, so there is no need to save
1064
// and restore its value.
1065
address MacroAssembler::call_c(Register fd) {
1066
return branch_to(fd, /*and_link=*/true,
1067
/*save toc=*/false,
1068
/*restore toc=*/false,
1069
/*load toc=*/true,
1070
/*load env=*/true);
1071
}
1072
1073
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1074
return branch_to(fd, /*and_link=*/false,
1075
/*save toc=*/false,
1076
/*restore toc=*/false,
1077
/*load toc=*/true,
1078
/*load env=*/true);
1079
}
1080
1081
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1082
if (rt != relocInfo::none) {
1083
// this call needs to be relocatable
1084
if (!ReoptimizeCallSequences
1085
|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1086
|| fd == NULL // support code-size estimation
1087
|| !fd->is_friend_function()
1088
|| fd->entry() == NULL) {
1089
// it's not a friend function as defined by class FunctionDescriptor,
1090
// so do a full call-c here.
1091
load_const(R11, (address)fd, R0);
1092
1093
bool has_env = (fd != NULL && fd->env() != NULL);
1094
return branch_to(R11, /*and_link=*/true,
1095
/*save toc=*/false,
1096
/*restore toc=*/false,
1097
/*load toc=*/true,
1098
/*load env=*/has_env);
1099
} else {
1100
// It's a friend function. Load the entry point and don't care about
1101
// toc and env. Use an optimizable call instruction, but ensure the
1102
// same code-size as in the case of a non-friend function.
1103
nop();
1104
nop();
1105
nop();
1106
bl64_patchable(fd->entry(), rt);
1107
_last_calls_return_pc = pc();
1108
return _last_calls_return_pc;
1109
}
1110
} else {
1111
// This call does not need to be relocatable, do more aggressive
1112
// optimizations.
1113
if (!ReoptimizeCallSequences
1114
|| !fd->is_friend_function()) {
1115
// It's not a friend function as defined by class FunctionDescriptor,
1116
// so do a full call-c here.
1117
load_const(R11, (address)fd, R0);
1118
return branch_to(R11, /*and_link=*/true,
1119
/*save toc=*/false,
1120
/*restore toc=*/false,
1121
/*load toc=*/true,
1122
/*load env=*/true);
1123
} else {
1124
// it's a friend function, load the entry point and don't care about
1125
// toc and env.
1126
address dest = fd->entry();
1127
if (is_within_range_of_b(dest, pc())) {
1128
bl(dest);
1129
} else {
1130
bl64_patchable(dest, rt);
1131
}
1132
_last_calls_return_pc = pc();
1133
return _last_calls_return_pc;
1134
}
1135
}
1136
}
1137
1138
// Call a C function. All constants needed reside in TOC.
1139
//
1140
// Read the address to call from the TOC.
1141
// Read env from TOC, if fd specifies an env.
1142
// Read new TOC from TOC.
1143
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1144
relocInfo::relocType rt, Register toc) {
1145
if (!ReoptimizeCallSequences
1146
|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1147
|| !fd->is_friend_function()) {
1148
// It's not a friend function as defined by class FunctionDescriptor,
1149
// so do a full call-c here.
1150
assert(fd->entry() != NULL, "function must be linked");
1151
1152
AddressLiteral fd_entry(fd->entry());
1153
bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1154
mtctr(R11);
1155
if (fd->env() == NULL) {
1156
li(R11, 0);
1157
nop();
1158
} else {
1159
AddressLiteral fd_env(fd->env());
1160
success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1161
}
1162
AddressLiteral fd_toc(fd->toc());
1163
// Set R2_TOC (load from toc)
1164
success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1165
bctrl();
1166
_last_calls_return_pc = pc();
1167
if (!success) { return NULL; }
1168
} else {
1169
// It's a friend function, load the entry point and don't care about
1170
// toc and env. Use an optimizable call instruction, but ensure the
1171
// same code-size as in the case of a non-friend function.
1172
nop();
1173
bl64_patchable(fd->entry(), rt);
1174
_last_calls_return_pc = pc();
1175
}
1176
return _last_calls_return_pc;
1177
}
1178
#endif // ABI_ELFv2
1179
1180
void MacroAssembler::call_VM_base(Register oop_result,
1181
Register last_java_sp,
1182
address entry_point,
1183
bool check_exceptions) {
1184
BLOCK_COMMENT("call_VM {");
1185
// Determine last_java_sp register.
1186
if (!last_java_sp->is_valid()) {
1187
last_java_sp = R1_SP;
1188
}
1189
set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1190
1191
// ARG1 must hold thread address.
1192
mr(R3_ARG1, R16_thread);
1193
#if defined(ABI_ELFv2)
1194
address return_pc = call_c(entry_point, relocInfo::none);
1195
#else
1196
address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1197
#endif
1198
1199
reset_last_Java_frame();
1200
1201
// Check for pending exceptions.
1202
if (check_exceptions) {
1203
// We don't check for exceptions here.
1204
ShouldNotReachHere();
1205
}
1206
1207
// Get oop result if there is one and reset the value in the thread.
1208
if (oop_result->is_valid()) {
1209
get_vm_result(oop_result);
1210
}
1211
1212
_last_calls_return_pc = return_pc;
1213
BLOCK_COMMENT("} call_VM");
1214
}
1215
1216
void MacroAssembler::call_VM_leaf_base(address entry_point) {
1217
BLOCK_COMMENT("call_VM_leaf {");
1218
#if defined(ABI_ELFv2)
1219
call_c(entry_point, relocInfo::none);
1220
#else
1221
call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1222
#endif
1223
BLOCK_COMMENT("} call_VM_leaf");
1224
}
1225
1226
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1227
call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1228
}
1229
1230
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1231
bool check_exceptions) {
1232
// R3_ARG1 is reserved for the thread.
1233
mr_if_needed(R4_ARG2, arg_1);
1234
call_VM(oop_result, entry_point, check_exceptions);
1235
}
1236
1237
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1238
bool check_exceptions) {
1239
// R3_ARG1 is reserved for the thread
1240
mr_if_needed(R4_ARG2, arg_1);
1241
assert(arg_2 != R4_ARG2, "smashed argument");
1242
mr_if_needed(R5_ARG3, arg_2);
1243
call_VM(oop_result, entry_point, check_exceptions);
1244
}
1245
1246
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1247
bool check_exceptions) {
1248
// R3_ARG1 is reserved for the thread
1249
mr_if_needed(R4_ARG2, arg_1);
1250
assert(arg_2 != R4_ARG2, "smashed argument");
1251
mr_if_needed(R5_ARG3, arg_2);
1252
mr_if_needed(R6_ARG4, arg_3);
1253
call_VM(oop_result, entry_point, check_exceptions);
1254
}
1255
1256
void MacroAssembler::call_VM_leaf(address entry_point) {
1257
call_VM_leaf_base(entry_point);
1258
}
1259
1260
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1261
mr_if_needed(R3_ARG1, arg_1);
1262
call_VM_leaf(entry_point);
1263
}
1264
1265
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1266
mr_if_needed(R3_ARG1, arg_1);
1267
assert(arg_2 != R3_ARG1, "smashed argument");
1268
mr_if_needed(R4_ARG2, arg_2);
1269
call_VM_leaf(entry_point);
1270
}
1271
1272
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1273
mr_if_needed(R3_ARG1, arg_1);
1274
assert(arg_2 != R3_ARG1, "smashed argument");
1275
mr_if_needed(R4_ARG2, arg_2);
1276
assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1277
mr_if_needed(R5_ARG3, arg_3);
1278
call_VM_leaf(entry_point);
1279
}
1280
1281
// Check whether instruction is a read access to the polling page
1282
// which was emitted by load_from_polling_page(..).
1283
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1284
address* polling_address_ptr) {
1285
if (!is_ld(instruction))
1286
return false; // It's not a ld. Fail.
1287
1288
int rt = inv_rt_field(instruction);
1289
int ra = inv_ra_field(instruction);
1290
int ds = inv_ds_field(instruction);
1291
if (!(ds == 0 && ra != 0 && rt == 0)) {
1292
return false; // It's not a ld(r0, X, ra). Fail.
1293
}
1294
1295
if (!ucontext) {
1296
// Set polling address.
1297
if (polling_address_ptr != NULL) {
1298
*polling_address_ptr = NULL;
1299
}
1300
return true; // No ucontext given. Can't check value of ra. Assume true.
1301
}
1302
1303
#ifdef LINUX
1304
// Ucontext given. Check that register ra contains the address of
1305
// the safepoint polling page.
1306
ucontext_t* uc = (ucontext_t*) ucontext;
1307
// Set polling address.
1308
address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1309
if (polling_address_ptr != NULL) {
1310
*polling_address_ptr = addr;
1311
}
1312
return SafepointMechanism::is_poll_address(addr);
1313
#else
1314
// Not on Linux, ucontext must be NULL.
1315
ShouldNotReachHere();
1316
return false;
1317
#endif
1318
}
1319
1320
void MacroAssembler::bang_stack_with_offset(int offset) {
1321
// When increasing the stack, the old stack pointer will be written
1322
// to the new top of stack according to the PPC64 ABI.
1323
// Therefore, stack banging is not necessary when increasing
1324
// the stack by <= os::vm_page_size() bytes.
1325
// When increasing the stack by a larger amount, this method is
1326
// called repeatedly to bang the intermediate pages.
1327
1328
// Stack grows down, caller passes positive offset.
1329
assert(offset > 0, "must bang with positive offset");
1330
1331
long stdoffset = -offset;
1332
1333
if (is_simm(stdoffset, 16)) {
1334
// Signed 16 bit offset, a simple std is ok.
1335
if (UseLoadInstructionsForStackBangingPPC64) {
1336
ld(R0, (int)(signed short)stdoffset, R1_SP);
1337
} else {
1338
std(R0,(int)(signed short)stdoffset, R1_SP);
1339
}
1340
} else if (is_simm(stdoffset, 31)) {
1341
const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1342
const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1343
1344
Register tmp = R11;
1345
addis(tmp, R1_SP, hi);
1346
if (UseLoadInstructionsForStackBangingPPC64) {
1347
ld(R0, lo, tmp);
1348
} else {
1349
std(R0, lo, tmp);
1350
}
1351
} else {
1352
ShouldNotReachHere();
1353
}
1354
}
1355
1356
// If instruction is a stack bang of the form
1357
// std R0, x(Ry), (see bang_stack_with_offset())
1358
// stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1359
// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1360
// return the banged address. Otherwise, return 0.
1361
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1362
#ifdef LINUX
1363
ucontext_t* uc = (ucontext_t*) ucontext;
1364
int rs = inv_rs_field(instruction);
1365
int ra = inv_ra_field(instruction);
1366
if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1367
|| (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1368
|| (is_stdu(instruction) && rs == 1)) {
1369
int ds = inv_ds_field(instruction);
1370
// return banged address
1371
return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1372
} else if (is_stdux(instruction) && rs == 1) {
1373
int rb = inv_rb_field(instruction);
1374
address sp = (address)uc->uc_mcontext.regs->gpr[1];
1375
long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1376
return ra != 1 || rb_val >= 0 ? NULL // not a stack bang
1377
: sp + rb_val; // banged address
1378
}
1379
return NULL; // not a stack bang
1380
#else
1381
// workaround not needed on !LINUX :-)
1382
ShouldNotCallThis();
1383
return NULL;
1384
#endif
1385
}
1386
1387
void MacroAssembler::reserved_stack_check(Register return_pc) {
1388
// Test if reserved zone needs to be enabled.
1389
Label no_reserved_zone_enabling;
1390
1391
ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1392
cmpld(CCR0, R1_SP, R0);
1393
blt_predict_taken(CCR0, no_reserved_zone_enabling);
1394
1395
// Enable reserved zone again, throw stack overflow exception.
1396
push_frame_reg_args(0, R0);
1397
call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1398
pop_frame();
1399
mtlr(return_pc);
1400
load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1401
mtctr(R0);
1402
bctr();
1403
1404
should_not_reach_here();
1405
1406
bind(no_reserved_zone_enabling);
1407
}
1408
1409
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1410
bool cmpxchgx_hint) {
1411
Label retry;
1412
bind(retry);
1413
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1414
stdcx_(exchange_value, addr_base);
1415
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1416
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1417
} else {
1418
bne( CCR0, retry); // StXcx_ sets CCR0.
1419
}
1420
}
1421
1422
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1423
Register tmp, bool cmpxchgx_hint) {
1424
Label retry;
1425
bind(retry);
1426
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427
add(tmp, dest_current_value, inc_value);
1428
stdcx_(tmp, addr_base);
1429
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1430
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1431
} else {
1432
bne( CCR0, retry); // StXcx_ sets CCR0.
1433
}
1434
}
1435
1436
// Word/sub-word atomic helper functions
1437
1438
// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1439
// Only signed types are supported with size < 4.
1440
// Atomic add always kills tmp1.
1441
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1442
Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1443
bool cmpxchgx_hint, bool is_add, int size) {
1444
// Sub-word instructions are available since Power 8.
1445
// For older processors, instruction_type != size holds, and we
1446
// emulate the sub-word instructions by constructing a 4-byte value
1447
// that leaves the other bytes unchanged.
1448
const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1449
1450
Label retry;
1451
Register shift_amount = noreg,
1452
val32 = dest_current_value,
1453
modval = is_add ? tmp1 : exchange_value;
1454
1455
if (instruction_type != size) {
1456
assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1457
modval = tmp1;
1458
shift_amount = tmp2;
1459
val32 = tmp3;
1460
// Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1461
#ifdef VM_LITTLE_ENDIAN
1462
rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1463
clrrdi(addr_base, addr_base, 2);
1464
#else
1465
xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1466
clrrdi(addr_base, addr_base, 2);
1467
rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1468
#endif
1469
}
1470
1471
// atomic emulation loop
1472
bind(retry);
1473
1474
switch (instruction_type) {
1475
case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1476
case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1477
case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1478
default: ShouldNotReachHere();
1479
}
1480
1481
if (instruction_type != size) {
1482
srw(dest_current_value, val32, shift_amount);
1483
}
1484
1485
if (is_add) { add(modval, dest_current_value, exchange_value); }
1486
1487
if (instruction_type != size) {
1488
// Transform exchange value such that the replacement can be done by one xor instruction.
1489
xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1490
clrldi(modval, modval, (size == 1) ? 56 : 48);
1491
slw(modval, modval, shift_amount);
1492
xorr(modval, val32, modval);
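// modval now equals val32 ^ (((old ^ new) & field_mask) << shift_amount), i.e.
// the 4-byte word with only the addressed byte/halfword replaced by the new value.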
1493
}
1494
1495
switch (instruction_type) {
1496
case 4: stwcx_(modval, addr_base); break;
1497
case 2: sthcx_(modval, addr_base); break;
1498
case 1: stbcx_(modval, addr_base); break;
1499
default: ShouldNotReachHere();
1500
}
1501
1502
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1503
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1504
} else {
1505
bne( CCR0, retry); // StXcx_ sets CCR0.
1506
}
1507
1508
// l?arx zero-extends, but Java wants byte/short values sign-extended.
1509
if (size == 1) {
1510
extsb(dest_current_value, dest_current_value);
1511
} else if (size == 2) {
1512
extsh(dest_current_value, dest_current_value);
1513
};
1514
}
1515
1516
// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1517
// Only signed types are supported with size < 4.
1518
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1519
Register compare_value, Register exchange_value,
1520
Register addr_base, Register tmp1, Register tmp2,
1521
Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1522
// Sub-word instructions are available since Power 8.
1523
// For older processors, instruction_type != size holds, and we
1524
// emulate the sub-word instructions by constructing a 4-byte value
1525
// that leaves the other bytes unchanged.
1526
const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1527
1528
Register shift_amount = noreg,
1529
val32 = dest_current_value,
1530
modval = exchange_value;
1531
1532
if (instruction_type != size) {
1533
assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1534
shift_amount = tmp1;
1535
val32 = tmp2;
1536
modval = tmp2;
1537
// Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1538
#ifdef VM_LITTLE_ENDIAN
1539
rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1540
clrrdi(addr_base, addr_base, 2);
1541
#else
1542
xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1543
clrrdi(addr_base, addr_base, 2);
1544
rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1545
#endif
1546
// Transform exchange value such that the replacement can be done by one xor instruction.
1547
xorr(exchange_value, compare_value, exchange_value);
1548
clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1549
slw(exchange_value, exchange_value, shift_amount);
1550
}
1551
1552
// atomic emulation loop
1553
bind(retry);
1554
1555
switch (instruction_type) {
1556
case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1557
case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1558
case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1559
default: ShouldNotReachHere();
1560
}
1561
1562
if (instruction_type != size) {
1563
srw(dest_current_value, val32, shift_amount);
1564
}
1565
if (size == 1) {
1566
extsb(dest_current_value, dest_current_value);
1567
} else if (size == 2) {
1568
extsh(dest_current_value, dest_current_value);
1569
}
1570
1571
cmpw(flag, dest_current_value, compare_value);
1572
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1573
bne_predict_not_taken(flag, failed);
1574
} else {
1575
bne( flag, failed);
1576
}
1577
// branch to done => (flag == ne), (dest_current_value != compare_value)
1578
// fall through => (flag == eq), (dest_current_value == compare_value)
1579
1580
if (instruction_type != size) {
1581
xorr(modval, val32, exchange_value);
1582
}
1583
1584
switch (instruction_type) {
1585
case 4: stwcx_(modval, addr_base); break;
1586
case 2: sthcx_(modval, addr_base); break;
1587
case 1: stbcx_(modval, addr_base); break;
1588
default: ShouldNotReachHere();
1589
}
1590
}
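// Editor's sketch (illustrative, not part of HotSpot): a plain C++ model of the
// sub-word emulation used above when lbarx/stbcx_ are unavailable -- a 1-byte CAS
// performed as a 4-byte CAS on the aligned word that contains the byte. The function
// and parameter names are hypothetical, and the byte indexing assumes the
// VM_LITTLE_ENDIAN path; it is a sketch of the idea, not the emitted code.
#include <atomic>
#include <cstdint>

bool cas_byte_emulated(std::atomic<uint32_t>* word_base, unsigned byte_index,
                       uint8_t compare_value, uint8_t exchange_value) {
  const unsigned shift = byte_index * 8;            // (dest & 3) * 8
  const uint32_t mask  = 0xFFu << shift;
  uint32_t old_word = word_base->load(std::memory_order_relaxed);
  for (;;) {
    if (uint8_t((old_word >> shift) & 0xFF) != compare_value) {
      return false;                                 // observed byte differs: CAS fails
    }
    // Splice the new byte into the containing word; the other three bytes keep
    // whatever value old_word observed, so they are not disturbed on success.
    uint32_t new_word = (old_word & ~mask) | (uint32_t(exchange_value) << shift);
    if (word_base->compare_exchange_weak(old_word, new_word,
                                         std::memory_order_relaxed)) {
      return true;
    }
    // compare_exchange_weak refreshed old_word; retry, like the stwcx_/bne loop.
  }
}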
1591
1592
// CmpxchgX sets condition register to cmpX(current, compare).
1593
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1594
Register compare_value, Register exchange_value,
1595
Register addr_base, Register tmp1, Register tmp2,
1596
int semantics, bool cmpxchgx_hint,
1597
Register int_flag_success, bool contention_hint, bool weak, int size) {
1598
Label retry;
1599
Label failed;
1600
Label done;
1601
1602
// Save one branch if result is returned via register and
1603
// result register is different from the other ones.
1604
bool use_result_reg = (int_flag_success != noreg);
1605
bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1606
int_flag_success != exchange_value && int_flag_success != addr_base &&
1607
int_flag_success != tmp1 && int_flag_success != tmp2);
1608
assert(!weak || flag == CCR0, "weak only supported with CCR0");
1609
assert(size == 1 || size == 2 || size == 4, "unsupported");
1610
1611
if (use_result_reg && preset_result_reg) {
1612
li(int_flag_success, 0); // preset (assume cas failed)
1613
}
1614
1615
// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1616
if (contention_hint) { // Don't try to reserve if cmp fails.
1617
switch (size) {
1618
case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1619
case 2: lha(dest_current_value, 0, addr_base); break;
1620
case 4: lwz(dest_current_value, 0, addr_base); break;
1621
default: ShouldNotReachHere();
1622
}
1623
cmpw(flag, dest_current_value, compare_value);
1624
bne(flag, failed);
1625
}
1626
1627
// release/fence semantics
1628
if (semantics & MemBarRel) {
1629
release();
1630
}
1631
1632
cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1633
retry, failed, cmpxchgx_hint, size);
1634
if (!weak || use_result_reg) {
1635
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1636
bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1637
} else {
1638
bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1639
}
1640
}
1641
// fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1642
1643
// Result in register (must do this at the end because int_flag_success can be the
1644
// same register as one above).
1645
if (use_result_reg) {
1646
li(int_flag_success, 1);
1647
}
1648
1649
if (semantics & MemBarFenceAfter) {
1650
fence();
1651
} else if (semantics & MemBarAcq) {
1652
isync();
1653
}
1654
1655
if (use_result_reg && !preset_result_reg) {
1656
b(done);
1657
}
1658
1659
bind(failed);
1660
if (use_result_reg && !preset_result_reg) {
1661
li(int_flag_success, 0);
1662
}
1663
1664
bind(done);
1665
// (flag == ne) => (dest_current_value != compare_value), (!swapped)
1666
// (flag == eq) => (dest_current_value == compare_value), ( swapped)
1667
}
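// Editor's sketch (illustrative, not HotSpot code): the observable behavior of the
// sequence emitted by cmpxchg_generic, modeled for the 4-byte case with a C++
// atomic. The name cmpxchg_model and the use of memory_order_acq_rel are
// assumptions standing in for the MemBarRel/MemBarAcq/MemBarFenceAfter handling
// above; the weak variant and the sub-word emulation are not modeled here.
#include <atomic>
#include <cstdint>

// Returns 1 on success and 0 on failure (the role of int_flag_success) and leaves
// the value observed in memory in *dest_current_value.
int cmpxchg_model(std::atomic<int32_t>* addr, int32_t compare_value,
                  int32_t exchange_value, int32_t* dest_current_value,
                  bool contention_hint) {
  if (contention_hint) {                  // cheap pre-check: don't reserve if cmp fails
    int32_t cur = addr->load(std::memory_order_relaxed);
    if (cur != compare_value) { *dest_current_value = cur; return 0; }
  }
  int32_t expected = compare_value;
  bool swapped = addr->compare_exchange_strong(expected, exchange_value,
                                               std::memory_order_acq_rel);
  *dest_current_value = expected;         // on failure this is the conflicting value
  return swapped ? 1 : 0;
}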
1668
1669
// Performs an atomic compare-exchange:
1670
// if (compare_value == *addr_base)
1671
// *addr_base = exchange_value
1672
// int_flag_success = 1;
1673
// else
1674
// int_flag_success = 0;
1675
//
1676
// ConditionRegister flag = cmp(compare_value, *addr_base)
1677
// Register dest_current_value = *addr_base
1678
// Register compare_value Used to compare with value in memory
1679
// Register exchange_value Written to memory if compare_value == *addr_base
1680
// Register addr_base The memory location to compareXChange
1681
// Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1682
//
1683
// To avoid the costly compare-exchange, the value is tested beforehand.
1684
// Several special cases exist to avoid generating unnecessary information.
1685
//
1686
void MacroAssembler::cmpxchgd(ConditionRegister flag,
1687
Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1688
Register addr_base, int semantics, bool cmpxchgx_hint,
1689
Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1690
Label retry;
1691
Label failed_int;
1692
Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1693
Label done;
1694
1695
// Save one branch if result is returned via register and result register is different from the other ones.
1696
bool use_result_reg = (int_flag_success!=noreg);
1697
bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1698
int_flag_success!=exchange_value && int_flag_success!=addr_base);
1699
assert(!weak || flag == CCR0, "weak only supported with CCR0");
1700
assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1701
1702
if (use_result_reg && preset_result_reg) {
1703
li(int_flag_success, 0); // preset (assume cas failed)
1704
}
1705
1706
// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1707
if (contention_hint) { // Don't try to reserve if cmp fails.
1708
ld(dest_current_value, 0, addr_base);
1709
cmpd(flag, compare_value, dest_current_value);
1710
bne(flag, failed);
1711
}
1712
1713
// release/fence semantics
1714
if (semantics & MemBarRel) {
1715
release();
1716
}
1717
1718
// atomic emulation loop
1719
bind(retry);
1720
1721
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1722
cmpd(flag, compare_value, dest_current_value);
1723
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1724
bne_predict_not_taken(flag, failed);
1725
} else {
1726
bne( flag, failed);
1727
}
1728
1729
stdcx_(exchange_value, addr_base);
1730
if (!weak || use_result_reg || failed_ext) {
1731
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1732
bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1733
} else {
1734
bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1735
}
1736
}
1737
1738
// result in register (must do this at the end because int_flag_success can be the same register as one above)
1739
if (use_result_reg) {
1740
li(int_flag_success, 1);
1741
}
1742
1743
if (semantics & MemBarFenceAfter) {
1744
fence();
1745
} else if (semantics & MemBarAcq) {
1746
isync();
1747
}
1748
1749
if (use_result_reg && !preset_result_reg) {
1750
b(done);
1751
}
1752
1753
bind(failed_int);
1754
if (use_result_reg && !preset_result_reg) {
1755
li(int_flag_success, 0);
1756
}
1757
1758
bind(done);
1759
// (flag == ne) => (dest_current_value != compare_value), (!swapped)
1760
// (flag == eq) => (dest_current_value == compare_value), ( swapped)
1761
}
1762
1763
// Look up the method for a megamorphic invokeinterface call.
1764
// The target method is determined by <intf_klass, itable_index>.
1765
// The receiver klass is in recv_klass.
1766
// On success, the result will be in method_result, and execution falls through.
1767
// On failure, execution transfers to the given label.
1768
void MacroAssembler::lookup_interface_method(Register recv_klass,
1769
Register intf_klass,
1770
RegisterOrConstant itable_index,
1771
Register method_result,
1772
Register scan_temp,
1773
Register temp2,
1774
Label& L_no_such_interface,
1775
bool return_method) {
1776
assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1777
1778
// Compute start of first itableOffsetEntry (which is at the end of the vtable).
1779
int vtable_base = in_bytes(Klass::vtable_start_offset());
1780
int itentry_off = itableMethodEntry::method_offset_in_bytes();
1781
int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1782
int scan_step = itableOffsetEntry::size() * wordSize;
1783
int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1784
1785
lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1786
// %%% We should store the aligned, prescaled offset in the klassoop.
1787
// Then the next several instructions would fold away.
1788
1789
sldi(scan_temp, scan_temp, log_vte_size);
1790
addi(scan_temp, scan_temp, vtable_base);
1791
add(scan_temp, recv_klass, scan_temp);
1792
1793
// Adjust recv_klass by scaled itable_index, so we can free itable_index.
1794
if (return_method) {
1795
if (itable_index.is_register()) {
1796
Register itable_offset = itable_index.as_register();
1797
sldi(method_result, itable_offset, logMEsize);
1798
if (itentry_off) { addi(method_result, method_result, itentry_off); }
1799
add(method_result, method_result, recv_klass);
1800
} else {
1801
long itable_offset = (long)itable_index.as_constant();
1802
// static address, no relocation
1803
add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1804
}
1805
}
1806
1807
// for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1808
// if (scan->interface() == intf) {
1809
// result = (klass + scan->offset() + itable_index);
1810
// }
1811
// }
1812
Label search, found_method;
1813
1814
for (int peel = 1; peel >= 0; peel--) {
1815
// %%%% Could load both offset and interface in one ldx, if they were
1816
// in the opposite order. This would save a load.
1817
ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1818
1819
// Check that this entry is non-null. A null entry means that
1820
// the receiver class doesn't implement the interface, and wasn't the
1821
// same as when the caller was compiled.
1822
cmpd(CCR0, temp2, intf_klass);
1823
1824
if (peel) {
1825
beq(CCR0, found_method);
1826
} else {
1827
bne(CCR0, search);
1828
// (invert the test to fall through to found_method...)
1829
}
1830
1831
if (!peel) break;
1832
1833
bind(search);
1834
1835
cmpdi(CCR0, temp2, 0);
1836
beq(CCR0, L_no_such_interface);
1837
addi(scan_temp, scan_temp, scan_step);
1838
}
1839
1840
bind(found_method);
1841
1842
// Got a hit.
1843
if (return_method) {
1844
int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1845
lwz(scan_temp, ito_offset, scan_temp);
1846
ldx(method_result, scan_temp, method_result);
1847
}
1848
}
1849
1850
// virtual method calling
1851
void MacroAssembler::lookup_virtual_method(Register recv_klass,
1852
RegisterOrConstant vtable_index,
1853
Register method_result) {
1854
1855
assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1856
1857
const int base = in_bytes(Klass::vtable_start_offset());
1858
assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1859
1860
if (vtable_index.is_register()) {
1861
sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1862
add(recv_klass, vtable_index.as_register(), recv_klass);
1863
} else {
1864
addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1865
}
1866
ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1867
}
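// Editor's sketch (illustrative): the address arithmetic lookup_virtual_method
// performs, written as plain C++. The function name and the explicit offset
// parameters are assumptions; the real code takes the offsets from the Klass and
// vtableEntry layouts and assumes one word per vtable entry, as asserted above.
#include <cstdint>

inline void* lookup_virtual_method_model(const uint8_t* recv_klass,
                                         intptr_t vtable_index,
                                         intptr_t vtable_start_offset,
                                         intptr_t method_offset_in_entry) {
  // recv_klass + vtable_start + index * wordSize + method_offset
  const uint8_t* entry = recv_klass + vtable_start_offset
                       + vtable_index * intptr_t(sizeof(void*))
                       + method_offset_in_entry;
  return *reinterpret_cast<void* const*>(entry);
}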
1868
1869
/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1870
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1871
Register super_klass,
1872
Register temp1_reg,
1873
Register temp2_reg,
1874
Label* L_success,
1875
Label* L_failure,
1876
Label* L_slow_path,
1877
RegisterOrConstant super_check_offset) {
1878
1879
const Register check_cache_offset = temp1_reg;
1880
const Register cached_super = temp2_reg;
1881
1882
assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1883
1884
int sco_offset = in_bytes(Klass::super_check_offset_offset());
1885
int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1886
1887
bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1888
bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1889
1890
Label L_fallthrough;
1891
int label_nulls = 0;
1892
if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
1893
if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
1894
if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1895
assert(label_nulls <= 1 ||
1896
(L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1897
"at most one NULL in the batch, usually");
1898
1899
// If the pointers are equal, we are done (e.g., String[] elements).
1900
// This self-check enables sharing of secondary supertype arrays among
1901
// non-primary types such as array-of-interface. Otherwise, each such
1902
// type would need its own customized SSA.
1903
// We move this check to the front of the fast path because many
1904
// type checks are in fact trivially successful in this manner,
1905
// so we get a nicely predicted branch right at the start of the check.
1906
cmpd(CCR0, sub_klass, super_klass);
1907
beq(CCR0, *L_success);
1908
1909
// Check the supertype display:
1910
if (must_load_sco) {
1911
// The super check offset is always positive...
1912
lwz(check_cache_offset, sco_offset, super_klass);
1913
super_check_offset = RegisterOrConstant(check_cache_offset);
1914
// super_check_offset is a register.
1915
assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1916
}
1917
// The loaded value is the offset from KlassOopDesc.
1918
1919
ld(cached_super, super_check_offset, sub_klass);
1920
cmpd(CCR0, cached_super, super_klass);
1921
1922
// This check has worked decisively for primary supers.
1923
// Secondary supers are sought in the super_cache ('super_cache_addr').
1924
// (Secondary supers are interfaces and very deeply nested subtypes.)
1925
// This works in the same check above because of a tricky aliasing
1926
// between the super_cache and the primary super display elements.
1927
// (The 'super_check_addr' can address either, as the case requires.)
1928
// Note that the cache is updated below if it does not help us find
1929
// what we need immediately.
1930
// So if it was a primary super, we can just fail immediately.
1931
// Otherwise, it's the slow path for us (no success at this point).
1932
1933
#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1934
1935
if (super_check_offset.is_register()) {
1936
beq(CCR0, *L_success);
1937
cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1938
if (L_failure == &L_fallthrough) {
1939
beq(CCR0, *L_slow_path);
1940
} else {
1941
bne(CCR0, *L_failure);
1942
FINAL_JUMP(*L_slow_path);
1943
}
1944
} else {
1945
if (super_check_offset.as_constant() == sc_offset) {
1946
// Need a slow path; fast failure is impossible.
1947
if (L_slow_path == &L_fallthrough) {
1948
beq(CCR0, *L_success);
1949
} else {
1950
bne(CCR0, *L_slow_path);
1951
FINAL_JUMP(*L_success);
1952
}
1953
} else {
1954
// No slow path; it's a fast decision.
1955
if (L_failure == &L_fallthrough) {
1956
beq(CCR0, *L_success);
1957
} else {
1958
bne(CCR0, *L_failure);
1959
FINAL_JUMP(*L_success);
1960
}
1961
}
1962
}
1963
1964
bind(L_fallthrough);
1965
#undef FINAL_JUMP
1966
}
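// Editor's sketch (illustrative): the fast-path decision above, reduced to plain
// C++. The enum, the read_word helper and the parameter names are assumptions; the
// point is the three-way outcome: pointer equality or a matching supertype-display
// entry is an immediate hit, a miss at the secondary-super-cache offset is not
// decisive and goes to the slow path, and a miss anywhere else is a definite failure.
#include <cstdint>

enum class SubtypeResult { Success, Failure, SlowPath };

inline intptr_t read_word(const uint8_t* base, intptr_t offset) {
  return *reinterpret_cast<const intptr_t*>(base + offset);
}

SubtypeResult fast_path_model(const uint8_t* sub_klass, const uint8_t* super_klass,
                              int32_t super_check_offset, int32_t sc_offset) {
  if (sub_klass == super_klass) return SubtypeResult::Success;   // trivial self-check
  intptr_t cached = read_word(sub_klass, super_check_offset);
  if (cached == reinterpret_cast<intptr_t>(super_klass)) return SubtypeResult::Success;
  return (super_check_offset == sc_offset) ? SubtypeResult::SlowPath
                                           : SubtypeResult::Failure;
}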
1967
1968
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1969
Register super_klass,
1970
Register temp1_reg,
1971
Register temp2_reg,
1972
Label* L_success,
1973
Register result_reg) {
1974
const Register array_ptr = temp1_reg; // current value from cache array
1975
const Register temp = temp2_reg;
1976
1977
assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1978
1979
int source_offset = in_bytes(Klass::secondary_supers_offset());
1980
int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1981
1982
int length_offset = Array<Klass*>::length_offset_in_bytes();
1983
int base_offset = Array<Klass*>::base_offset_in_bytes();
1984
1985
Label hit, loop, failure, fallthru;
1986
1987
ld(array_ptr, source_offset, sub_klass);
1988
1989
// TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1990
lwz(temp, length_offset, array_ptr);
1991
cmpwi(CCR0, temp, 0);
1992
beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1993
1994
mtctr(temp); // load ctr
1995
1996
bind(loop);
1997
// Oops in the table are no longer compressed.
1998
ld(temp, base_offset, array_ptr);
1999
cmpd(CCR0, temp, super_klass);
2000
beq(CCR0, hit);
2001
addi(array_ptr, array_ptr, BytesPerWord);
2002
bdnz(loop);
2003
2004
bind(failure);
2005
if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2006
b(fallthru);
2007
2008
bind(hit);
2009
std(super_klass, target_offset, sub_klass); // save result to cache
2010
if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2011
if (L_success != NULL) { b(*L_success); }
2012
else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2013
2014
bind(fallthru);
2015
}
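// Editor's sketch (illustrative): the slow path above is a linear scan of the
// secondary-supers array with a cache update on a hit. The signature is an
// assumption (the elements and the cache slot are passed in directly instead of
// being read from the Klass layout); the 0-on-hit / non-zero-on-miss return value
// mirrors the result_reg convention used by the emitted code.
#include <cstdint>

int slow_path_model(const void* const* secondary_supers, int32_t length,
                    const void* super_klass, const void** secondary_super_cache) {
  for (int32_t i = 0; i < length; i++) {
    if (secondary_supers[i] == super_klass) {
      *secondary_super_cache = super_klass;   // std(super_klass, target_offset, sub_klass)
      return 0;                               // hit
    }
  }
  return 1;                                   // miss
}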
2016
2017
// Try fast path, then go to slow one if not successful
2018
void MacroAssembler::check_klass_subtype(Register sub_klass,
2019
Register super_klass,
2020
Register temp1_reg,
2021
Register temp2_reg,
2022
Label& L_success) {
2023
Label L_failure;
2024
check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2025
check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2026
bind(L_failure); // Fallthru if not successful.
2027
}
2028
2029
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2030
assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2031
2032
Label L_fallthrough;
2033
if (L_fast_path == NULL) {
2034
L_fast_path = &L_fallthrough;
2035
} else if (L_slow_path == NULL) {
2036
L_slow_path = &L_fallthrough;
2037
}
2038
2039
// Fast path check: class is fully initialized
2040
lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2041
cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2042
beq(CCR0, *L_fast_path);
2043
2044
// Fast path check: current thread is initializer thread
2045
ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2046
cmpd(CCR0, thread, R0);
2047
if (L_slow_path == &L_fallthrough) {
2048
beq(CCR0, *L_fast_path);
2049
} else if (L_fast_path == &L_fallthrough) {
2050
bne(CCR0, *L_slow_path);
2051
} else {
2052
Unimplemented();
2053
}
2054
2055
bind(L_fallthrough);
2056
}
2057
2058
RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2059
Register temp_reg,
2060
int extra_slot_offset) {
2061
// cf. TemplateTable::prepare_invoke(), if (load_receiver).
2062
int stackElementSize = Interpreter::stackElementSize;
2063
int offset = extra_slot_offset * stackElementSize;
2064
if (arg_slot.is_constant()) {
2065
offset += arg_slot.as_constant() * stackElementSize;
2066
return offset;
2067
} else {
2068
assert(temp_reg != noreg, "must specify");
2069
sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2070
if (offset != 0)
2071
addi(temp_reg, temp_reg, offset);
2072
return temp_reg;
2073
}
2074
}
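// Editor's sketch (illustrative): the value argument_offset computes, whether the
// slot is a constant or a register. The function name is an assumption; the real
// code folds the constant case at assembly time and emits sldi/addi for the
// register case.
#include <cstdint>

inline intptr_t argument_offset_model(intptr_t arg_slot, int extra_slot_offset,
                                      int stack_element_size) {
  return (arg_slot + extra_slot_offset) * stack_element_size;
}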
2075
2076
// Supports temp2_reg = R0.
2077
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2078
Register mark_reg, Register temp_reg,
2079
Register temp2_reg, Label& done, Label* slow_case) {
2080
assert(UseBiasedLocking, "why call this otherwise?");
2081
2082
#ifdef ASSERT
2083
assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2084
#endif
2085
2086
Label cas_label;
2087
2088
// Branch to done if fast path fails and no slow_case provided.
2089
Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2090
2091
// Biased locking
2092
// See whether the lock is currently biased toward our thread and
2093
// whether the epoch is still valid
2094
// Note that the runtime guarantees sufficient alignment of JavaThread
2095
// pointers to allow age to be placed into low bits
2096
assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2097
"biased locking makes assumptions about bit layout");
2098
2099
if (PrintBiasedLockingStatistics) {
2100
load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2101
lwzx(temp_reg, temp2_reg);
2102
addi(temp_reg, temp_reg, 1);
2103
stwx(temp_reg, temp2_reg);
2104
}
2105
2106
andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2107
cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2108
bne(cr_reg, cas_label);
2109
2110
load_klass(temp_reg, obj_reg);
2111
2112
load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2113
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2114
orr(temp_reg, R16_thread, temp_reg);
2115
xorr(temp_reg, mark_reg, temp_reg);
2116
andr(temp_reg, temp_reg, temp2_reg);
2117
cmpdi(cr_reg, temp_reg, 0);
2118
if (PrintBiasedLockingStatistics) {
2119
Label l;
2120
bne(cr_reg, l);
2121
load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2122
lwzx(mark_reg, temp2_reg);
2123
addi(mark_reg, mark_reg, 1);
2124
stwx(mark_reg, temp2_reg);
2125
// restore mark_reg
2126
ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2127
bind(l);
2128
}
2129
beq(cr_reg, done);
2130
2131
Label try_revoke_bias;
2132
Label try_rebias;
2133
2134
// At this point we know that the header has the bias pattern and
2135
// that we are not the bias owner in the current epoch. We need to
2136
// figure out more details about the state of the header in order to
2137
// know what operations can be legally performed on the object's
2138
// header.
2139
2140
// If the low three bits in the xor result aren't clear, that means
2141
// the prototype header is no longer biased and we have to revoke
2142
// the bias on this object.
2143
andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2144
cmpwi(cr_reg, temp2_reg, 0);
2145
bne(cr_reg, try_revoke_bias);
2146
2147
// Biasing is still enabled for this data type. See whether the
2148
// epoch of the current bias is still valid, meaning that the epoch
2149
// bits of the mark word are equal to the epoch bits of the
2150
// prototype header. (Note that the prototype header's epoch bits
2151
// only change at a safepoint.) If not, attempt to rebias the object
2152
// toward the current thread. Note that we must be absolutely sure
2153
// that the current epoch is invalid in order to do this because
2154
// otherwise the manipulations it performs on the mark word are
2155
// illegal.
2156
2157
int shift_amount = 64 - markWord::epoch_shift;
2158
// rotate epoch bits to right (little) end and set other bits to 0
2159
// [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2160
rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2161
// branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2162
bne(CCR0, try_rebias);
2163
2164
// The epoch of the current bias is still valid but we know nothing
2165
// about the owner; it might be set or it might be clear. Try to
2166
// acquire the bias of the object using an atomic operation. If this
2167
// fails we will go in to the runtime to revoke the object's bias.
2168
// Note that we first construct the presumed unbiased header so we
2169
// don't accidentally blow away another thread's valid bias.
2170
andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2171
markWord::age_mask_in_place |
2172
markWord::epoch_mask_in_place));
2173
orr(temp_reg, R16_thread, mark_reg);
2174
2175
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2176
2177
// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2178
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2179
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2180
/*where=*/obj_reg,
2181
MacroAssembler::MemBarAcq,
2182
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2183
noreg, slow_case_int); // bail out if failed
2184
2185
// If the biasing toward our thread failed, this means that
2186
// another thread succeeded in biasing it toward itself and we
2187
// need to revoke that bias. The revocation will occur in the
2188
// interpreter runtime in the slow case.
2189
if (PrintBiasedLockingStatistics) {
2190
load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2191
lwzx(temp_reg, temp2_reg);
2192
addi(temp_reg, temp_reg, 1);
2193
stwx(temp_reg, temp2_reg);
2194
}
2195
b(done);
2196
2197
bind(try_rebias);
2198
// At this point we know the epoch has expired, meaning that the
2199
// current "bias owner", if any, is actually invalid. Under these
2200
// circumstances _only_, we are allowed to use the current header's
2201
// value as the comparison value when doing the cas to acquire the
2202
// bias in the current epoch. In other words, we allow transfer of
2203
// the bias from one thread to another directly in this situation.
2204
load_klass(temp_reg, obj_reg);
2205
andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2206
orr(temp2_reg, R16_thread, temp2_reg);
2207
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2208
orr(temp_reg, temp2_reg, temp_reg);
2209
2210
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2211
2212
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2213
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2214
/*where=*/obj_reg,
2215
MacroAssembler::MemBarAcq,
2216
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2217
noreg, slow_case_int); // bail out if failed
2218
2219
// If the biasing toward our thread failed, this means that
2220
// another thread succeeded in biasing it toward itself and we
2221
// need to revoke that bias. The revocation will occur in the
2222
// interpreter runtime in the slow case.
2223
if (PrintBiasedLockingStatistics) {
2224
load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2225
lwzx(temp_reg, temp2_reg);
2226
addi(temp_reg, temp_reg, 1);
2227
stwx(temp_reg, temp2_reg);
2228
}
2229
b(done);
2230
2231
bind(try_revoke_bias);
2232
// The prototype mark in the klass doesn't have the bias bit set any
2233
// more, indicating that objects of this data type are not supposed
2234
// to be biased any more. We are going to try to reset the mark of
2235
// this object to the prototype value and fall through to the
2236
// CAS-based locking scheme. Note that if our CAS fails, it means
2237
// that another thread raced us for the privilege of revoking the
2238
// bias of this particular object, so it's okay to continue in the
2239
// normal locking code.
2240
load_klass(temp_reg, obj_reg);
2241
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2242
andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2243
orr(temp_reg, temp_reg, temp2_reg);
2244
2245
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2246
2247
// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2248
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2249
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2250
/*where=*/obj_reg,
2251
MacroAssembler::MemBarAcq,
2252
MacroAssembler::cmpxchgx_hint_acquire_lock());
2253
2254
// reload markWord in mark_reg before continuing with lightweight locking
2255
ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2256
2257
// Fall through to the normal CAS-based lock, because no matter what
2258
// the result of the above CAS, some thread must have succeeded in
2259
// removing the bias bit from the object's header.
2260
if (PrintBiasedLockingStatistics) {
2261
Label l;
2262
bne(cr_reg, l);
2263
load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2264
lwzx(temp_reg, temp2_reg);
2265
addi(temp_reg, temp_reg, 1);
2266
stwx(temp_reg, temp2_reg);
2267
bind(l);
2268
}
2269
2270
bind(cas_label);
2271
}
2272
2273
void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2274
// Check for biased locking unlock case, which is a no-op
2275
// Note: we do not have to check the thread ID for two reasons.
2276
// First, the interpreter checks for IllegalMonitorStateException at
2277
// a higher level. Second, if the bias was revoked while we held the
2278
// lock, the object could not be rebiased toward another thread, so
2279
// the bias bit would be clear.
2280
2281
ld(temp_reg, 0, mark_addr);
2282
andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2283
2284
cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2285
beq(cr_reg, done);
2286
}
2287
2288
// allocation (for C1)
2289
void MacroAssembler::eden_allocate(
2290
Register obj, // result: pointer to object after successful allocation
2291
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2292
int con_size_in_bytes, // object size in bytes if known at compile time
2293
Register t1, // temp register
2294
Register t2, // temp register
2295
Label& slow_case // continuation point if fast allocation fails
2296
) {
2297
b(slow_case);
2298
}
2299
2300
void MacroAssembler::tlab_allocate(
2301
Register obj, // result: pointer to object after successful allocation
2302
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2303
int con_size_in_bytes, // object size in bytes if known at compile time
2304
Register t1, // temp register
2305
Label& slow_case // continuation point if fast allocation fails
2306
) {
2307
// make sure arguments make sense
2308
assert_different_registers(obj, var_size_in_bytes, t1);
2309
assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2310
assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2311
2312
const Register new_top = t1;
2313
//verify_tlab(); not implemented
2314
2315
ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2316
ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2317
if (var_size_in_bytes == noreg) {
2318
addi(new_top, obj, con_size_in_bytes);
2319
} else {
2320
add(new_top, obj, var_size_in_bytes);
2321
}
2322
cmpld(CCR0, new_top, R0);
2323
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2324
2325
#ifdef ASSERT
2326
// make sure new free pointer is properly aligned
2327
{
2328
Label L;
2329
andi_(R0, new_top, MinObjAlignmentInBytesMask);
2330
beq(CCR0, L);
2331
stop("updated TLAB free is not properly aligned");
2332
bind(L);
2333
}
2334
#endif // ASSERT
2335
2336
// update the tlab top pointer
2337
std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2338
//verify_tlab(); not implemented
2339
}
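// Editor's sketch (illustrative): the bump-pointer fast path tlab_allocate emits,
// as plain C++. tlab_top/tlab_end model the two thread-local fields loaded from
// R16_thread; the names and the nullptr-for-slow-path convention are assumptions.
#include <cstddef>
#include <cstdint>

inline void* tlab_allocate_model(uint8_t** tlab_top, uint8_t* tlab_end,
                                 size_t size_in_bytes) {
  uint8_t* obj     = *tlab_top;
  uint8_t* new_top = obj + size_in_bytes;
  if (new_top > tlab_end) return nullptr;   // cmpld + branch to slow_case
  *tlab_top = new_top;                      // std(new_top, tlab_top_offset(), R16_thread)
  return obj;
}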
2340
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2341
unimplemented("incr_allocated_bytes");
2342
}
2343
2344
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2345
int insts_call_instruction_offset, Register Rtoc) {
2346
// Start the stub.
2347
address stub = start_a_stub(64);
2348
if (stub == NULL) { return NULL; } // CodeCache full: bail out
2349
2350
// Create a trampoline stub relocation which relates this trampoline stub
2351
// with the call instruction at insts_call_instruction_offset in the
2352
// instructions code-section.
2353
relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2354
const int stub_start_offset = offset();
2355
2356
// For java_to_interp stubs we use R11_scratch1 as scratch register
2357
// and in call trampoline stubs we use R12_scratch2. This way we
2358
// can distinguish them (see is_NativeCallTrampolineStub_at()).
2359
Register reg_scratch = R12_scratch2;
2360
2361
// Now, create the trampoline stub's code:
2362
// - load the TOC
2363
// - load the call target from the constant pool
2364
// - call
2365
if (Rtoc == noreg) {
2366
calculate_address_from_global_toc(reg_scratch, method_toc());
2367
Rtoc = reg_scratch;
2368
}
2369
2370
ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2371
mtctr(reg_scratch);
2372
bctr();
2373
2374
const address stub_start_addr = addr_at(stub_start_offset);
2375
2376
// Assert that the encoded destination_toc_offset can be identified and that it is correct.
2377
assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2378
"encoded offset into the constant pool must match");
2379
// Trampoline_stub_size should be good.
2380
assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2381
assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2382
2383
// End the stub.
2384
end_a_stub();
2385
return stub;
2386
}
2387
2388
// TM on PPC64.
2389
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2390
Label retry;
2391
bind(retry);
2392
ldarx(result, addr, /*hint*/ false);
2393
addi(result, result, simm16);
2394
stdcx_(result, addr);
2395
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2396
bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2397
} else {
2398
bne( CCR0, retry); // stXcx_ sets CCR0
2399
}
2400
}
2401
2402
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2403
Label retry;
2404
bind(retry);
2405
lwarx(result, addr, /*hint*/ false);
2406
ori(result, result, uimm16);
2407
stwcx_(result, addr);
2408
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2409
bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2410
} else {
2411
bne( CCR0, retry); // stXcx_ sets CCR0
2412
}
2413
}
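// Editor's sketch (illustrative): what the lwarx/ori/stwcx_ retry loop in
// atomic_ori_int achieves, expressed with a C++ CAS loop (standard C++ has no
// load-reserved/store-conditional primitive). The function name is an assumption;
// std::atomic::fetch_or would give the same effect in one call.
#include <atomic>
#include <cstdint>

inline uint32_t atomic_ori_model(std::atomic<uint32_t>* addr, uint32_t uimm16) {
  uint32_t old_val = addr->load(std::memory_order_relaxed);
  while (!addr->compare_exchange_weak(old_val, old_val | uimm16,
                                      std::memory_order_relaxed)) {
    // retry, like bne(CCR0, retry) after a failed stwcx_
  }
  return old_val | uimm16;   // 'result' holds the stored value after the loop
}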
2414
2415
#if INCLUDE_RTM_OPT
2416
2417
// Update rtm_counters based on abort status
2418
// input: abort_status
2419
// rtm_counters_Reg (RTMLockingCounters*)
2420
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2421
// Mapping to keep PreciseRTMLockingStatistics similar to x86.
2422
// x86 ppc (! means inverted, ? means not the same)
2423
// 0 31 Set if abort caused by XABORT instruction.
2424
// 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2425
// 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2426
// 3 10 Set if an internal buffer overflowed.
2427
// 4 ?12 Set if a debug breakpoint was hit.
2428
// 5 ?32 Set if an abort occurred during execution of a nested transaction.
2429
const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2430
tm_failure_persistent,
2431
tm_non_trans_cf,
2432
tm_trans_cf,
2433
tm_footprint_of,
2434
tm_failure_code,
2435
tm_transaction_level};
2436
2437
const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2438
const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2439
2440
const int bit2counter_map[][num_counters] =
2441
// 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2442
// Inverted logic means that if a bit is set don't count it, or vice-versa.
2443
// Care must be taken when mapping bits to counters as bits for a given
2444
// counter must be mutually exclusive. Otherwise, the counter will be
2445
// incremented more than once.
2446
// counters:
2447
// 0 1 2 3 4 5
2448
// abort , persist, conflict, overflow, debug , nested bits:
2449
{{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort
2450
{ 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent
2451
{ 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf
2452
{ 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf
2453
{ 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of
2454
{ 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4
2455
{ 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1
2456
// ...
2457
2458
// Move abort_status value to R0 and use abort_status register as a
2459
// temporary register because R0 as third operand in ld/std is treated
2460
// as base address zero (value). Likewise, R0 as second operand in addi
2461
// is problematic because it amounts to li.
2462
const Register temp_Reg = abort_status;
2463
const Register abort_status_R0 = R0;
2464
mr(abort_status_R0, abort_status);
2465
2466
// Increment total abort counter.
2467
int counters_offs = RTMLockingCounters::abort_count_offset();
2468
ld(temp_Reg, counters_offs, rtm_counters_Reg);
2469
addi(temp_Reg, temp_Reg, 1);
2470
std(temp_Reg, counters_offs, rtm_counters_Reg);
2471
2472
// Increment specific abort counters.
2473
if (PrintPreciseRTMLockingStatistics) {
2474
2475
// #0 counter offset.
2476
int abortX_offs = RTMLockingCounters::abortX_count_offset();
2477
2478
for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2479
for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2480
if (bit2counter_map[nbit][ncounter] != 0) {
2481
Label check_abort;
2482
int abort_counter_offs = abortX_offs + (ncounter << 3);
2483
2484
if (failure_bit[nbit] == tm_transaction_level) {
2485
// Don't check outer transaction, TL = 1 (bit 63). Hence only
2486
// 11 bits in the TL field are checked to find out if failure
2487
// occurred in a nested transaction. This check also matches
2488
// the case when nesting_of = 1 (nesting overflow).
2489
rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2490
} else if (failure_bit[nbit] == tm_failure_code) {
2491
// Check failure code for trap or illegal caught in TM.
2492
// Bits 0:7 are tested as bit 7 (persistent) is copied from
2493
// tabort or treclaim source operand.
2494
// On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2495
rldicl(temp_Reg, abort_status_R0, 8, 56);
2496
cmpdi(CCR0, temp_Reg, 0xD4);
2497
} else {
2498
rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2499
}
2500
2501
if (bit2counter_map[nbit][ncounter] == 1) {
2502
beq(CCR0, check_abort);
2503
} else {
2504
bne(CCR0, check_abort);
2505
}
2506
2507
// We don't increment atomically.
2508
ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2509
addi(temp_Reg, temp_Reg, 1);
2510
std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2511
2512
bind(check_abort);
2513
}
2514
}
2515
}
2516
}
2517
// Restore abort_status.
2518
mr(abort_status, abort_status_R0);
2519
}
2520
2521
// Branch if (random & (count-1) != 0), count is 2^n
2522
// tmp and CR0 are killed
2523
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2524
mftb(tmp);
2525
andi_(tmp, tmp, count-1);
2526
bne(CCR0, brLabel);
2527
}
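// Editor's sketch (illustrative): the sampling test above, with the time-base
// register modeled by any rapidly changing counter value. 'count' must be a power
// of two, so the mask keeps its low log2(count) bits; the branch is taken (the
// event is skipped) whenever those bits are non-zero, i.e. roughly (count-1)/count
// of the time.
#include <cstdint>

inline bool skip_this_sample(uint64_t time_base_value, int count) {
  return (time_base_value & uint64_t(count - 1)) != 0;
}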
2528
2529
// Perform abort ratio calculation, set no_rtm bit if high ratio.
2530
// input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2531
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2532
RTMLockingCounters* rtm_counters,
2533
Metadata* method_data) {
2534
Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2535
2536
if (RTMLockingCalculationDelay > 0) {
2537
// Delay calculation.
2538
ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2539
cmpdi(CCR0, rtm_counters_Reg, 0);
2540
beq(CCR0, L_done);
2541
load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2542
}
2543
// Abort ratio calculation only if abort_count > RTMAbortThreshold.
2544
// Aborted transactions = abort_count * 100
2545
// All transactions = total_count * RTMTotalCountIncrRate
2546
// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2547
ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2548
if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only.
2549
cmpdi(CCR0, R0, RTMAbortThreshold);
2550
blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary
2551
} else {
2552
load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2553
cmpd(CCR0, R0, rtm_counters_Reg);
2554
blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required
2555
}
2556
mulli(R0, R0, 100);
2557
2558
const Register tmpReg = rtm_counters_Reg;
2559
ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2560
mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2561
mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16
2562
cmpd(CCR0, R0, tmpReg);
2563
blt(CCR0, L_check_always_rtm1); // jump to reload
2564
if (method_data != NULL) {
2565
// Set rtm_state to "no rtm" in MDO.
2566
// Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2567
// (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2568
load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2569
atomic_ori_int(R0, tmpReg, NoRTM);
2570
}
2571
b(L_done);
2572
2573
bind(L_check_always_rtm1);
2574
load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2575
bind(L_check_always_rtm2);
2576
ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2577
int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2578
if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only.
2579
cmpdi(CCR0, tmpReg, thresholdValue);
2580
} else {
2581
load_const_optimized(R0, thresholdValue);
2582
cmpd(CCR0, tmpReg, R0);
2583
}
2584
blt(CCR0, L_done);
2585
if (method_data != NULL) {
2586
// Set rtm_state to "always rtm" in MDO.
2587
// Not using a metadata relocation. See above.
2588
load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2589
atomic_ori_int(R0, tmpReg, UseRTM);
2590
}
2591
bind(L_done);
2592
}
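// Editor's sketch (illustrative): the abort-ratio policy encoded above, as plain
// C++. The bit values of kNoRTM/kUseRTM and the function signature are assumptions;
// rtm_state stands for the MDO field that atomic_ori_int() updates, and the
// RTMLockingCalculationDelay gate is omitted.
#include <cstdint>

enum RtmStateBit { kNoRTM = 1, kUseRTM = 2 };   // illustrative bit values only

inline void abort_ratio_model(uint64_t abort_count, uint64_t total_count,
                              uint64_t RTMAbortThreshold, uint64_t RTMAbortRatio,
                              uint64_t RTMTotalCountIncrRate,
                              uint64_t RTMLockingThreshold, int* rtm_state) {
  if (abort_count >= RTMAbortThreshold) {
    uint64_t aborted_scaled = abort_count * 100;                  // abort_count * 100
    uint64_t all_scaled     = total_count * RTMTotalCountIncrRate;
    if (aborted_scaled >= all_scaled * RTMAbortRatio) {
      *rtm_state |= kNoRTM;                     // too many aborts: stop using RTM
      return;
    }
  }
  if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
    *rtm_state |= kUseRTM;                      // enough history: always use RTM
  }
}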
2593
2594
// Update counters and perform abort ratio calculation.
2595
// input: abort_status_Reg
2596
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2597
RTMLockingCounters* rtm_counters,
2598
Metadata* method_data,
2599
bool profile_rtm) {
2600
2601
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2602
// Update rtm counters based on state at abort.
2603
// Reads abort_status_Reg, updates flags.
2604
assert_different_registers(abort_status_Reg, temp_Reg);
2605
load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2606
rtm_counters_update(abort_status_Reg, temp_Reg);
2607
if (profile_rtm) {
2608
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2609
rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2610
}
2611
}
2612
2613
// Retry on abort if abort's status indicates non-persistent failure.
2614
// inputs: retry_count_Reg
2615
// : abort_status_Reg
2616
// output: retry_count_Reg decremented by 1
2617
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2618
Label& retryLabel, Label* checkRetry) {
2619
Label doneRetry;
2620
2621
// Don't retry if failure is persistent.
2622
// The persistent bit is set when (A) a disallowed operation is performed in
2623
// transactional state, like for instance trying to write the TFHAR after a
2624
// transaction is started; or when there is (B) a Nesting Overflow (too many
2625
// nested transactions); or when (C) the Footprint overflows (too many
2626
// addresses touched in TM state so there is no more space in the footprint
2627
// area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2628
// store is performed to a given address in TM state, then once in suspended
2629
// state the same address is accessed. Failure (A) is very unlikely to occur
2630
// in the JVM. Failure (D) will never occur because Suspended state is never
2631
// used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2632
// Overflow will set the persistent bit.
2633
rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2634
bne(CCR0, doneRetry);
2635
2636
// Don't retry if transaction was deliberately aborted, i.e. caused by a
2637
// tabort instruction.
2638
rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2639
bne(CCR0, doneRetry);
2640
2641
// Retry if transaction aborted due to a conflict with another thread.
2642
if (checkRetry) { bind(*checkRetry); }
2643
addic_(retry_count_Reg, retry_count_Reg, -1);
2644
blt(CCR0, doneRetry);
2645
b(retryLabel);
2646
bind(doneRetry);
2647
}
2648
2649
// Spin and retry if lock is busy.
2650
// inputs: owner_addr_Reg (monitor address)
2651
// : retry_count_Reg
2652
// output: retry_count_Reg decremented by 1
2653
// CTR is killed
2654
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2655
Label SpinLoop, doneRetry, doRetry;
2656
addic_(retry_count_Reg, retry_count_Reg, -1);
2657
blt(CCR0, doneRetry);
2658
2659
if (RTMSpinLoopCount > 1) {
2660
li(R0, RTMSpinLoopCount);
2661
mtctr(R0);
2662
}
2663
2664
// low thread priority
2665
smt_prio_low();
2666
bind(SpinLoop);
2667
2668
if (RTMSpinLoopCount > 1) {
2669
bdz(doRetry);
2670
ld(R0, 0, owner_addr_Reg);
2671
cmpdi(CCR0, R0, 0);
2672
bne(CCR0, SpinLoop);
2673
}
2674
2675
bind(doRetry);
2676
2677
// restore thread priority to default in userspace
2678
#ifdef LINUX
2679
smt_prio_medium_low();
2680
#else
2681
smt_prio_medium();
2682
#endif
2683
2684
b(retryLabel);
2685
2686
bind(doneRetry);
2687
}
2688
2689
// Use RTM for normal stack locks.
2690
// Input: objReg (object to lock)
2691
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2692
Register obj, Register mark_word, Register tmp,
2693
Register retry_on_abort_count_Reg,
2694
RTMLockingCounters* stack_rtm_counters,
2695
Metadata* method_data, bool profile_rtm,
2696
Label& DONE_LABEL, Label& IsInflated) {
2697
assert(UseRTMForStackLocks, "why call this otherwise?");
2698
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2699
Label L_rtm_retry, L_decrement_retry, L_on_abort;
2700
2701
if (RTMRetryCount > 0) {
2702
load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2703
bind(L_rtm_retry);
2704
}
2705
andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
2706
bne(CCR0, IsInflated);
2707
2708
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2709
Label L_noincrement;
2710
if (RTMTotalCountIncrRate > 1) {
2711
branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2712
}
2713
assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2714
load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2715
//atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2716
ldx(mark_word, tmp);
2717
addi(mark_word, mark_word, 1);
2718
stdx(mark_word, tmp);
2719
bind(L_noincrement);
2720
}
2721
tbegin_();
2722
beq(CCR0, L_on_abort);
2723
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.
2724
andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2725
cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked
2726
beq(flag, DONE_LABEL); // all done if unlocked
2727
2728
if (UseRTMXendForLockBusy) {
2729
tend_();
2730
b(L_decrement_retry);
2731
} else {
2732
tabort_();
2733
}
2734
bind(L_on_abort);
2735
const Register abort_status_Reg = tmp;
2736
mftexasr(abort_status_Reg);
2737
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2738
rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2739
}
2740
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2741
if (RTMRetryCount > 0) {
2742
// Retry on lock abort if abort status is not permanent.
2743
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2744
} else {
2745
bind(L_decrement_retry);
2746
}
2747
}
2748
2749
// Use RTM for inflating locks
2750
// inputs: obj (object to lock)
2751
// mark_word (current header - KILLED)
2752
// boxReg (on-stack box address (displaced header location) - KILLED)
2753
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2754
Register obj, Register mark_word, Register boxReg,
2755
Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2756
RTMLockingCounters* rtm_counters,
2757
Metadata* method_data, bool profile_rtm,
2758
Label& DONE_LABEL) {
2759
assert(UseRTMLocking, "why call this otherwise?");
2760
Label L_rtm_retry, L_decrement_retry, L_on_abort;
2761
// Clean monitor_value bit to get valid pointer.
2762
int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2763
2764
// Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2765
std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2766
const Register tmpReg = boxReg;
2767
const Register owner_addr_Reg = mark_word;
2768
addi(owner_addr_Reg, mark_word, owner_offset);
2769
2770
if (RTMRetryCount > 0) {
2771
load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.
2772
load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2773
bind(L_rtm_retry);
2774
}
2775
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2776
Label L_noincrement;
2777
if (RTMTotalCountIncrRate > 1) {
2778
branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2779
}
2780
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2781
load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2782
//atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2783
ldx(tmpReg, R0);
2784
addi(tmpReg, tmpReg, 1);
2785
stdx(tmpReg, R0);
2786
bind(L_noincrement);
2787
}
2788
tbegin_();
2789
beq(CCR0, L_on_abort);
2790
// We don't reload mark word. Will only be reset at safepoint.
2791
ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2792
cmpdi(flag, R0, 0);
2793
beq(flag, DONE_LABEL);
2794
2795
if (UseRTMXendForLockBusy) {
2796
tend_();
2797
b(L_decrement_retry);
2798
} else {
2799
tabort_();
2800
}
2801
bind(L_on_abort);
2802
const Register abort_status_Reg = tmpReg;
2803
mftexasr(abort_status_Reg);
2804
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2805
rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2806
// Restore owner_addr_Reg
2807
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2808
#ifdef ASSERT
2809
andi_(R0, mark_word, markWord::monitor_value);
2810
asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2811
#endif
2812
addi(owner_addr_Reg, mark_word, owner_offset);
2813
}
2814
if (RTMRetryCount > 0) {
2815
// Retry on lock abort if abort status is not permanent.
2816
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2817
}
2818
2819
// Appears unlocked - try to swing _owner from null to non-null.
2820
cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2821
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2822
MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2823
2824
if (RTMRetryCount > 0) {
2825
// success done else retry
2826
b(DONE_LABEL);
2827
bind(L_decrement_retry);
2828
// Spin and retry if lock is busy.
2829
rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2830
} else {
2831
bind(L_decrement_retry);
2832
}
2833
}
2834
2835
#endif // INCLUDE_RTM_OPT
2836
2837
// "The box" is the space on the stack where we copy the object mark.
2838
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2839
Register temp, Register displaced_header, Register current_header,
2840
bool try_bias,
2841
RTMLockingCounters* rtm_counters,
2842
RTMLockingCounters* stack_rtm_counters,
2843
Metadata* method_data,
2844
bool use_rtm, bool profile_rtm) {
2845
assert_different_registers(oop, box, temp, displaced_header, current_header);
2846
assert(flag != CCR0, "bad condition register");
2847
Label cont;
2848
Label object_has_monitor;
2849
Label cas_failed;
2850
2851
// Load markWord from object into displaced_header.
2852
ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2853
2854
if (DiagnoseSyncOnValueBasedClasses != 0) {
2855
load_klass(temp, oop);
2856
lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2857
testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2858
bne(flag, cont);
2859
}
2860
2861
if (try_bias) {
2862
biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2863
}
2864
2865
#if INCLUDE_RTM_OPT
2866
if (UseRTMForStackLocks && use_rtm) {
2867
rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2868
stack_rtm_counters, method_data, profile_rtm,
2869
cont, object_has_monitor);
2870
}
2871
#endif // INCLUDE_RTM_OPT
2872
2873
// Handle existing monitor.
2874
// The object has an existing monitor iff (mark & monitor_value) != 0.
2875
andi_(temp, displaced_header, markWord::monitor_value);
2876
bne(CCR0, object_has_monitor);
2877
2878
// Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2879
ori(displaced_header, displaced_header, markWord::unlocked_value);
2880
2881
// Load Compare Value application register.
2882
2883
// Initialize the box. (Must happen before we update the object mark!)
2884
std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2885
2886
// Must fence, otherwise, preceding store(s) may float below cmpxchg.
2887
// Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2888
cmpxchgd(/*flag=*/flag,
2889
/*current_value=*/current_header,
2890
/*compare_value=*/displaced_header,
2891
/*exchange_value=*/box,
2892
/*where=*/oop,
2893
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2894
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2895
noreg,
2896
&cas_failed,
2897
/*check without membar and ldarx first*/true);
2898
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2899
2900
// If the compare-and-exchange succeeded, then we found an unlocked
2901
// object and we have now locked it.
2902
b(cont);
2903
2904
bind(cas_failed);
2905
// We did not see an unlocked object so try the fast recursive case.
2906
2907
// Check if the owner is self by comparing the value in the markWord of object
2908
// (current_header) with the stack pointer.
2909
sub(current_header, current_header, R1_SP);
2910
load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2911
2912
and_(R0/*==0?*/, current_header, temp);
2913
// If the condition is true we end up at cont with success, and hence we can store 0 as the
2914
// displaced header in the box, which indicates that it is a recursive lock.
2915
mcrf(flag,CCR0);
2916
std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2917
2918
// Handle existing monitor.
2919
b(cont);
2920
2921
bind(object_has_monitor);
2922
// The object's monitor m is unlocked iff m->owner == NULL,
2923
// otherwise m->owner may contain a thread or a stack address.
2924
2925
#if INCLUDE_RTM_OPT
2926
// Use the same RTM locking code in 32- and 64-bit VM.
2927
if (use_rtm) {
2928
rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2929
rtm_counters, method_data, profile_rtm, cont);
2930
} else {
2931
#endif // INCLUDE_RTM_OPT
2932
2933
// Try to CAS m->owner from NULL to current thread.
2934
addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2935
cmpxchgd(/*flag=*/flag,
2936
/*current_value=*/current_header,
2937
/*compare_value=*/(intptr_t)0,
2938
/*exchange_value=*/R16_thread,
2939
/*where=*/temp,
2940
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2941
MacroAssembler::cmpxchgx_hint_acquire_lock());
2942
2943
// Store a non-null value into the box.
2944
std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2945
beq(flag, cont);
2946
2947
// Check for recursive locking.
2948
cmpd(flag, current_header, R16_thread);
2949
bne(flag, cont);
2950
2951
// Current thread already owns the lock. Just increment recursions.
2952
Register recursions = displaced_header;
2953
ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2954
addi(recursions, recursions, 1);
2955
std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
2956
2957
#if INCLUDE_RTM_OPT
2958
} // use_rtm()
2959
#endif
2960
2961
bind(cont);
2962
// flag == EQ indicates success
2963
// flag == NE indicates failure
2964
}
2965
2966
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2967
Register temp, Register displaced_header, Register current_header,
2968
bool try_bias, bool use_rtm) {
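// Fast-path unlock: handles biased locking exit, RTM, recursive stack locks (displaced header == 0),
// the CAS that restores the displaced mark word, and inflated ObjectMonitor exit.
// On exit, flag == EQ means the object was unlocked here, flag == NE means the caller must take the slow path.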
2969
assert_different_registers(oop, box, temp, displaced_header, current_header);
2970
assert(flag != CCR0, "bad condition register");
2971
Label cont, object_has_monitor, notRecursive;
2972
2973
if (try_bias) {
2974
biased_locking_exit(flag, oop, current_header, cont);
2975
}
2976
2977
#if INCLUDE_RTM_OPT
2978
if (UseRTMForStackLocks && use_rtm) {
2979
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2980
Label L_regular_unlock;
2981
ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2982
andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2983
cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked
2984
bne(flag, L_regular_unlock); // else RegularLock
2985
tend_(); // otherwise end...
2986
b(cont); // ... and we're done
2987
bind(L_regular_unlock);
2988
}
2989
#endif
2990
2991
// Find the lock address and load the displaced header from the stack.
2992
ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2993
2994
// If the displaced header is 0, we have a recursive unlock.
2995
cmpdi(flag, displaced_header, 0);
2996
beq(flag, cont);
2997
2998
// Handle existing monitor.
2999
// The object has an existing monitor iff (mark & monitor_value) != 0.
3000
RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
3001
ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
3002
andi_(R0, current_header, markWord::monitor_value);
3003
bne(CCR0, object_has_monitor);
3004
3005
// Check if it is still a light weight lock, this is is true if we see
3006
// the stack address of the basicLock in the markWord of the object.
3007
// Cmpxchg sets flag to cmpd(current_header, box).
3008
cmpxchgd(/*flag=*/flag,
3009
/*current_value=*/current_header,
3010
/*compare_value=*/box,
3011
/*exchange_value=*/displaced_header,
3012
/*where=*/oop,
3013
MacroAssembler::MemBarRel,
3014
MacroAssembler::cmpxchgx_hint_release_lock(),
3015
noreg,
3016
&cont);
3017
3018
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
3019
3020
// Handle existing monitor.
3021
b(cont);
3022
3023
bind(object_has_monitor);
3024
STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3025
addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3026
ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3027
3028
// It's inflated.
3029
#if INCLUDE_RTM_OPT
3030
if (use_rtm) {
3031
Label L_regular_inflated_unlock;
3032
// Clean monitor_value bit to get valid pointer
3033
cmpdi(flag, temp, 0);
3034
bne(flag, L_regular_inflated_unlock);
3035
tend_();
3036
b(cont);
3037
bind(L_regular_inflated_unlock);
3038
}
3039
#endif
3040
3041
ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3042
3043
cmpd(flag, temp, R16_thread);
3044
bne(flag, cont);
3045
3046
addic_(displaced_header, displaced_header, -1);
3047
blt(CCR0, notRecursive); // Not recursive if negative after decrement.
3048
std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3049
b(cont); // flag is already EQ here.
3050
3051
bind(notRecursive);
3052
ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3053
ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3054
orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3055
cmpdi(flag, temp, 0);
3056
bne(flag, cont);
3057
release();
3058
std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3059
3060
bind(cont);
3061
// flag == EQ indicates success
3062
// flag == NE indicates failure
3063
}
3064
3065
void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
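// Read the thread-local polling word. At a return poll it holds the stack watermark:
// take the slow path if SP (or the caller's SP when not in an nmethod) lies above it.
// For a regular poll just test the poll bit.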
3066
ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3067
3068
if (at_return) {
3069
if (in_nmethod) {
3070
if (UseSIGTRAP) {
3071
// Use Signal Handler.
3072
relocate(relocInfo::poll_return_type);
3073
td(traptoGreaterThanUnsigned, R1_SP, temp);
3074
} else {
3075
cmpld(CCR0, R1_SP, temp);
3076
// Stub may be out of range for short conditional branch.
3077
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
3078
}
3079
} else { // Not in nmethod.
3080
// Frame still on stack, need to get fp.
3081
Register fp = R0;
3082
ld(fp, _abi0(callers_sp), R1_SP);
3083
cmpld(CCR0, fp, temp);
3084
bgt(CCR0, slow_path);
3085
}
3086
} else { // Normal safepoint poll. Not at return.
3087
assert(!in_nmethod, "should use load_from_polling_page");
3088
andi_(temp, temp, SafepointMechanism::poll_bit());
3089
bne(CCR0, slow_path);
3090
}
3091
}
3092
3093
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3094
MacroAssembler::PreservationLevel preservation_level) {
3095
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3096
bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3097
}
3098
3099
// Values for last_Java_pc and last_Java_sp must comply with the rules
3100
// in frame_ppc.hpp.
3101
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3102
// Always set last_Java_pc and flags first because once last_Java_sp
3103
// is visible, has_last_Java_frame is true and users will look at the
3104
// rest of the fields. (Note: flags should always be zero before we
3105
// get here so it doesn't need to be set.)
3106
3107
// Verify that last_Java_pc was zeroed on return to Java
3108
asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3109
"last_Java_pc not zeroed before leaving Java");
3110
3111
// When returning from calling out from Java mode the frame anchor's
3112
// last_Java_pc will always be set to NULL. It is set here so that
3113
// if we are doing a call to native (not VM) we capture the
3114
// known pc and don't have to rely on the native call having a
3115
// standard frame linkage where we can find the pc.
3116
if (last_Java_pc != noreg)
3117
std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3118
3119
// Set last_Java_sp last.
3120
std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3121
}
3122
3123
void MacroAssembler::reset_last_Java_frame(void) {
3124
asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3125
R16_thread, "SP was not set, still zero");
3126
3127
BLOCK_COMMENT("reset_last_Java_frame {");
3128
li(R0, 0);
3129
3130
// _last_Java_sp = 0
3131
std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3132
3133
// _last_Java_pc = 0
3134
std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3135
BLOCK_COMMENT("} reset_last_Java_frame");
3136
}
3137
3138
void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3139
assert_different_registers(sp, tmp1);
3140
3141
// sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3142
// TOP_IJAVA_FRAME_ABI.
3143
// FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3144
address entry = pc();
3145
load_const_optimized(tmp1, entry);
3146
3147
set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3148
}
3149
3150
void MacroAssembler::get_vm_result(Register oop_result) {
3151
// Read:
3152
// R16_thread
3153
// R16_thread->in_bytes(JavaThread::vm_result_offset())
3154
//
3155
// Updated:
3156
// oop_result
3157
// R16_thread->in_bytes(JavaThread::vm_result_offset())
3158
3159
verify_thread();
3160
3161
ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3162
li(R0, 0);
3163
std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3164
3165
verify_oop(oop_result, FILE_AND_LINE);
3166
}
3167
3168
void MacroAssembler::get_vm_result_2(Register metadata_result) {
3169
// Read:
3170
// R16_thread
3171
// R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3172
//
3173
// Updated:
3174
// metadata_result
3175
// R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3176
3177
ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3178
li(R0, 0);
3179
std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3180
}
3181
3182
Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
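// Narrow klass = (klass - base) >> shift. Returns the register that holds the encoded value
// (dst, or src if no transformation was needed).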
3183
Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3184
if (CompressedKlassPointers::base() != 0) {
3185
// Use dst as temp if it is free.
3186
sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3187
current = dst;
3188
}
3189
if (CompressedKlassPointers::shift() != 0) {
3190
srdi(dst, current, CompressedKlassPointers::shift());
3191
current = dst;
3192
}
3193
return current;
3194
}
3195
3196
void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3197
if (UseCompressedClassPointers) {
3198
Register compressedKlass = encode_klass_not_null(ck, klass);
3199
stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3200
} else {
3201
std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3202
}
3203
}
3204
3205
void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3206
if (UseCompressedClassPointers) {
3207
if (val == noreg) {
3208
val = R0;
3209
li(val, 0);
3210
}
3211
stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3212
}
3213
}
3214
3215
int MacroAssembler::instr_size_for_decode_klass_not_null() {
3216
static int computed_size = -1;
3217
3218
// Not yet computed?
3219
if (computed_size == -1) {
3220
3221
if (!UseCompressedClassPointers) {
3222
computed_size = 0;
3223
} else {
3224
// Determine by scratch emit.
3225
ResourceMark rm;
3226
int code_size = 8 * BytesPerInstWord;
3227
CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
3228
MacroAssembler* a = new MacroAssembler(&cb);
3229
a->decode_klass_not_null(R11_scratch1);
3230
computed_size = a->offset();
3231
}
3232
}
3233
3234
return computed_size;
3235
}
3236
3237
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
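// Klass = (narrow klass << shift) + base.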
3238
assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3239
if (src == noreg) src = dst;
3240
Register shifted_src = src;
3241
if (CompressedKlassPointers::shift() != 0 ||
3242
CompressedKlassPointers::base() == 0 && src != dst) { // Move required.
3243
shifted_src = dst;
3244
sldi(shifted_src, src, CompressedKlassPointers::shift());
3245
}
3246
if (CompressedKlassPointers::base() != 0) {
3247
add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3248
}
3249
}
3250
3251
void MacroAssembler::load_klass(Register dst, Register src) {
3252
if (UseCompressedClassPointers) {
3253
lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3254
// Attention: no null check here!
3255
decode_klass_not_null(dst, dst);
3256
} else {
3257
ld(dst, oopDesc::klass_offset_in_bytes(), src);
3258
}
3259
}
3260
3261
// ((OopHandle)result).resolve();
3262
void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3263
MacroAssembler::PreservationLevel preservation_level) {
3264
access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3265
}
3266
3267
void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3268
MacroAssembler::PreservationLevel preservation_level) {
3269
Label resolved;
3270
3271
// A null weak handle resolves to null.
3272
cmpdi(CCR0, result, 0);
3273
beq(CCR0, resolved);
3274
3275
access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3276
preservation_level);
3277
bind(resolved);
3278
}
3279
3280
void MacroAssembler::load_method_holder(Register holder, Register method) {
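// holder = method->constMethod()->constants()->pool_holder()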
3281
ld(holder, in_bytes(Method::const_offset()), method);
3282
ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3283
ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3284
}
3285
3286
// Clear Array
3287
// For very short arrays. tmp == R0 is allowed.
3288
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3289
if (cnt_dwords > 0) { li(tmp, 0); }
3290
for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3291
}
3292
3293
// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3294
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3295
if (cnt_dwords < 8) {
3296
clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3297
return;
3298
}
3299
3300
Label loop;
3301
const long loopcnt = cnt_dwords >> 1,
3302
remainder = cnt_dwords & 1;
3303
3304
li(tmp, loopcnt);
3305
mtctr(tmp);
3306
li(tmp, 0);
3307
bind(loop);
3308
std(tmp, 0, base_ptr);
3309
std(tmp, 8, base_ptr);
3310
addi(base_ptr, base_ptr, 16);
3311
bdnz(loop);
3312
if (remainder) { std(tmp, 0, base_ptr); }
3313
}
3314
3315
// Kills both input registers. tmp == R0 is allowed.
3316
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3317
// Procedure for large arrays (uses data cache block zero instruction).
3318
Label startloop, fast, fastloop, small_rest, restloop, done;
3319
const int cl_size = VM_Version::L1_data_cache_line_size(),
3320
cl_dwords = cl_size >> 3,
3321
cl_dw_addr_bits = exact_log2(cl_dwords),
3322
dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
3323
min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
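// min_cnt guarantees that, even after spending up to cl_dwords-1 dwords on alignment,
// at least dcbz_min full cache lines remain for the dcbz loop.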
3324
3325
if (const_cnt >= 0) {
3326
// Constant case.
3327
if (const_cnt < min_cnt) {
3328
clear_memory_constlen(base_ptr, const_cnt, tmp);
3329
return;
3330
}
3331
load_const_optimized(cnt_dwords, const_cnt, tmp);
3332
} else {
3333
// cnt_dwords already loaded in register. Need to check size.
3334
cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3335
blt(CCR1, small_rest);
3336
}
3337
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3338
beq(CCR0, fast); // Already 128byte aligned.
3339
3340
subfic(tmp, tmp, cl_dwords);
3341
mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3342
subf(cnt_dwords, tmp, cnt_dwords); // rest.
3343
li(tmp, 0);
3344
3345
bind(startloop); // Clear at the beginning to reach 128byte boundary.
3346
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3347
addi(base_ptr, base_ptr, 8);
3348
bdnz(startloop);
3349
3350
bind(fast); // Clear 128byte blocks.
3351
srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
3352
andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3353
mtctr(tmp); // Load counter.
3354
3355
bind(fastloop);
3356
dcbz(base_ptr); // Clear 128byte aligned block.
3357
addi(base_ptr, base_ptr, cl_size);
3358
bdnz(fastloop);
3359
3360
bind(small_rest);
3361
cmpdi(CCR0, cnt_dwords, 0); // size 0?
3362
beq(CCR0, done); // rest == 0
3363
li(tmp, 0);
3364
mtctr(cnt_dwords); // Load counter.
3365
3366
bind(restloop); // Clear rest.
3367
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3368
addi(base_ptr, base_ptr, 8);
3369
bdnz(restloop);
3370
3371
bind(done);
3372
}
3373
3374
/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3375
3376
// Helpers for Intrinsic Emitters
3377
//
3378
// Revert the byte order of a 32bit value in a register
3379
// src: 0x44556677
3380
// dst: 0x77665544
3381
// Three steps to obtain the result:
3382
// 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3383
// into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3384
// This value initializes dst.
3385
// 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3386
// byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3387
// This value is mask inserted into dst with a [0..23] mask of 1s.
3388
// 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3389
// This value is mask inserted into dst with a [8..15] mask of 1s.
3390
void MacroAssembler::load_reverse_32(Register dst, Register src) {
3391
assert_different_registers(dst, src);
3392
3393
rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3394
rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3395
rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3396
}
3397
3398
// Calculate the column addresses of the crc32 lookup table into distinct registers.
3399
// This loop-invariant calculation is moved out of the loop body, reducing the loop
3400
// body size from 20 to 16 instructions.
3401
// Returns the offset that was used to calculate the address of column tc3.
3402
// Due to register shortage, setting tc3 may overwrite table. With the return offset
3403
// at hand, the original table address can be easily reconstructed.
3404
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3405
assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3406
3407
// Point to 4 byte folding tables (byte-reversed version for Big Endian)
3408
// Layout: See StubRoutines::ppc::generate_crc_constants.
3409
#ifdef VM_LITTLE_ENDIAN
3410
const int ix0 = 3 * CRC32_TABLE_SIZE;
3411
const int ix1 = 2 * CRC32_TABLE_SIZE;
3412
const int ix2 = 1 * CRC32_TABLE_SIZE;
3413
const int ix3 = 0 * CRC32_TABLE_SIZE;
3414
#else
3415
const int ix0 = 1 * CRC32_TABLE_SIZE;
3416
const int ix1 = 2 * CRC32_TABLE_SIZE;
3417
const int ix2 = 3 * CRC32_TABLE_SIZE;
3418
const int ix3 = 4 * CRC32_TABLE_SIZE;
3419
#endif
3420
assert_different_registers(table, tc0, tc1, tc2);
3421
assert(table == tc3, "must be!");
3422
3423
addi(tc0, table, ix0);
3424
addi(tc1, table, ix1);
3425
addi(tc2, table, ix2);
3426
if (ix3 != 0) addi(tc3, table, ix3);
3427
3428
return ix3;
3429
}
3430
3431
/**
3432
* uint32_t crc;
3433
* table[crc & 0xFF] ^ (crc >> 8);
3434
*/
3435
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3436
assert_different_registers(crc, table, tmp);
3437
assert_different_registers(val, table);
3438
3439
if (crc == val) { // Must rotate first to use the unmodified value.
3440
rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3441
// As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3442
srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3443
} else {
3444
srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3445
rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3446
}
3447
lwzx(tmp, table, tmp);
3448
xorr(crc, crc, tmp);
3449
}
3450
3451
/**
3452
* Emits code to update CRC-32 with a byte value according to constants in table.
3453
*
3454
* @param [in,out]crc Register containing the crc.
3455
* @param [in]val Register containing the byte to fold into the CRC.
3456
* @param [in]table Register containing the table of crc constants.
3457
*
3458
* uint32_t crc;
3459
* val = crc_table[(val ^ crc) & 0xFF];
3460
* crc = val ^ (crc >> 8);
3461
*/
3462
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3463
BLOCK_COMMENT("update_byte_crc32:");
3464
xorr(val, val, crc);
3465
fold_byte_crc32(crc, val, table, val);
3466
}
3467
3468
/**
3469
* @param crc register containing existing CRC (32-bit)
3470
* @param buf register pointing to input byte buffer (byte*)
3471
* @param len register containing number of bytes
3472
* @param table register pointing to CRC table
3473
*/
3474
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3475
Register data, bool loopAlignment) {
3476
assert_different_registers(crc, buf, len, table, data);
3477
3478
Label L_mainLoop, L_done;
3479
const int mainLoop_stepping = 1;
3480
const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3481
3482
// Process all bytes in a single-byte loop.
3483
clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3484
beq(CCR0, L_done);
3485
3486
mtctr(len);
3487
align(mainLoop_alignment);
3488
BIND(L_mainLoop);
3489
lbz(data, 0, buf); // Byte from buffer, zero-extended.
3490
addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3491
update_byte_crc32(crc, data, table);
3492
bdnz(L_mainLoop); // Iterate.
3493
3494
bind(L_done);
3495
}
3496
3497
/**
3498
* Emits code to update CRC-32 with a 4-byte value according to constants in table
3499
* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3500
*/
3501
// A note on the lookup table address(es):
3502
// The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3503
// To save the effort of adding the column offset to the table address each time
3504
// a table element is looked up, it is possible to pass the pre-calculated
3505
// column addresses.
3506
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3507
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3508
Register t0, Register t1, Register t2, Register t3,
3509
Register tc0, Register tc1, Register tc2, Register tc3) {
3510
assert_different_registers(crc, t3);
3511
3512
// XOR crc with next four bytes of buffer.
3513
lwz(t3, bufDisp, buf);
3514
if (bufInc != 0) {
3515
addi(buf, buf, bufInc);
3516
}
3517
xorr(t3, t3, crc);
3518
3519
// Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3520
rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2
3521
rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2
3522
rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
3523
rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
3524
3525
// Use the pre-calculated column addresses.
3526
// Load pre-calculated table values.
3527
lwzx(t0, tc0, t0);
3528
lwzx(t1, tc1, t1);
3529
lwzx(t2, tc2, t2);
3530
lwzx(t3, tc3, t3);
3531
3532
// Calculate new crc from table values.
3533
xorr(t0, t0, t1);
3534
xorr(t2, t2, t3);
3535
xorr(crc, t0, t2); // Now crc contains the final checksum value.
3536
}
3537
3538
/**
3539
* @param crc register containing existing CRC (32-bit)
3540
* @param buf register pointing to input byte buffer (byte*)
3541
* @param len register containing number of bytes
3542
* @param table register pointing to CRC table
3543
*
3544
* uses R9..R12 as work registers. Must be saved/restored by caller!
3545
*/
3546
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3547
Register t0, Register t1, Register t2, Register t3,
3548
Register tc0, Register tc1, Register tc2, Register tc3,
3549
bool invertCRC) {
3550
assert_different_registers(crc, buf, len, table);
3551
3552
Label L_mainLoop, L_tail;
3553
Register tmp = t0;
3554
Register data = t0;
3555
Register tmp2 = t1;
3556
const int mainLoop_stepping = 4;
3557
const int tailLoop_stepping = 1;
3558
const int log_stepping = exact_log2(mainLoop_stepping);
3559
const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3560
const int complexThreshold = 2*mainLoop_stepping;
3561
3562
// Don't test for len <= 0 here. This pathological case should not occur anyway.
3563
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3564
// for all well-behaved cases. The situation itself is detected and handled correctly
3565
// within update_byteLoop_crc32.
3566
assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3567
3568
BLOCK_COMMENT("kernel_crc32_1word {");
3569
3570
if (invertCRC) {
3571
nand(crc, crc, crc); // 1s complement of crc
3572
}
3573
3574
// Check for short (<mainLoop_stepping) buffer.
3575
cmpdi(CCR0, len, complexThreshold);
3576
blt(CCR0, L_tail);
3577
3578
// Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3579
// We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3580
{
3581
// Align buf addr to mainLoop_stepping boundary.
3582
neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3583
rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 by 0 bits and mask with 1s in bits 62..63.
3584
3585
if (complexThreshold > mainLoop_stepping) {
3586
sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3587
} else {
3588
sub(tmp, len, tmp2); // Remaining bytes for main loop.
3589
cmpdi(CCR0, tmp, mainLoop_stepping);
3590
blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
3591
mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3592
}
3593
update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3594
}
3595
3596
srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3597
andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3598
mtctr(tmp2);
3599
3600
#ifdef VM_LITTLE_ENDIAN
3601
Register crc_rv = crc;
3602
#else
3603
Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3604
// Occupies tmp, but frees up crc.
3605
load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3606
tmp = crc;
3607
#endif
3608
3609
int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3610
3611
align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3612
BIND(L_mainLoop);
3613
update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3614
bdnz(L_mainLoop);
3615
3616
#ifndef VM_LITTLE_ENDIAN
3617
load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3618
tmp = crc_rv; // Tmp uses its original register again.
3619
#endif
3620
3621
// Restore original table address for tailLoop.
3622
if (reconstructTableOffset != 0) {
3623
addi(table, table, -reconstructTableOffset);
3624
}
3625
3626
// Process last few (<complexThreshold) bytes of buffer.
3627
BIND(L_tail);
3628
update_byteLoop_crc32(crc, buf, len, table, data, false);
3629
3630
if (invertCRC) {
3631
nand(crc, crc, crc); // 1s complement of crc
3632
}
3633
BLOCK_COMMENT("} kernel_crc32_1word");
3634
}
3635
3636
/**
3637
* @param crc register containing existing CRC (32-bit)
3638
* @param buf register pointing to input byte buffer (byte*)
3639
* @param len register containing number of bytes
3640
* @param constants register pointing to precomputed constants
3641
* @param t0-t6 temp registers
3642
*/
3643
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3644
Register t0, Register t1, Register t2, Register t3,
3645
Register t4, Register t5, Register t6, bool invertCRC) {
3646
assert_different_registers(crc, buf, len, constants);
3647
3648
Label L_tail;
3649
3650
BLOCK_COMMENT("kernel_crc32_vpmsum {");
3651
3652
if (invertCRC) {
3653
nand(crc, crc, crc); // 1s complement of crc
3654
}
3655
3656
// Enforce 32 bit.
3657
clrldi(len, len, 32);
3658
3659
// Align if we have enough bytes for the fast version.
3660
const int alignment = 16,
3661
threshold = 32;
3662
Register prealign = t0;
3663
3664
neg(prealign, buf);
3665
addi(t1, len, -threshold);
3666
andi(prealign, prealign, alignment - 1);
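// prealign = (-buf) & (alignment-1): number of bytes up to the next 16-byte boundary (0 if already aligned).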
3667
cmpw(CCR0, t1, prealign);
3668
blt(CCR0, L_tail); // len - prealign < threshold?
3669
3670
subf(len, prealign, len);
3671
update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3672
3673
// Calculate from first aligned address as far as possible.
3674
addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3675
kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3676
addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3677
3678
// Remaining bytes.
3679
BIND(L_tail);
3680
update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3681
3682
if (invertCRC) {
3683
nand(crc, crc, crc); // 1s complement of crc
3684
}
3685
3686
BLOCK_COMMENT("} kernel_crc32_vpmsum");
3687
}
3688
3689
/**
3690
* @param crc register containing existing CRC (32-bit)
3691
* @param buf register pointing to input byte buffer (byte*)
3692
* @param len register containing number of bytes (will get updated to remaining bytes)
3693
* @param constants register pointing to CRC table for 128-bit aligned memory
3694
* @param t0-t6 temp registers
3695
*/
3696
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3697
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3698
3699
// Save non-volatile vector registers (frameless).
3700
Register offset = t1;
3701
int offsetInt = 0;
3702
offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3703
offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3704
offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3705
offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3706
offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3707
offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3708
#ifndef VM_LITTLE_ENDIAN
3709
offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3710
#endif
3711
offsetInt -= 8; std(R14, offsetInt, R1_SP);
3712
offsetInt -= 8; std(R15, offsetInt, R1_SP);
3713
3714
// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3715
// bytes per iteration. The basic scheme is:
3716
// lvx: load vector (Big Endian needs reversal)
3717
// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3718
// vxor: xor partial results together to get unroll_factor2 vectors
3719
3720
// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3721
3722
// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3723
const int unroll_factor = CRC32_UNROLL_FACTOR,
3724
unroll_factor2 = CRC32_UNROLL_FACTOR2;
3725
3726
const int outer_consts_size = (unroll_factor2 - 1) * 16,
3727
inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3728
3729
// Support registers.
3730
Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3731
Register num_bytes = R14,
3732
loop_count = R15,
3733
cur_const = crc; // will live in VCRC
3734
// Constant array for outer loop: unroll_factor2 - 1 registers,
3735
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3736
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3737
consts1[] = { VR23, VR24 };
3738
// Data register arrays: 2 arrays with unroll_factor2 registers.
3739
VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3740
data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3741
3742
VectorRegister VCRC = data0[0];
3743
VectorRegister Vc = VR25;
3744
VectorRegister swap_bytes = VR26; // Only for Big Endian.
3745
3746
// We have at least 1 iteration (ensured by caller).
3747
Label L_outer_loop, L_inner_loop, L_last;
3748
3749
// If supported, set DSCR pre-fetch to deepest.
3750
if (VM_Version::has_mfdscr()) {
3751
load_const_optimized(t0, VM_Version::_dscr_val | 7);
3752
mtdscr(t0);
3753
}
3754
3755
mtvrwz(VCRC, crc); // crc lives in VCRC, now
3756
3757
for (int i = 1; i < unroll_factor2; ++i) {
3758
li(offs[i], 16 * i);
3759
}
3760
3761
// Load consts for outer loop
3762
lvx(consts0[0], constants);
3763
for (int i = 1; i < unroll_factor2 - 1; ++i) {
3764
lvx(consts0[i], offs[i], constants);
3765
}
3766
3767
load_const_optimized(num_bytes, 16 * unroll_factor);
3768
3769
// Reuse data registers outside of the loop.
3770
VectorRegister Vtmp = data1[0];
3771
VectorRegister Vtmp2 = data1[1];
3772
VectorRegister zeroes = data1[2];
3773
3774
vspltisb(Vtmp, 0);
3775
vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3776
3777
// Load vector for vpermxor (to xor both 64 bit parts together)
3778
lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
3779
vspltisb(Vc, 4);
3780
vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3781
xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3782
vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3783
3784
#ifdef VM_LITTLE_ENDIAN
3785
#define BE_swap_bytes(x)
3786
#else
3787
vspltisb(Vtmp2, 0xf);
3788
vxor(swap_bytes, Vtmp, Vtmp2);
3789
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3790
#endif
3791
3792
cmpd(CCR0, len, num_bytes);
3793
blt(CCR0, L_last);
3794
3795
addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3796
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3797
3798
// ********** Main loop start **********
3799
align(32);
3800
bind(L_outer_loop);
3801
3802
// Begin of unrolled first iteration (no xor).
3803
lvx(data1[0], buf);
3804
for (int i = 1; i < unroll_factor2 / 2; ++i) {
3805
lvx(data1[i], offs[i], buf);
3806
}
3807
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3808
lvx(consts1[0], cur_const);
3809
mtctr(loop_count);
3810
for (int i = 0; i < unroll_factor2 / 2; ++i) {
3811
BE_swap_bytes(data1[i]);
3812
if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3813
lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3814
vpmsumw(data0[i], data1[i], consts1[0]);
3815
}
3816
addi(buf, buf, 16 * unroll_factor2);
3817
subf(len, num_bytes, len);
3818
lvx(consts1[1], offs[1], cur_const);
3819
addi(cur_const, cur_const, 32);
3820
// Begin of unrolled second iteration (head).
3821
for (int i = 0; i < unroll_factor2 / 2; ++i) {
3822
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3823
if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3824
vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3825
}
3826
for (int i = 0; i < unroll_factor2 / 2; ++i) {
3827
BE_swap_bytes(data1[i]);
3828
lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3829
vpmsumw(data1[i], data1[i], consts1[1]);
3830
}
3831
addi(buf, buf, 16 * unroll_factor2);
3832
3833
// Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3834
// Double-iteration allows using the 2 constant registers alternatingly.
3835
align(32);
3836
bind(L_inner_loop);
3837
for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3838
if (j & 1) {
3839
lvx(consts1[0], cur_const);
3840
} else {
3841
lvx(consts1[1], offs[1], cur_const);
3842
addi(cur_const, cur_const, 32);
3843
}
3844
for (int i = 0; i < unroll_factor2; ++i) {
3845
int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3846
if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3847
BE_swap_bytes(data1[idx]);
3848
vxor(data0[i], data0[i], data1[i]);
3849
if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3850
vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3851
}
3852
addi(buf, buf, 16 * unroll_factor2);
3853
}
3854
bdnz(L_inner_loop);
3855
3856
addi(cur_const, constants, outer_consts_size); // Reset
3857
3858
// Tail of last iteration (no loads).
3859
for (int i = 0; i < unroll_factor2 / 2; ++i) {
3860
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3861
vxor(data0[i], data0[i], data1[i]);
3862
vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3863
}
3864
for (int i = 0; i < unroll_factor2 / 2; ++i) {
3865
vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3866
vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3867
}
3868
3869
// Last data register is ok, other ones need fixup shift.
3870
for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3871
vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3872
}
3873
3874
// Combine to 128 bit result vector VCRC = data0[0].
3875
for (int i = 1; i < unroll_factor2; i<<=1) {
3876
for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3877
vxor(data0[j], data0[j], data0[j+i]);
3878
}
3879
}
3880
cmpd(CCR0, len, num_bytes);
3881
bge(CCR0, L_outer_loop);
3882
3883
// Last chance with lower num_bytes.
3884
bind(L_last);
3885
srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3886
// Point behind last const for inner loop.
3887
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3888
sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3889
clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3890
subf(cur_const, R0, cur_const); // Point to constant to be used first.
3891
3892
addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3893
bgt(CCR0, L_outer_loop);
3894
// ********** Main loop end **********
3895
3896
// Restore DSCR pre-fetch value.
3897
if (VM_Version::has_mfdscr()) {
3898
load_const_optimized(t0, VM_Version::_dscr_val);
3899
mtdscr(t0);
3900
}
3901
3902
// ********** Simple loop for remaining 16 byte blocks **********
3903
{
3904
Label L_loop, L_done;
3905
3906
srdi_(t0, len, 4); // 16 bytes per iteration
3907
clrldi(len, len, 64-4);
3908
beq(CCR0, L_done);
3909
3910
// Point to const (same as last const for inner loop).
3911
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3912
mtctr(t0);
3913
lvx(Vtmp2, cur_const);
3914
3915
align(32);
3916
bind(L_loop);
3917
3918
lvx(Vtmp, buf);
3919
addi(buf, buf, 16);
3920
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3921
BE_swap_bytes(Vtmp);
3922
vxor(VCRC, VCRC, Vtmp);
3923
vpmsumw(VCRC, VCRC, Vtmp2);
3924
bdnz(L_loop);
3925
3926
bind(L_done);
3927
}
3928
// ********** Simple loop end **********
3929
#undef BE_swap_bytes
3930
3931
// Point to Barrett constants
3932
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3933
3934
vspltisb(zeroes, 0);
3935
3936
// Combine to 64 bit result.
3937
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3938
3939
// Reduce to 32 bit CRC: Remainder by multiply-high.
3940
lvx(Vtmp, cur_const);
3941
vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
3942
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
3943
vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3944
vsldoi(Vtmp, zeroes, Vtmp, 8);
3945
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
3946
vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
3947
3948
// Move result. len is already updated.
3949
vsldoi(VCRC, VCRC, zeroes, 8);
3950
mfvrd(crc, VCRC);
3951
3952
// Restore non-volatile Vector registers (frameless).
3953
offsetInt = 0;
3954
offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3955
offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3956
offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3957
offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3958
offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3959
offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3960
#ifndef VM_LITTLE_ENDIAN
3961
offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3962
#endif
3963
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
3964
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
3965
}
3966
3967
void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3968
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
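// Select the table matching the requested polynomial, then dispatch to the vector (vpmsum)
// kernel if available, else to the table-driven 1-word kernel. Only plain CRC32 uses the
// 1s-complement pre-/post-processing (invertCRC = !is_crc32c).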
3969
load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3970
: StubRoutines::crc_table_addr() , R0);
3971
3972
if (VM_Version::has_vpmsumb()) {
3973
kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3974
} else {
3975
kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3976
}
3977
}
3978
3979
void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3980
assert_different_registers(crc, val, table);
3981
3982
BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3983
if (invertCRC) {
3984
nand(crc, crc, crc); // 1s complement of crc
3985
}
3986
3987
update_byte_crc32(crc, val, table);
3988
3989
if (invertCRC) {
3990
nand(crc, crc, crc); // 1s complement of crc
3991
}
3992
}
3993
3994
// dest_lo += src1 + src2
3995
// dest_hi += carry1 + carry2
3996
void MacroAssembler::add2_with_carry(Register dest_hi,
3997
Register dest_lo,
3998
Register src1, Register src2) {
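// R0 is used as zero; each adde folds the carry bit produced by the preceding addc into dest_hi.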
3999
li(R0, 0);
4000
addc(dest_lo, dest_lo, src1);
4001
adde(dest_hi, dest_hi, R0);
4002
addc(dest_lo, dest_lo, src2);
4003
adde(dest_hi, dest_hi, R0);
4004
}
4005
4006
// Multiply 64 bit by 64 bit first loop.
4007
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4008
Register x_xstart,
4009
Register y, Register y_idx,
4010
Register z,
4011
Register carry,
4012
Register product_high, Register product,
4013
Register idx, Register kdx,
4014
Register tmp) {
4015
// jlong carry, x[], y[], z[];
4016
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4017
// huge_128 product = y[idx] * x[xstart] + carry;
4018
// z[kdx] = (jlong)product;
4019
// carry = (jlong)(product >>> 64);
4020
// }
4021
// z[xstart] = carry;
4022
4023
Label L_first_loop, L_first_loop_exit;
4024
Label L_one_x, L_one_y, L_multiply;
4025
4026
addic_(xstart, xstart, -1);
4027
blt(CCR0, L_one_x); // Special case: length of x is 1.
4028
4029
// Load next two integers of x.
4030
sldi(tmp, xstart, LogBytesPerInt);
4031
ldx(x_xstart, x, tmp);
4032
#ifdef VM_LITTLE_ENDIAN
4033
rldicl(x_xstart, x_xstart, 32, 0);
4034
#endif
4035
4036
align(32, 16);
4037
bind(L_first_loop);
4038
4039
cmpdi(CCR0, idx, 1);
4040
blt(CCR0, L_first_loop_exit);
4041
addi(idx, idx, -2);
4042
beq(CCR0, L_one_y);
4043
4044
// Load next two integers of y.
4045
sldi(tmp, idx, LogBytesPerInt);
4046
ldx(y_idx, y, tmp);
4047
#ifdef VM_LITTLE_ENDIAN
4048
rldicl(y_idx, y_idx, 32, 0);
4049
#endif
4050
4051
4052
bind(L_multiply);
4053
multiply64(product_high, product, x_xstart, y_idx);
4054
4055
li(tmp, 0);
4056
addc(product, product, carry); // Add carry to result.
4057
adde(product_high, product_high, tmp); // Add carry of the last addition.
4058
addi(kdx, kdx, -2);
4059
4060
// Store result.
4061
#ifdef VM_LITTLE_ENDIAN
4062
rldicl(product, product, 32, 0);
4063
#endif
4064
sldi(tmp, kdx, LogBytesPerInt);
4065
stdx(product, z, tmp);
4066
mr_if_needed(carry, product_high);
4067
b(L_first_loop);
4068
4069
4070
bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4071
4072
lwz(y_idx, 0, y);
4073
b(L_multiply);
4074
4075
4076
bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4077
4078
lwz(x_xstart, 0, x);
4079
b(L_first_loop);
4080
4081
bind(L_first_loop_exit);
4082
}
4083
4084
// Multiply 64 bit by 64 bit and add 128 bit.
4085
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4086
Register z, Register yz_idx,
4087
Register idx, Register carry,
4088
Register product_high, Register product,
4089
Register tmp, int offset) {
4090
4091
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4092
// z[kdx] = (jlong)product;
4093
4094
sldi(tmp, idx, LogBytesPerInt);
4095
if (offset) {
4096
addi(tmp, tmp, offset);
4097
}
4098
ldx(yz_idx, y, tmp);
4099
#ifdef VM_LITTLE_ENDIAN
4100
rldicl(yz_idx, yz_idx, 32, 0);
4101
#endif
4102
4103
multiply64(product_high, product, x_xstart, yz_idx);
4104
ldx(yz_idx, z, tmp);
4105
#ifdef VM_LITTLE_ENDIAN
4106
rldicl(yz_idx, yz_idx, 32, 0);
4107
#endif
4108
4109
add2_with_carry(product_high, product, carry, yz_idx);
4110
4111
sldi(tmp, idx, LogBytesPerInt);
4112
if (offset) {
4113
addi(tmp, tmp, offset);
4114
}
4115
#ifdef VM_LITTLE_ENDIAN
4116
rldicl(product, product, 32, 0);
4117
#endif
4118
stdx(product, z, tmp);
4119
}
4120
4121
// Multiply 128 bit by 128 bit. Unrolled inner loop.
4122
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4123
Register y, Register z,
4124
Register yz_idx, Register idx, Register carry,
4125
Register product_high, Register product,
4126
Register carry2, Register tmp) {
4127
4128
// jlong carry, x[], y[], z[];
4129
// int kdx = ystart+1;
4130
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4131
// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4132
// z[kdx+idx+1] = (jlong)product;
4133
// jlong carry2 = (jlong)(product >>> 64);
4134
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4135
// z[kdx+idx] = (jlong)product;
4136
// carry = (jlong)(product >>> 64);
4137
// }
4138
// idx += 2;
4139
// if (idx > 0) {
4140
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4141
// z[kdx+idx] = (jlong)product;
4142
// carry = (jlong)(product >>> 64);
4143
// }
4144
4145
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4146
const Register jdx = R0;
4147
4148
// Scale the index.
4149
srdi_(jdx, idx, 2);
4150
beq(CCR0, L_third_loop_exit);
4151
mtctr(jdx);
4152
4153
align(32, 16);
4154
bind(L_third_loop);
4155
4156
addi(idx, idx, -4);
4157
4158
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4159
mr_if_needed(carry2, product_high);
4160
4161
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4162
mr_if_needed(carry, product_high);
4163
bdnz(L_third_loop);
4164
4165
bind(L_third_loop_exit); // Handle any left-over operand parts.
4166
4167
andi_(idx, idx, 0x3);
4168
beq(CCR0, L_post_third_loop_done);
4169
4170
Label L_check_1;
4171
4172
addic_(idx, idx, -2);
4173
blt(CCR0, L_check_1);
4174
4175
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4176
mr_if_needed(carry, product_high);
4177
4178
bind(L_check_1);
4179
4180
addi(idx, idx, 0x2);
4181
andi_(idx, idx, 0x1);
4182
addic_(idx, idx, -1);
4183
blt(CCR0, L_post_third_loop_done);
4184
4185
sldi(tmp, idx, LogBytesPerInt);
4186
lwzx(yz_idx, y, tmp);
4187
multiply64(product_high, product, x_xstart, yz_idx);
4188
lwzx(yz_idx, z, tmp);
4189
4190
add2_with_carry(product_high, product, yz_idx, carry);
4191
4192
sldi(tmp, idx, LogBytesPerInt);
4193
stwx(product, z, tmp);
4194
srdi(product, product, 32);
4195
4196
sldi(product_high, product_high, 32);
4197
orr(product, product, product_high);
4198
mr_if_needed(carry, product);
4199
4200
bind(L_post_third_loop_done);
4201
} // multiply_128_x_128_loop
4202
4203
void MacroAssembler::muladd(Register out, Register in,
4204
Register offset, Register len, Register k,
4205
Register tmp1, Register tmp2, Register carry) {
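// For each 32-bit word of 'in', processed from the last word towards the first:
// sum = in_word * k + out_word + carry; the low 32 bits go back to 'out', carry = sum >> 32.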
4206
4207
// Labels
4208
Label LOOP, SKIP;
4209
4210
// Make sure length is positive.
4211
cmpdi (CCR0, len, 0);
4212
4213
// Prepare variables
4214
subi (offset, offset, 4);
4215
li (carry, 0);
4216
ble (CCR0, SKIP);
4217
4218
mtctr (len);
4219
subi (len, len, 1 );
4220
sldi (len, len, 2 );
4221
4222
// Main loop
4223
bind(LOOP);
4224
lwzx (tmp1, len, in );
4225
lwzx (tmp2, offset, out );
4226
mulld (tmp1, tmp1, k );
4227
add (tmp2, carry, tmp2 );
4228
add (tmp2, tmp1, tmp2 );
4229
stwx (tmp2, offset, out );
4230
srdi (carry, tmp2, 32 );
4231
subi (offset, offset, 4 );
4232
subi (len, len, 4 );
4233
bdnz (LOOP);
4234
bind(SKIP);
4235
}
4236
4237
void MacroAssembler::multiply_to_len(Register x, Register xlen,
4238
Register y, Register ylen,
4239
Register z, Register zlen,
4240
Register tmp1, Register tmp2,
4241
Register tmp3, Register tmp4,
4242
Register tmp5, Register tmp6,
4243
Register tmp7, Register tmp8,
4244
Register tmp9, Register tmp10,
4245
Register tmp11, Register tmp12,
4246
Register tmp13) {
4247
4248
ShortBranchVerifier sbv(this);
4249
4250
assert_different_registers(x, xlen, y, ylen, z, zlen,
4251
tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4252
assert_different_registers(x, xlen, y, ylen, z, zlen,
4253
tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4254
assert_different_registers(x, xlen, y, ylen, z, zlen,
4255
tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4256
4257
const Register idx = tmp1;
4258
const Register kdx = tmp2;
4259
const Register xstart = tmp3;
4260
4261
const Register y_idx = tmp4;
4262
const Register carry = tmp5;
4263
const Register product = tmp6;
4264
const Register product_high = tmp7;
4265
const Register x_xstart = tmp8;
4266
const Register tmp = tmp9;
4267
4268
// First Loop.
4269
//
4270
// final static long LONG_MASK = 0xffffffffL;
4271
// int xstart = xlen - 1;
4272
// int ystart = ylen - 1;
4273
// long carry = 0;
4274
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4275
// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4276
// z[kdx] = (int)product;
4277
// carry = product >>> 32;
4278
// }
4279
// z[xstart] = (int)carry;
4280
4281
mr_if_needed(idx, ylen); // idx = ylen
4282
mr_if_needed(kdx, zlen); // kdx = xlen + ylen
4283
li(carry, 0); // carry = 0
4284
4285
Label L_done;
4286
4287
addic_(xstart, xlen, -1);
4288
blt(CCR0, L_done);
4289
4290
multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4291
carry, product_high, product, idx, kdx, tmp);
4292
4293
Label L_second_loop;
4294
4295
cmpdi(CCR0, kdx, 0);
4296
beq(CCR0, L_second_loop);
4297
4298
Label L_carry;
4299
4300
addic_(kdx, kdx, -1);
4301
beq(CCR0, L_carry);
4302
4303
// Store lower 32 bits of carry.
4304
sldi(tmp, kdx, LogBytesPerInt);
4305
stwx(carry, z, tmp);
4306
srdi(carry, carry, 32);
4307
addi(kdx, kdx, -1);
4308
4309
4310
bind(L_carry);
4311
4312
// Store upper 32 bits of carry.
4313
sldi(tmp, kdx, LogBytesPerInt);
4314
stwx(carry, z, tmp);
4315
4316
// Second and third (nested) loops.
4317
//
4318
// for (int i = xstart-1; i >= 0; i--) { // Second loop
4319
// carry = 0;
4320
// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4321
// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4322
// (z[k] & LONG_MASK) + carry;
4323
// z[k] = (int)product;
4324
// carry = product >>> 32;
4325
// }
4326
// z[i] = (int)carry;
4327
// }
4328
//
4329
// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4330
4331
bind(L_second_loop);
4332
4333
li(carry, 0); // carry = 0;
4334
4335
addic_(xstart, xstart, -1); // i = xstart-1;
4336
blt(CCR0, L_done);
4337
4338
Register zsave = tmp10;
4339
4340
mr(zsave, z);
4341
4342
4343
Label L_last_x;
4344
4345
sldi(tmp, xstart, LogBytesPerInt);
4346
add(z, z, tmp); // z = z + k - j
4347
addi(z, z, 4);
4348
addic_(xstart, xstart, -1); // i = xstart-1;
4349
blt(CCR0, L_last_x);
4350
4351
sldi(tmp, xstart, LogBytesPerInt);
4352
ldx(x_xstart, x, tmp);
4353
#ifdef VM_LITTLE_ENDIAN
4354
rldicl(x_xstart, x_xstart, 32, 0);
4355
#endif
4356
4357
4358
Label L_third_loop_prologue;
4359
4360
bind(L_third_loop_prologue);
4361
4362
Register xsave = tmp11;
4363
Register xlensave = tmp12;
4364
Register ylensave = tmp13;
4365
4366
mr(xsave, x);
4367
mr(xlensave, xstart);
4368
mr(ylensave, ylen);
4369
4370
4371
multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4372
carry, product_high, product, x, tmp);
4373
4374
mr(z, zsave);
4375
mr(x, xsave);
4376
mr(xlen, xlensave); // This is the decrement of the loop counter!
4377
mr(ylen, ylensave);
4378
4379
addi(tmp3, xlen, 1);
4380
sldi(tmp, tmp3, LogBytesPerInt);
4381
stwx(carry, z, tmp);
4382
addic_(tmp3, tmp3, -1);
4383
blt(CCR0, L_done);
4384
4385
srdi(carry, carry, 32);
4386
sldi(tmp, tmp3, LogBytesPerInt);
4387
stwx(carry, z, tmp);
4388
b(L_second_loop);
4389
4390
// Next infrequent code is moved outside loops.
4391
bind(L_last_x);
4392
4393
lwz(x_xstart, 0, x);
4394
b(L_third_loop_prologue);
4395
4396
bind(L_done);
4397
} // multiply_to_len
4398
4399
void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4400
#ifdef ASSERT
4401
Label ok;
4402
if (check_equal) {
4403
beq(CCR0, ok);
4404
} else {
4405
bne(CCR0, ok);
4406
}
4407
stop(msg);
4408
bind(ok);
4409
#endif
4410
}
4411
4412
void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4413
Register mem_base, const char* msg) {
4414
#ifdef ASSERT
4415
switch (size) {
4416
case 4:
4417
lwz(R0, mem_offset, mem_base);
4418
cmpwi(CCR0, R0, 0);
4419
break;
4420
case 8:
4421
ld(R0, mem_offset, mem_base);
4422
cmpdi(CCR0, R0, 0);
4423
break;
4424
default:
4425
ShouldNotReachHere();
4426
}
4427
asm_assert(check_equal, msg);
4428
#endif // ASSERT
4429
}
4430
4431
void MacroAssembler::verify_thread() {
4432
if (VerifyThread) {
4433
unimplemented("'VerifyThread' currently not implemented on PPC");
4434
}
4435
}
4436
4437
void MacroAssembler::verify_coop(Register coop, const char* msg) {
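// Verify a (possibly compressed) oop: decode if necessary, verify, then re-encode.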
4438
if (!VerifyOops) { return; }
4439
if (UseCompressedOops) { decode_heap_oop(coop); }
4440
verify_oop(coop, msg);
4441
if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4442
}
4443
4444
// READ: oop. KILL: R0. Volatile floats perhaps.
4445
void MacroAssembler::verify_oop(Register oop, const char* msg) {
4446
if (!VerifyOops) {
4447
return;
4448
}
4449
4450
address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4451
const Register tmp = R11; // Will be preserved.
4452
const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4453
4454
BLOCK_COMMENT("verify_oop {");
4455
4456
save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4457
4458
mr_if_needed(R4_ARG2, oop);
4459
save_LR_CR(tmp); // save in old frame
4460
push_frame_reg_args(nbytes_save, tmp);
4461
// load FunctionDescriptor** / entry_address *
4462
load_const_optimized(tmp, fd, R0);
4463
// load FunctionDescriptor* / entry_address
4464
ld(tmp, 0, tmp);
4465
load_const_optimized(R3_ARG1, (address)msg, R0);
4466
// Call destination for its side effect.
4467
call_c(tmp);
4468
4469
pop_frame();
4470
restore_LR_CR(tmp);
4471
restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4472
4473
BLOCK_COMMENT("} verify_oop");
4474
}
4475
4476
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4477
if (!VerifyOops) {
4478
return;
4479
}
4480
4481
address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4482
const Register tmp = R11; // Will be preserved.
4483
const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4484
save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4485
4486
ld(R4_ARG2, offs, base);
4487
save_LR_CR(tmp); // save in old frame
4488
push_frame_reg_args(nbytes_save, tmp);
4489
// load FunctionDescriptor** / entry_address *
4490
load_const_optimized(tmp, fd, R0);
4491
// load FunctionDescriptor* / entry_address
4492
ld(tmp, 0, tmp);
4493
load_const_optimized(R3_ARG1, (address)msg, R0);
4494
// Call destination for its side effect.
4495
call_c(tmp);
4496
4497
pop_frame();
4498
restore_LR_CR(tmp);
4499
restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4500
}
4501
4502
// Call a C-function that prints output.
4503
void MacroAssembler::stop(int type, const char* msg) {
4504
bool msg_present = (msg != NULL);
4505
4506
#ifndef PRODUCT
4507
block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4508
#else
4509
block_comment("stop {");
4510
#endif
4511
4512
if (msg_present) {
4513
type |= stop_msg_present;
4514
}
4515
tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
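// The trap is unconditional; if a message is present its address is emitted right behind
// the trap instruction (below) so it can be recovered when the trap is handled.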
4516
if (msg_present) {
4517
emit_int64((uintptr_t)msg);
4518
}
4519
4520
block_comment("} stop;");
4521
}
4522
4523
#ifndef PRODUCT
4524
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4525
// Val, addr are temp registers.
4526
// If low == addr, addr is killed.
4527
// High is preserved.
4528
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4529
if (!ZapMemory) return;
4530
4531
assert_different_registers(low, val);
4532
4533
BLOCK_COMMENT("zap memory region {");
4534
load_const_optimized(val, 0x0101010101010101);
4535
int size = before + after;
4536
if (low == high && size < 5 && size > 0) {
4537
int offset = -before*BytesPerWord;
4538
for (int i = 0; i < size; ++i) {
4539
std(val, offset, low);
4540
offset += (1*BytesPerWord);
4541
}
4542
} else {
4543
addi(addr, low, -before*BytesPerWord);
4544
assert_different_registers(high, val);
4545
if (after) addi(high, high, after * BytesPerWord);
4546
Label loop;
4547
bind(loop);
4548
std(val, 0, addr);
4549
addi(addr, addr, 8);
4550
cmpd(CCR6, addr, high);
4551
ble(CCR6, loop);
4552
if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4553
}
4554
BLOCK_COMMENT("} zap memory region");
4555
}
4556
4557
#endif // !PRODUCT
4558
4559
void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4560
const bool* flag_addr, Label& label) {
4561
int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4562
assert(sizeof(bool) == 1, "PowerPC ABI");
4563
masm->lbz(temp, simm16_offset, temp);
4564
masm->cmpwi(CCR0, temp, 0);
4565
masm->beq(CCR0, label);
4566
}
4567
4568
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4569
skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4570
}
4571
4572
SkipIfEqualZero::~SkipIfEqualZero() {
4573
_masm->bind(_label);
4574
}
4575
4576
void MacroAssembler::cache_wb(Address line) {
4577
assert(line.index() == noreg, "index should be noreg");
4578
assert(line.disp() == 0, "displacement should be 0");
4579
assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4580
// Data Cache Store, not really a flush, so it works like a sync of cache
4581
// line and persistent mem, i.e. copying the cache line to persistent whilst
4582
// not invalidating the cache line.
4583
dcbst(line.base());
4584
}
4585
4586
void MacroAssembler::cache_wbsync(bool is_presync) {
4587
assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4588
// We only need a post sync barrier. Post means _after_ a cache line flush or
4589
// store instruction, pre means a barrier emitted before such an instruction.
4590
if (!is_presync) {
4591
fence();
4592
}
4593
}
4594
4595