Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
40930 views
1
/*
2
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
3
* Copyright (c) 2012, 2021 SAP SE. All rights reserved.
4
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
*
6
* This code is free software; you can redistribute it and/or modify it
7
* under the terms of the GNU General Public License version 2 only, as
8
* published by the Free Software Foundation.
9
*
10
* This code is distributed in the hope that it will be useful, but WITHOUT
11
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
* version 2 for more details (a copy is included in the LICENSE file that
14
* accompanied this code).
15
*
16
* You should have received a copy of the GNU General Public License version
17
* 2 along with this work; if not, write to the Free Software Foundation,
18
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
*
20
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
* or visit www.oracle.com if you need additional information or have any
22
* questions.
23
*
24
*/
25
26
#include "precompiled.hpp"
27
#include "asm/macroAssembler.inline.hpp"
28
#include "compiler/disassembler.hpp"
29
#include "gc/shared/collectedHeap.inline.hpp"
30
#include "gc/shared/barrierSet.hpp"
31
#include "gc/shared/barrierSetAssembler.hpp"
32
#include "interpreter/interpreter.hpp"
33
#include "memory/resourceArea.hpp"
34
#include "nativeInst_ppc.hpp"
35
#include "oops/klass.inline.hpp"
36
#include "oops/methodData.hpp"
37
#include "prims/methodHandles.hpp"
38
#include "runtime/biasedLocking.hpp"
39
#include "runtime/icache.hpp"
40
#include "runtime/interfaceSupport.inline.hpp"
41
#include "runtime/objectMonitor.hpp"
42
#include "runtime/os.hpp"
43
#include "runtime/safepoint.hpp"
44
#include "runtime/safepointMechanism.hpp"
45
#include "runtime/sharedRuntime.hpp"
46
#include "runtime/stubRoutines.hpp"
47
#include "runtime/vm_version.hpp"
48
#include "utilities/macros.hpp"
49
#include "utilities/powerOfTwo.hpp"
50
51
#ifdef PRODUCT
52
#define BLOCK_COMMENT(str) // nothing
53
#else
54
#define BLOCK_COMMENT(str) block_comment(str)
55
#endif
56
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58
#ifdef ASSERT
59
// On RISC, there's no benefit to verifying instruction boundaries.
60
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
61
#endif
62
63
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
64
assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
65
if (Assembler::is_simm(si31, 16)) {
66
ld(d, si31, a);
67
if (emit_filler_nop) nop();
68
} else {
69
const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
70
const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
71
addis(d, a, hi);
72
ld(d, lo, d);
73
}
74
}
75
76
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
77
assert_different_registers(d, a);
78
ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
79
}
80
81
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
82
size_t size_in_bytes, bool is_signed) {
83
switch (size_in_bytes) {
84
case 8: ld(dst, offs, base); break;
85
case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
86
case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
87
case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
88
default: ShouldNotReachHere();
89
}
90
}
91
92
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
93
size_t size_in_bytes) {
94
switch (size_in_bytes) {
95
case 8: std(dst, offs, base); break;
96
case 4: stw(dst, offs, base); break;
97
case 2: sth(dst, offs, base); break;
98
case 1: stb(dst, offs, base); break;
99
default: ShouldNotReachHere();
100
}
101
}
102
103
void MacroAssembler::align(int modulus, int max, int rem) {
104
int padding = (rem + modulus - (offset() % modulus)) % modulus;
105
if (padding > max) return;
106
for (int c = (padding >> 2); c > 0; --c) { nop(); }
107
}
108
109
// Issue instructions that calculate given TOC from global TOC.
110
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
111
bool add_relocation, bool emit_dummy_addr) {
112
int offset = -1;
113
if (emit_dummy_addr) {
114
offset = -128; // dummy address
115
} else if (addr != (address)(intptr_t)-1) {
116
offset = MacroAssembler::offset_to_global_toc(addr);
117
}
118
119
if (hi16) {
120
addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
121
}
122
if (lo16) {
123
if (add_relocation) {
124
// Relocate at the addi to avoid confusion with a load from the method's TOC.
125
relocate(internal_word_Relocation::spec(addr));
126
}
127
addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
128
}
129
}
130
131
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
132
const int offset = MacroAssembler::offset_to_global_toc(addr);
133
134
const address inst2_addr = a;
135
const int inst2 = *(int *)inst2_addr;
136
137
// The relocation points to the second instruction, the addi,
138
// and the addi reads and writes the same register dst.
139
const int dst = inv_rt_field(inst2);
140
assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
141
142
// Now, find the preceding addis which writes to dst.
143
int inst1 = 0;
144
address inst1_addr = inst2_addr - BytesPerInstWord;
145
while (inst1_addr >= bound) {
146
inst1 = *(int *) inst1_addr;
147
if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
148
// Stop, found the addis which writes dst.
149
break;
150
}
151
inst1_addr -= BytesPerInstWord;
152
}
153
154
assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
155
set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
156
set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
157
return inst1_addr;
158
}
159
160
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
161
const address inst2_addr = a;
162
const int inst2 = *(int *)inst2_addr;
163
164
// The relocation points to the second instruction, the addi,
165
// and the addi reads and writes the same register dst.
166
const int dst = inv_rt_field(inst2);
167
assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
168
169
// Now, find the preceding addis which writes to dst.
170
int inst1 = 0;
171
address inst1_addr = inst2_addr - BytesPerInstWord;
172
while (inst1_addr >= bound) {
173
inst1 = *(int *) inst1_addr;
174
if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
175
// stop, found the addis which writes dst
176
break;
177
}
178
inst1_addr -= BytesPerInstWord;
179
}
180
181
assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
182
183
int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
184
// -1 is a special case
185
if (offset == -1) {
186
return (address)(intptr_t)-1;
187
} else {
188
return global_toc() + offset;
189
}
190
}
191
192
#ifdef _LP64
193
// Patch compressed oops or klass constants.
194
// Assembler sequence is
195
// 1) compressed oops:
196
// lis rx = const.hi
197
// ori rx = rx | const.lo
198
// 2) compressed klass:
199
// lis rx = const.hi
200
// clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
201
// ori rx = rx | const.lo
202
// Clrldi will be passed by.
203
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
204
assert(UseCompressedOops, "Should only patch compressed oops");
205
206
const address inst2_addr = a;
207
const int inst2 = *(int *)inst2_addr;
208
209
// The relocation points to the second instruction, the ori,
210
// and the ori reads and writes the same register dst.
211
const int dst = inv_rta_field(inst2);
212
assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
213
// Now, find the preceding addis which writes to dst.
214
int inst1 = 0;
215
address inst1_addr = inst2_addr - BytesPerInstWord;
216
bool inst1_found = false;
217
while (inst1_addr >= bound) {
218
inst1 = *(int *)inst1_addr;
219
if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
220
inst1_addr -= BytesPerInstWord;
221
}
222
assert(inst1_found, "inst is not lis");
223
224
uint32_t data_value = CompressedOops::narrow_oop_value(data);
225
int xc = (data_value >> 16) & 0xffff;
226
int xd = (data_value >> 0) & 0xffff;
227
228
set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
229
set_imm((int *)inst2_addr, (xd)); // unsigned int
230
return inst1_addr;
231
}
232
233
// Get compressed oop or klass constant.
234
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
235
assert(UseCompressedOops, "Should only patch compressed oops");
236
237
const address inst2_addr = a;
238
const int inst2 = *(int *)inst2_addr;
239
240
// The relocation points to the second instruction, the ori,
241
// and the ori reads and writes the same register dst.
242
const int dst = inv_rta_field(inst2);
243
assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
244
// Now, find the preceding lis which writes to dst.
245
int inst1 = 0;
246
address inst1_addr = inst2_addr - BytesPerInstWord;
247
bool inst1_found = false;
248
249
while (inst1_addr >= bound) {
250
inst1 = *(int *) inst1_addr;
251
if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
252
inst1_addr -= BytesPerInstWord;
253
}
254
assert(inst1_found, "inst is not lis");
255
256
uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
257
uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
258
259
return CompressedOops::narrow_oop_cast(xl | xh);
260
}
261
#endif // _LP64
262
263
// Returns true if successful.
264
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
265
Register toc, bool fixed_size) {
266
int toc_offset = 0;
267
// Use RelocationHolder::none for the constant pool entry, otherwise
268
// we will end up with a failing NativeCall::verify(x) where x is
269
// the address of the constant pool entry.
270
// FIXME: We should insert relocation information for oops at the constant
271
// pool entries instead of inserting it at the loads; patching of a constant
272
// pool entry should be less expensive.
273
address const_address = address_constant((address)a.value(), RelocationHolder::none);
274
if (const_address == NULL) { return false; } // allocation failure
275
// Relocate at the pc of the load.
276
relocate(a.rspec());
277
toc_offset = (int)(const_address - code()->consts()->start());
278
ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
279
return true;
280
}
281
282
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
283
const address inst1_addr = a;
284
const int inst1 = *(int *)inst1_addr;
285
286
// The relocation points to the ld or the addis.
287
return (is_ld(inst1)) ||
288
(is_addis(inst1) && inv_ra_field(inst1) != 0);
289
}
290
291
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
292
assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
293
294
const address inst1_addr = a;
295
const int inst1 = *(int *)inst1_addr;
296
297
if (is_ld(inst1)) {
298
return inv_d1_field(inst1);
299
} else if (is_addis(inst1)) {
300
const int dst = inv_rt_field(inst1);
301
302
// Now, find the succeeding ld which reads and writes to dst.
303
address inst2_addr = inst1_addr + BytesPerInstWord;
304
int inst2 = 0;
305
while (true) {
306
inst2 = *(int *) inst2_addr;
307
if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
308
// Stop, found the ld which reads and writes dst.
309
break;
310
}
311
inst2_addr += BytesPerInstWord;
312
}
313
return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
314
}
315
ShouldNotReachHere();
316
return 0;
317
}
318
319
// Get the constant from a `load_const' sequence.
320
long MacroAssembler::get_const(address a) {
321
assert(is_load_const_at(a), "not a load of a constant");
322
const int *p = (const int*) a;
323
unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
324
if (is_ori(*(p+1))) {
325
x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
326
x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
327
x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
328
} else if (is_lis(*(p+1))) {
329
x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
330
x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
331
x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
332
} else {
333
ShouldNotReachHere();
334
return (long) 0;
335
}
336
return (long) x;
337
}
338
339
// Patch the 64 bit constant of a `load_const' sequence. This is a low
340
// level procedure. It neither flushes the instruction cache nor is it
341
// mt safe.
342
void MacroAssembler::patch_const(address a, long x) {
343
assert(is_load_const_at(a), "not a load of a constant");
344
int *p = (int*) a;
345
if (is_ori(*(p+1))) {
346
set_imm(0 + p, (x >> 48) & 0xffff);
347
set_imm(1 + p, (x >> 32) & 0xffff);
348
set_imm(3 + p, (x >> 16) & 0xffff);
349
set_imm(4 + p, x & 0xffff);
350
} else if (is_lis(*(p+1))) {
351
set_imm(0 + p, (x >> 48) & 0xffff);
352
set_imm(2 + p, (x >> 32) & 0xffff);
353
set_imm(1 + p, (x >> 16) & 0xffff);
354
set_imm(3 + p, x & 0xffff);
355
} else {
356
ShouldNotReachHere();
357
}
358
}
359
360
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
361
assert(oop_recorder() != NULL, "this assembler needs a Recorder");
362
int index = oop_recorder()->allocate_metadata_index(obj);
363
RelocationHolder rspec = metadata_Relocation::spec(index);
364
return AddressLiteral((address)obj, rspec);
365
}
366
367
AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
368
assert(oop_recorder() != NULL, "this assembler needs a Recorder");
369
int index = oop_recorder()->find_index(obj);
370
RelocationHolder rspec = metadata_Relocation::spec(index);
371
return AddressLiteral((address)obj, rspec);
372
}
373
374
AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
375
assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
376
int oop_index = oop_recorder()->allocate_oop_index(obj);
377
return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
378
}
379
380
AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
381
assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
382
int oop_index = oop_recorder()->find_index(obj);
383
return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
384
}
385
386
#ifndef PRODUCT
387
void MacroAssembler::pd_print_patched_instruction(address branch) {
388
Unimplemented(); // TODO: PPC port
389
}
390
#endif // ndef PRODUCT
391
392
// Conditional far branch for destinations encodable in 24+2 bits.
393
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
394
395
// If requested by flag optimize, relocate the bc_far as a
396
// runtime_call and prepare for optimizing it when the code gets
397
// relocated.
398
if (optimize == bc_far_optimize_on_relocate) {
399
relocate(relocInfo::runtime_call_type);
400
}
401
402
// variant 2:
403
//
404
// b!cxx SKIP
405
// bxx DEST
406
// SKIP:
407
//
408
409
const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
410
opposite_bcond(inv_boint_bcond(boint)));
411
412
// We emit two branches.
413
// First, a conditional branch which jumps around the far branch.
414
const address not_taken_pc = pc() + 2 * BytesPerInstWord;
415
const address bc_pc = pc();
416
bc(opposite_boint, biint, not_taken_pc);
417
418
const int bc_instr = *(int*)bc_pc;
419
assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
420
assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
421
assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
422
opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
423
"postcondition");
424
assert(biint == inv_bi_field(bc_instr), "postcondition");
425
426
// Second, an unconditional far branch which jumps to dest.
427
// Note: target(dest) remembers the current pc (see CodeSection::target)
428
// and returns the current pc if the label is not bound yet; when
429
// the label gets bound, the unconditional far branch will be patched.
430
const address target_pc = target(dest);
431
const address b_pc = pc();
432
b(target_pc);
433
434
assert(not_taken_pc == pc(), "postcondition");
435
assert(dest.is_bound() || target_pc == b_pc, "postcondition");
436
}
437
438
// 1 or 2 instructions
439
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
440
if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
441
bc(boint, biint, dest);
442
} else {
443
bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
444
}
445
}
446
447
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
448
return is_bc_far_variant1_at(instruction_addr) ||
449
is_bc_far_variant2_at(instruction_addr) ||
450
is_bc_far_variant3_at(instruction_addr);
451
}
452
453
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
454
if (is_bc_far_variant1_at(instruction_addr)) {
455
const address instruction_1_addr = instruction_addr;
456
const int instruction_1 = *(int*)instruction_1_addr;
457
return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
458
} else if (is_bc_far_variant2_at(instruction_addr)) {
459
const address instruction_2_addr = instruction_addr + 4;
460
return bxx_destination(instruction_2_addr);
461
} else if (is_bc_far_variant3_at(instruction_addr)) {
462
return instruction_addr + 8;
463
}
464
// variant 4 ???
465
ShouldNotReachHere();
466
return NULL;
467
}
468
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
469
470
if (is_bc_far_variant3_at(instruction_addr)) {
471
// variant 3, far cond branch to the next instruction, already patched to nops:
472
//
473
// nop
474
// endgroup
475
// SKIP/DEST:
476
//
477
return;
478
}
479
480
// first, extract boint and biint from the current branch
481
int boint = 0;
482
int biint = 0;
483
484
ResourceMark rm;
485
const int code_size = 2 * BytesPerInstWord;
486
CodeBuffer buf(instruction_addr, code_size);
487
MacroAssembler masm(&buf);
488
if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
489
// Far branch to next instruction: Optimize it by patching nops (produce variant 3).
490
masm.nop();
491
masm.endgroup();
492
} else {
493
if (is_bc_far_variant1_at(instruction_addr)) {
494
// variant 1, the 1st instruction contains the destination address:
495
//
496
// bcxx DEST
497
// nop
498
//
499
const int instruction_1 = *(int*)(instruction_addr);
500
boint = inv_bo_field(instruction_1);
501
biint = inv_bi_field(instruction_1);
502
} else if (is_bc_far_variant2_at(instruction_addr)) {
503
// variant 2, the 2nd instruction contains the destination address:
504
//
505
// b!cxx SKIP
506
// bxx DEST
507
// SKIP:
508
//
509
const int instruction_1 = *(int*)(instruction_addr);
510
boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
511
opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
512
biint = inv_bi_field(instruction_1);
513
} else {
514
// variant 4???
515
ShouldNotReachHere();
516
}
517
518
// second, set the new branch destination and optimize the code
519
if (dest != instruction_addr + 4 && // the bc_far is still unbound!
520
masm.is_within_range_of_bcxx(dest, instruction_addr)) {
521
// variant 1:
522
//
523
// bcxx DEST
524
// nop
525
//
526
masm.bc(boint, biint, dest);
527
masm.nop();
528
} else {
529
// variant 2:
530
//
531
// b!cxx SKIP
532
// bxx DEST
533
// SKIP:
534
//
535
const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
536
opposite_bcond(inv_boint_bcond(boint)));
537
const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
538
masm.bc(opposite_boint, biint, not_taken_pc);
539
masm.b(dest);
540
}
541
}
542
ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
543
}
544
545
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
546
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
547
// get current pc
548
uint64_t start_pc = (uint64_t) pc();
549
550
const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
551
const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
552
553
// relocate here
554
if (rt != relocInfo::none) {
555
relocate(rt);
556
}
557
558
if ( ReoptimizeCallSequences &&
559
(( link && is_within_range_of_b(dest, pc_of_bl)) ||
560
(!link && is_within_range_of_b(dest, pc_of_b)))) {
561
// variant 2:
562
// Emit an optimized, pc-relative call/jump.
563
564
if (link) {
565
// some padding
566
nop();
567
nop();
568
nop();
569
nop();
570
nop();
571
nop();
572
573
// do the call
574
assert(pc() == pc_of_bl, "just checking");
575
bl(dest, relocInfo::none);
576
} else {
577
// do the jump
578
assert(pc() == pc_of_b, "just checking");
579
b(dest, relocInfo::none);
580
581
// some padding
582
nop();
583
nop();
584
nop();
585
nop();
586
nop();
587
nop();
588
}
589
590
// Assert that we can identify the emitted call/jump.
591
assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
592
"can't identify emitted call");
593
} else {
594
// variant 1:
595
mr(R0, R11); // spill R11 -> R0.
596
597
// Load the destination address into CTR,
598
// calculate destination relative to global toc.
599
calculate_address_from_global_toc(R11, dest, true, true, false);
600
601
mtctr(R11);
602
mr(R11, R0); // spill R11 <- R0.
603
nop();
604
605
// do the call/jump
606
if (link) {
607
bctrl();
608
} else{
609
bctr();
610
}
611
// Assert that we can identify the emitted call/jump.
612
assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
613
"can't identify emitted call");
614
}
615
616
// Assert that we can identify the emitted call/jump.
617
assert(is_bxx64_patchable_at((address)start_pc, link),
618
"can't identify emitted call");
619
assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
620
"wrong encoding of dest address");
621
}
622
623
// Identify a bxx64_patchable instruction.
624
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
625
return is_bxx64_patchable_variant1b_at(instruction_addr, link)
626
//|| is_bxx64_patchable_variant1_at(instruction_addr, link)
627
|| is_bxx64_patchable_variant2_at(instruction_addr, link);
628
}
629
630
// Does the call64_patchable instruction use a pc-relative encoding of
631
// the call destination?
632
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
633
// variant 2 is pc-relative
634
return is_bxx64_patchable_variant2_at(instruction_addr, link);
635
}
636
637
// Identify variant 1.
638
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
639
unsigned int* instr = (unsigned int*) instruction_addr;
640
return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
641
&& is_mtctr(instr[5]) // mtctr
642
&& is_load_const_at(instruction_addr);
643
}
644
645
// Identify variant 1b: load destination relative to global toc.
646
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
647
unsigned int* instr = (unsigned int*) instruction_addr;
648
return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
649
&& is_mtctr(instr[3]) // mtctr
650
&& is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
651
}
652
653
// Identify variant 2.
654
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
655
unsigned int* instr = (unsigned int*) instruction_addr;
656
if (link) {
657
return is_bl (instr[6]) // bl dest is last
658
&& is_nop(instr[0]) // nop
659
&& is_nop(instr[1]) // nop
660
&& is_nop(instr[2]) // nop
661
&& is_nop(instr[3]) // nop
662
&& is_nop(instr[4]) // nop
663
&& is_nop(instr[5]); // nop
664
} else {
665
return is_b (instr[0]) // b dest is first
666
&& is_nop(instr[1]) // nop
667
&& is_nop(instr[2]) // nop
668
&& is_nop(instr[3]) // nop
669
&& is_nop(instr[4]) // nop
670
&& is_nop(instr[5]) // nop
671
&& is_nop(instr[6]); // nop
672
}
673
}
674
675
// Set dest address of a bxx64_patchable instruction.
676
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
677
ResourceMark rm;
678
int code_size = MacroAssembler::bxx64_patchable_size;
679
CodeBuffer buf(instruction_addr, code_size);
680
MacroAssembler masm(&buf);
681
masm.bxx64_patchable(dest, relocInfo::none, link);
682
ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
683
}
684
685
// Get dest address of a bxx64_patchable instruction.
686
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
687
if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
688
return (address) (unsigned long) get_const(instruction_addr);
689
} else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
690
unsigned int* instr = (unsigned int*) instruction_addr;
691
if (link) {
692
const int instr_idx = 6; // bl is last
693
int branchoffset = branch_destination(instr[instr_idx], 0);
694
return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
695
} else {
696
const int instr_idx = 0; // b is first
697
int branchoffset = branch_destination(instr[instr_idx], 0);
698
return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
699
}
700
// Load dest relative to global toc.
701
} else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
702
return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
703
instruction_addr);
704
} else {
705
ShouldNotReachHere();
706
return NULL;
707
}
708
}
709
710
void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
711
const int magic_number = 0x42;
712
713
// Preserve stack pointer register (R1_SP) and system thread id register (R13);
714
// although they're technically volatile
715
for (int i = 2; i < 13; i++) {
716
Register reg = as_Register(i);
717
if (reg == excluded_register) {
718
continue;
719
}
720
721
li(reg, magic_number);
722
}
723
}
724
725
void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
726
const int magic_number = 0x43;
727
728
li(tmp, magic_number);
729
for (int m = 0; m <= 7; m++) {
730
std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
731
}
732
}
733
734
// Uses ordering which corresponds to ABI:
735
// _savegpr0_14: std r14,-144(r1)
736
// _savegpr0_15: std r15,-136(r1)
737
// _savegpr0_16: std r16,-128(r1)
738
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
739
std(R14, offset, dst); offset += 8;
740
std(R15, offset, dst); offset += 8;
741
std(R16, offset, dst); offset += 8;
742
std(R17, offset, dst); offset += 8;
743
std(R18, offset, dst); offset += 8;
744
std(R19, offset, dst); offset += 8;
745
std(R20, offset, dst); offset += 8;
746
std(R21, offset, dst); offset += 8;
747
std(R22, offset, dst); offset += 8;
748
std(R23, offset, dst); offset += 8;
749
std(R24, offset, dst); offset += 8;
750
std(R25, offset, dst); offset += 8;
751
std(R26, offset, dst); offset += 8;
752
std(R27, offset, dst); offset += 8;
753
std(R28, offset, dst); offset += 8;
754
std(R29, offset, dst); offset += 8;
755
std(R30, offset, dst); offset += 8;
756
std(R31, offset, dst); offset += 8;
757
758
stfd(F14, offset, dst); offset += 8;
759
stfd(F15, offset, dst); offset += 8;
760
stfd(F16, offset, dst); offset += 8;
761
stfd(F17, offset, dst); offset += 8;
762
stfd(F18, offset, dst); offset += 8;
763
stfd(F19, offset, dst); offset += 8;
764
stfd(F20, offset, dst); offset += 8;
765
stfd(F21, offset, dst); offset += 8;
766
stfd(F22, offset, dst); offset += 8;
767
stfd(F23, offset, dst); offset += 8;
768
stfd(F24, offset, dst); offset += 8;
769
stfd(F25, offset, dst); offset += 8;
770
stfd(F26, offset, dst); offset += 8;
771
stfd(F27, offset, dst); offset += 8;
772
stfd(F28, offset, dst); offset += 8;
773
stfd(F29, offset, dst); offset += 8;
774
stfd(F30, offset, dst); offset += 8;
775
stfd(F31, offset, dst);
776
}
777
778
// Uses ordering which corresponds to ABI:
779
// _restgpr0_14: ld r14,-144(r1)
780
// _restgpr0_15: ld r15,-136(r1)
781
// _restgpr0_16: ld r16,-128(r1)
782
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
783
ld(R14, offset, src); offset += 8;
784
ld(R15, offset, src); offset += 8;
785
ld(R16, offset, src); offset += 8;
786
ld(R17, offset, src); offset += 8;
787
ld(R18, offset, src); offset += 8;
788
ld(R19, offset, src); offset += 8;
789
ld(R20, offset, src); offset += 8;
790
ld(R21, offset, src); offset += 8;
791
ld(R22, offset, src); offset += 8;
792
ld(R23, offset, src); offset += 8;
793
ld(R24, offset, src); offset += 8;
794
ld(R25, offset, src); offset += 8;
795
ld(R26, offset, src); offset += 8;
796
ld(R27, offset, src); offset += 8;
797
ld(R28, offset, src); offset += 8;
798
ld(R29, offset, src); offset += 8;
799
ld(R30, offset, src); offset += 8;
800
ld(R31, offset, src); offset += 8;
801
802
// FP registers
803
lfd(F14, offset, src); offset += 8;
804
lfd(F15, offset, src); offset += 8;
805
lfd(F16, offset, src); offset += 8;
806
lfd(F17, offset, src); offset += 8;
807
lfd(F18, offset, src); offset += 8;
808
lfd(F19, offset, src); offset += 8;
809
lfd(F20, offset, src); offset += 8;
810
lfd(F21, offset, src); offset += 8;
811
lfd(F22, offset, src); offset += 8;
812
lfd(F23, offset, src); offset += 8;
813
lfd(F24, offset, src); offset += 8;
814
lfd(F25, offset, src); offset += 8;
815
lfd(F26, offset, src); offset += 8;
816
lfd(F27, offset, src); offset += 8;
817
lfd(F28, offset, src); offset += 8;
818
lfd(F29, offset, src); offset += 8;
819
lfd(F30, offset, src); offset += 8;
820
lfd(F31, offset, src);
821
}
822
823
// For verify_oops.
824
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
825
std(R2, offset, dst); offset += 8;
826
if (include_R3_RET_reg) {
827
std(R3, offset, dst); offset += 8;
828
}
829
std(R4, offset, dst); offset += 8;
830
std(R5, offset, dst); offset += 8;
831
std(R6, offset, dst); offset += 8;
832
std(R7, offset, dst); offset += 8;
833
std(R8, offset, dst); offset += 8;
834
std(R9, offset, dst); offset += 8;
835
std(R10, offset, dst); offset += 8;
836
std(R11, offset, dst); offset += 8;
837
std(R12, offset, dst); offset += 8;
838
839
if (include_fp_regs) {
840
stfd(F0, offset, dst); offset += 8;
841
stfd(F1, offset, dst); offset += 8;
842
stfd(F2, offset, dst); offset += 8;
843
stfd(F3, offset, dst); offset += 8;
844
stfd(F4, offset, dst); offset += 8;
845
stfd(F5, offset, dst); offset += 8;
846
stfd(F6, offset, dst); offset += 8;
847
stfd(F7, offset, dst); offset += 8;
848
stfd(F8, offset, dst); offset += 8;
849
stfd(F9, offset, dst); offset += 8;
850
stfd(F10, offset, dst); offset += 8;
851
stfd(F11, offset, dst); offset += 8;
852
stfd(F12, offset, dst); offset += 8;
853
stfd(F13, offset, dst);
854
}
855
}
856
857
// For verify_oops.
858
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
859
ld(R2, offset, src); offset += 8;
860
if (include_R3_RET_reg) {
861
ld(R3, offset, src); offset += 8;
862
}
863
ld(R4, offset, src); offset += 8;
864
ld(R5, offset, src); offset += 8;
865
ld(R6, offset, src); offset += 8;
866
ld(R7, offset, src); offset += 8;
867
ld(R8, offset, src); offset += 8;
868
ld(R9, offset, src); offset += 8;
869
ld(R10, offset, src); offset += 8;
870
ld(R11, offset, src); offset += 8;
871
ld(R12, offset, src); offset += 8;
872
873
if (include_fp_regs) {
874
lfd(F0, offset, src); offset += 8;
875
lfd(F1, offset, src); offset += 8;
876
lfd(F2, offset, src); offset += 8;
877
lfd(F3, offset, src); offset += 8;
878
lfd(F4, offset, src); offset += 8;
879
lfd(F5, offset, src); offset += 8;
880
lfd(F6, offset, src); offset += 8;
881
lfd(F7, offset, src); offset += 8;
882
lfd(F8, offset, src); offset += 8;
883
lfd(F9, offset, src); offset += 8;
884
lfd(F10, offset, src); offset += 8;
885
lfd(F11, offset, src); offset += 8;
886
lfd(F12, offset, src); offset += 8;
887
lfd(F13, offset, src);
888
}
889
}
890
891
void MacroAssembler::save_LR_CR(Register tmp) {
892
mfcr(tmp);
893
std(tmp, _abi0(cr), R1_SP);
894
mflr(tmp);
895
std(tmp, _abi0(lr), R1_SP);
896
// Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
897
}
898
899
void MacroAssembler::restore_LR_CR(Register tmp) {
900
assert(tmp != R1_SP, "must be distinct");
901
ld(tmp, _abi0(lr), R1_SP);
902
mtlr(tmp);
903
ld(tmp, _abi0(cr), R1_SP);
904
mtcr(tmp);
905
}
906
907
address MacroAssembler::get_PC_trash_LR(Register result) {
908
Label L;
909
bl(L);
910
bind(L);
911
address lr_pc = pc();
912
mflr(result);
913
return lr_pc;
914
}
915
916
void MacroAssembler::resize_frame(Register offset, Register tmp) {
917
#ifdef ASSERT
918
assert_different_registers(offset, tmp, R1_SP);
919
andi_(tmp, offset, frame::alignment_in_bytes-1);
920
asm_assert_eq("resize_frame: unaligned");
921
#endif
922
923
// tmp <- *(SP)
924
ld(tmp, _abi0(callers_sp), R1_SP);
925
// addr <- SP + offset;
926
// *(addr) <- tmp;
927
// SP <- addr
928
stdux(tmp, R1_SP, offset);
929
}
930
931
void MacroAssembler::resize_frame(int offset, Register tmp) {
932
assert(is_simm(offset, 16), "too big an offset");
933
assert_different_registers(tmp, R1_SP);
934
assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
935
// tmp <- *(SP)
936
ld(tmp, _abi0(callers_sp), R1_SP);
937
// addr <- SP + offset;
938
// *(addr) <- tmp;
939
// SP <- addr
940
stdu(tmp, offset, R1_SP);
941
}
942
943
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
944
// (addr == tmp1) || (addr == tmp2) is allowed here!
945
assert(tmp1 != tmp2, "must be distinct");
946
947
// compute offset w.r.t. current stack pointer
948
// tmp_1 <- addr - SP (!)
949
subf(tmp1, R1_SP, addr);
950
951
// atomically update SP keeping back link.
952
resize_frame(tmp1/* offset */, tmp2/* tmp */);
953
}
954
955
void MacroAssembler::push_frame(Register bytes, Register tmp) {
956
#ifdef ASSERT
957
assert(bytes != R0, "r0 not allowed here");
958
andi_(R0, bytes, frame::alignment_in_bytes-1);
959
asm_assert_eq("push_frame(Reg, Reg): unaligned");
960
#endif
961
neg(tmp, bytes);
962
stdux(R1_SP, R1_SP, tmp);
963
}
964
965
// Push a frame of size `bytes'.
966
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
967
long offset = align_addr(bytes, frame::alignment_in_bytes);
968
if (is_simm(-offset, 16)) {
969
stdu(R1_SP, -offset, R1_SP);
970
} else {
971
load_const_optimized(tmp, -offset);
972
stdux(R1_SP, R1_SP, tmp);
973
}
974
}
975
976
// Push a frame of size `bytes' plus abi_reg_args on top.
977
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
978
push_frame(bytes + frame::abi_reg_args_size, tmp);
979
}
980
981
// Setup up a new C frame with a spill area for non-volatile GPRs and
982
// additional space for local variables.
983
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
984
Register tmp) {
985
push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
986
}
987
988
// Pop current C frame.
989
void MacroAssembler::pop_frame() {
990
ld(R1_SP, _abi0(callers_sp), R1_SP);
991
}
992
993
#if defined(ABI_ELFv2)
994
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
995
// TODO(asmundak): make sure the caller uses R12 as function descriptor
996
// most of the times.
997
if (R12 != r_function_entry) {
998
mr(R12, r_function_entry);
999
}
1000
mtctr(R12);
1001
// Do a call or a branch.
1002
if (and_link) {
1003
bctrl();
1004
} else {
1005
bctr();
1006
}
1007
_last_calls_return_pc = pc();
1008
1009
return _last_calls_return_pc;
1010
}
1011
1012
// Call a C function via a function descriptor and use full C
1013
// calling conventions. Updates and returns _last_calls_return_pc.
1014
address MacroAssembler::call_c(Register r_function_entry) {
1015
return branch_to(r_function_entry, /*and_link=*/true);
1016
}
1017
1018
// For tail calls: only branch, don't link, so callee returns to caller of this function.
1019
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1020
return branch_to(r_function_entry, /*and_link=*/false);
1021
}
1022
1023
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1024
load_const(R12, function_entry, R0);
1025
return branch_to(R12, /*and_link=*/true);
1026
}
1027
1028
#else
1029
// Generic version of a call to C function via a function descriptor
1030
// with variable support for C calling conventions (TOC, ENV, etc.).
1031
// Updates and returns _last_calls_return_pc.
1032
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1033
bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1034
// we emit standard ptrgl glue code here
1035
assert((function_descriptor != R0), "function_descriptor cannot be R0");
1036
1037
// retrieve necessary entries from the function descriptor
1038
ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1039
mtctr(R0);
1040
1041
if (load_toc_of_callee) {
1042
ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1043
}
1044
if (load_env_of_callee) {
1045
ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1046
} else if (load_toc_of_callee) {
1047
li(R11, 0);
1048
}
1049
1050
// do a call or a branch
1051
if (and_link) {
1052
bctrl();
1053
} else {
1054
bctr();
1055
}
1056
_last_calls_return_pc = pc();
1057
1058
return _last_calls_return_pc;
1059
}
1060
1061
// Call a C function via a function descriptor and use full C calling
1062
// conventions.
1063
// We don't use the TOC in generated code, so there is no need to save
1064
// and restore its value.
1065
address MacroAssembler::call_c(Register fd) {
1066
return branch_to(fd, /*and_link=*/true,
1067
/*save toc=*/false,
1068
/*restore toc=*/false,
1069
/*load toc=*/true,
1070
/*load env=*/true);
1071
}
1072
1073
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1074
return branch_to(fd, /*and_link=*/false,
1075
/*save toc=*/false,
1076
/*restore toc=*/false,
1077
/*load toc=*/true,
1078
/*load env=*/true);
1079
}
1080
1081
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1082
if (rt != relocInfo::none) {
1083
// this call needs to be relocatable
1084
if (!ReoptimizeCallSequences
1085
|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1086
|| fd == NULL // support code-size estimation
1087
|| !fd->is_friend_function()
1088
|| fd->entry() == NULL) {
1089
// it's not a friend function as defined by class FunctionDescriptor,
1090
// so do a full call-c here.
1091
load_const(R11, (address)fd, R0);
1092
1093
bool has_env = (fd != NULL && fd->env() != NULL);
1094
return branch_to(R11, /*and_link=*/true,
1095
/*save toc=*/false,
1096
/*restore toc=*/false,
1097
/*load toc=*/true,
1098
/*load env=*/has_env);
1099
} else {
1100
// It's a friend function. Load the entry point and don't care about
1101
// toc and env. Use an optimizable call instruction, but ensure the
1102
// same code-size as in the case of a non-friend function.
1103
nop();
1104
nop();
1105
nop();
1106
bl64_patchable(fd->entry(), rt);
1107
_last_calls_return_pc = pc();
1108
return _last_calls_return_pc;
1109
}
1110
} else {
1111
// This call does not need to be relocatable, do more aggressive
1112
// optimizations.
1113
if (!ReoptimizeCallSequences
1114
|| !fd->is_friend_function()) {
1115
// It's not a friend function as defined by class FunctionDescriptor,
1116
// so do a full call-c here.
1117
load_const(R11, (address)fd, R0);
1118
return branch_to(R11, /*and_link=*/true,
1119
/*save toc=*/false,
1120
/*restore toc=*/false,
1121
/*load toc=*/true,
1122
/*load env=*/true);
1123
} else {
1124
// it's a friend function, load the entry point and don't care about
1125
// toc and env.
1126
address dest = fd->entry();
1127
if (is_within_range_of_b(dest, pc())) {
1128
bl(dest);
1129
} else {
1130
bl64_patchable(dest, rt);
1131
}
1132
_last_calls_return_pc = pc();
1133
return _last_calls_return_pc;
1134
}
1135
}
1136
}
1137
1138
// Call a C function. All constants needed reside in TOC.
1139
//
1140
// Read the address to call from the TOC.
1141
// Read env from TOC, if fd specifies an env.
1142
// Read new TOC from TOC.
1143
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1144
relocInfo::relocType rt, Register toc) {
1145
if (!ReoptimizeCallSequences
1146
|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1147
|| !fd->is_friend_function()) {
1148
// It's not a friend function as defined by class FunctionDescriptor,
1149
// so do a full call-c here.
1150
assert(fd->entry() != NULL, "function must be linked");
1151
1152
AddressLiteral fd_entry(fd->entry());
1153
bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1154
mtctr(R11);
1155
if (fd->env() == NULL) {
1156
li(R11, 0);
1157
nop();
1158
} else {
1159
AddressLiteral fd_env(fd->env());
1160
success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1161
}
1162
AddressLiteral fd_toc(fd->toc());
1163
// Set R2_TOC (load from toc)
1164
success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1165
bctrl();
1166
_last_calls_return_pc = pc();
1167
if (!success) { return NULL; }
1168
} else {
1169
// It's a friend function, load the entry point and don't care about
1170
// toc and env. Use an optimizable call instruction, but ensure the
1171
// same code-size as in the case of a non-friend function.
1172
nop();
1173
bl64_patchable(fd->entry(), rt);
1174
_last_calls_return_pc = pc();
1175
}
1176
return _last_calls_return_pc;
1177
}
1178
#endif // ABI_ELFv2
1179
1180
void MacroAssembler::call_VM_base(Register oop_result,
1181
Register last_java_sp,
1182
address entry_point,
1183
bool check_exceptions) {
1184
BLOCK_COMMENT("call_VM {");
1185
// Determine last_java_sp register.
1186
if (!last_java_sp->is_valid()) {
1187
last_java_sp = R1_SP;
1188
}
1189
set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1190
1191
// ARG1 must hold thread address.
1192
mr(R3_ARG1, R16_thread);
1193
#if defined(ABI_ELFv2)
1194
address return_pc = call_c(entry_point, relocInfo::none);
1195
#else
1196
address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1197
#endif
1198
1199
reset_last_Java_frame();
1200
1201
// Check for pending exceptions.
1202
if (check_exceptions) {
1203
// We don't check for exceptions here.
1204
ShouldNotReachHere();
1205
}
1206
1207
// Get oop result if there is one and reset the value in the thread.
1208
if (oop_result->is_valid()) {
1209
get_vm_result(oop_result);
1210
}
1211
1212
_last_calls_return_pc = return_pc;
1213
BLOCK_COMMENT("} call_VM");
1214
}
1215
1216
void MacroAssembler::call_VM_leaf_base(address entry_point) {
1217
BLOCK_COMMENT("call_VM_leaf {");
1218
#if defined(ABI_ELFv2)
1219
call_c(entry_point, relocInfo::none);
1220
#else
1221
call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1222
#endif
1223
BLOCK_COMMENT("} call_VM_leaf");
1224
}
1225
1226
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1227
call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1228
}
1229
1230
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1231
bool check_exceptions) {
1232
// R3_ARG1 is reserved for the thread.
1233
mr_if_needed(R4_ARG2, arg_1);
1234
call_VM(oop_result, entry_point, check_exceptions);
1235
}
1236
1237
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1238
bool check_exceptions) {
1239
// R3_ARG1 is reserved for the thread
1240
mr_if_needed(R4_ARG2, arg_1);
1241
assert(arg_2 != R4_ARG2, "smashed argument");
1242
mr_if_needed(R5_ARG3, arg_2);
1243
call_VM(oop_result, entry_point, check_exceptions);
1244
}
1245
1246
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1247
bool check_exceptions) {
1248
// R3_ARG1 is reserved for the thread
1249
mr_if_needed(R4_ARG2, arg_1);
1250
assert(arg_2 != R4_ARG2, "smashed argument");
1251
mr_if_needed(R5_ARG3, arg_2);
1252
mr_if_needed(R6_ARG4, arg_3);
1253
call_VM(oop_result, entry_point, check_exceptions);
1254
}
1255
1256
void MacroAssembler::call_VM_leaf(address entry_point) {
1257
call_VM_leaf_base(entry_point);
1258
}
1259
1260
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1261
mr_if_needed(R3_ARG1, arg_1);
1262
call_VM_leaf(entry_point);
1263
}
1264
1265
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1266
mr_if_needed(R3_ARG1, arg_1);
1267
assert(arg_2 != R3_ARG1, "smashed argument");
1268
mr_if_needed(R4_ARG2, arg_2);
1269
call_VM_leaf(entry_point);
1270
}
1271
1272
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1273
mr_if_needed(R3_ARG1, arg_1);
1274
assert(arg_2 != R3_ARG1, "smashed argument");
1275
mr_if_needed(R4_ARG2, arg_2);
1276
assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1277
mr_if_needed(R5_ARG3, arg_3);
1278
call_VM_leaf(entry_point);
1279
}
1280
1281
// Check whether instruction is a read access to the polling page
1282
// which was emitted by load_from_polling_page(..).
1283
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1284
address* polling_address_ptr) {
1285
if (!is_ld(instruction))
1286
return false; // It's not a ld. Fail.
1287
1288
int rt = inv_rt_field(instruction);
1289
int ra = inv_ra_field(instruction);
1290
int ds = inv_ds_field(instruction);
1291
if (!(ds == 0 && ra != 0 && rt == 0)) {
1292
return false; // It's not a ld(r0, X, ra). Fail.
1293
}
1294
1295
if (!ucontext) {
1296
// Set polling address.
1297
if (polling_address_ptr != NULL) {
1298
*polling_address_ptr = NULL;
1299
}
1300
return true; // No ucontext given. Can't check value of ra. Assume true.
1301
}
1302
1303
#ifdef LINUX
1304
// Ucontext given. Check that register ra contains the address of
1305
// the safepoing polling page.
1306
ucontext_t* uc = (ucontext_t*) ucontext;
1307
// Set polling address.
1308
address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1309
if (polling_address_ptr != NULL) {
1310
*polling_address_ptr = addr;
1311
}
1312
return SafepointMechanism::is_poll_address(addr);
1313
#else
1314
// Not on Linux, ucontext must be NULL.
1315
ShouldNotReachHere();
1316
return false;
1317
#endif
1318
}
1319
1320
void MacroAssembler::bang_stack_with_offset(int offset) {
1321
// When increasing the stack, the old stack pointer will be written
1322
// to the new top of stack according to the PPC64 abi.
1323
// Therefore, stack banging is not necessary when increasing
1324
// the stack by <= os::vm_page_size() bytes.
1325
// When increasing the stack by a larger amount, this method is
1326
// called repeatedly to bang the intermediate pages.
1327
1328
// Stack grows down, caller passes positive offset.
1329
assert(offset > 0, "must bang with positive offset");
1330
1331
long stdoffset = -offset;
1332
1333
if (is_simm(stdoffset, 16)) {
1334
// Signed 16 bit offset, a simple std is ok.
1335
if (UseLoadInstructionsForStackBangingPPC64) {
1336
ld(R0, (int)(signed short)stdoffset, R1_SP);
1337
} else {
1338
std(R0,(int)(signed short)stdoffset, R1_SP);
1339
}
1340
} else if (is_simm(stdoffset, 31)) {
1341
const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1342
const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1343
1344
Register tmp = R11;
1345
addis(tmp, R1_SP, hi);
1346
if (UseLoadInstructionsForStackBangingPPC64) {
1347
ld(R0, lo, tmp);
1348
} else {
1349
std(R0, lo, tmp);
1350
}
1351
} else {
1352
ShouldNotReachHere();
1353
}
1354
}
1355
1356
// If instruction is a stack bang of the form
1357
// std R0, x(Ry), (see bang_stack_with_offset())
1358
// stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1359
// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1360
// return the banged address. Otherwise, return 0.
1361
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1362
#ifdef LINUX
1363
ucontext_t* uc = (ucontext_t*) ucontext;
1364
int rs = inv_rs_field(instruction);
1365
int ra = inv_ra_field(instruction);
1366
if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1367
|| (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1368
|| (is_stdu(instruction) && rs == 1)) {
1369
int ds = inv_ds_field(instruction);
1370
// return banged address
1371
return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1372
} else if (is_stdux(instruction) && rs == 1) {
1373
int rb = inv_rb_field(instruction);
1374
address sp = (address)uc->uc_mcontext.regs->gpr[1];
1375
long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1376
return ra != 1 || rb_val >= 0 ? NULL // not a stack bang
1377
: sp + rb_val; // banged address
1378
}
1379
return NULL; // not a stack bang
1380
#else
1381
// workaround not needed on !LINUX :-)
1382
ShouldNotCallThis();
1383
return NULL;
1384
#endif
1385
}
1386
1387
void MacroAssembler::reserved_stack_check(Register return_pc) {
1388
// Test if reserved zone needs to be enabled.
1389
Label no_reserved_zone_enabling;
1390
1391
ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1392
cmpld(CCR0, R1_SP, R0);
1393
blt_predict_taken(CCR0, no_reserved_zone_enabling);
1394
1395
// Enable reserved zone again, throw stack overflow exception.
1396
push_frame_reg_args(0, R0);
1397
call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1398
pop_frame();
1399
mtlr(return_pc);
1400
load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1401
mtctr(R0);
1402
bctr();
1403
1404
should_not_reach_here();
1405
1406
bind(no_reserved_zone_enabling);
1407
}
1408
1409
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1410
bool cmpxchgx_hint) {
1411
Label retry;
1412
bind(retry);
1413
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1414
stdcx_(exchange_value, addr_base);
1415
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1416
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1417
} else {
1418
bne( CCR0, retry); // StXcx_ sets CCR0.
1419
}
1420
}
1421
1422
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1423
Register tmp, bool cmpxchgx_hint) {
1424
Label retry;
1425
bind(retry);
1426
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427
add(tmp, dest_current_value, inc_value);
1428
stdcx_(tmp, addr_base);
1429
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1430
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1431
} else {
1432
bne( CCR0, retry); // StXcx_ sets CCR0.
1433
}
1434
}
1435
1436
// Word/sub-word atomic helper functions
1437
1438
// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1439
// Only signed types are supported with size < 4.
1440
// Atomic add always kills tmp1.
1441
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1442
Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1443
bool cmpxchgx_hint, bool is_add, int size) {
1444
// Sub-word instructions are available since Power 8.
1445
// For older processors, instruction_type != size holds, and we
1446
// emulate the sub-word instructions by constructing a 4-byte value
1447
// that leaves the other bytes unchanged.
1448
const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1449
1450
Label retry;
1451
Register shift_amount = noreg,
1452
val32 = dest_current_value,
1453
modval = is_add ? tmp1 : exchange_value;
1454
1455
if (instruction_type != size) {
1456
assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1457
modval = tmp1;
1458
shift_amount = tmp2;
1459
val32 = tmp3;
1460
// Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1461
#ifdef VM_LITTLE_ENDIAN
1462
rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1463
clrrdi(addr_base, addr_base, 2);
1464
#else
1465
xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1466
clrrdi(addr_base, addr_base, 2);
1467
rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1468
#endif
1469
}
1470
1471
// atomic emulation loop
1472
bind(retry);
1473
1474
switch (instruction_type) {
1475
case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1476
case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1477
case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1478
default: ShouldNotReachHere();
1479
}
1480
1481
if (instruction_type != size) {
1482
srw(dest_current_value, val32, shift_amount);
1483
}
1484
1485
if (is_add) { add(modval, dest_current_value, exchange_value); }
1486
1487
if (instruction_type != size) {
1488
// Transform exchange value such that the replacement can be done by one xor instruction.
1489
xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1490
clrldi(modval, modval, (size == 1) ? 56 : 48);
1491
slw(modval, modval, shift_amount);
1492
xorr(modval, val32, modval);
1493
}
1494
1495
switch (instruction_type) {
1496
case 4: stwcx_(modval, addr_base); break;
1497
case 2: sthcx_(modval, addr_base); break;
1498
case 1: stbcx_(modval, addr_base); break;
1499
default: ShouldNotReachHere();
1500
}
1501
1502
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1503
bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1504
} else {
1505
bne( CCR0, retry); // StXcx_ sets CCR0.
1506
}
1507
1508
// l?arx zero-extends, but Java wants byte/short values sign-extended.
1509
if (size == 1) {
1510
extsb(dest_current_value, dest_current_value);
1511
} else if (size == 2) {
1512
extsh(dest_current_value, dest_current_value);
1513
};
1514
}
1515
1516
// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1517
// Only signed types are supported with size < 4.
1518
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1519
Register compare_value, Register exchange_value,
1520
Register addr_base, Register tmp1, Register tmp2,
1521
Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1522
// Sub-word instructions are available since Power 8.
1523
// For older processors, instruction_type != size holds, and we
1524
// emulate the sub-word instructions by constructing a 4-byte value
1525
// that leaves the other bytes unchanged.
1526
const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1527
1528
Register shift_amount = noreg,
1529
val32 = dest_current_value,
1530
modval = exchange_value;
1531
1532
  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }
1551
1552
// atomic emulation loop
1553
bind(retry);
1554
1555
switch (instruction_type) {
1556
case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1557
case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1558
case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1559
default: ShouldNotReachHere();
1560
}
1561
1562
if (instruction_type != size) {
1563
srw(dest_current_value, val32, shift_amount);
1564
}
1565
if (size == 1) {
1566
extsb(dest_current_value, dest_current_value);
1567
} else if (size == 2) {
1568
extsh(dest_current_value, dest_current_value);
1569
}
1570
1571
cmpw(flag, dest_current_value, compare_value);
1572
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1573
bne_predict_not_taken(flag, failed);
1574
} else {
1575
bne( flag, failed);
1576
}
1577
// branch to done => (flag == ne), (dest_current_value != compare_value)
1578
// fall through => (flag == eq), (dest_current_value == compare_value)
1579
1580
if (instruction_type != size) {
1581
xorr(modval, val32, exchange_value);
1582
}
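// modval = val32 with only the addressed lane changed: exchange_value was
// pre-transformed above to (compare_value ^ exchange_value), masked to 8/16 bits
// and shifted into position, and this point is only reached when the loaded lane
// equals compare_value, so the xor turns exactly that lane into the new value.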
1583
1584
switch (instruction_type) {
1585
case 4: stwcx_(modval, addr_base); break;
1586
case 2: sthcx_(modval, addr_base); break;
1587
case 1: stbcx_(modval, addr_base); break;
1588
default: ShouldNotReachHere();
1589
}
1590
}
1591
1592
// CmpxchgX sets condition register to cmpX(current, compare).
1593
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1594
Register compare_value, Register exchange_value,
1595
Register addr_base, Register tmp1, Register tmp2,
1596
int semantics, bool cmpxchgx_hint,
1597
Register int_flag_success, bool contention_hint, bool weak, int size) {
1598
Label retry;
1599
Label failed;
1600
Label done;
1601
1602
// Save one branch if result is returned via register and
1603
// result register is different from the other ones.
1604
bool use_result_reg = (int_flag_success != noreg);
1605
bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1606
int_flag_success != exchange_value && int_flag_success != addr_base &&
1607
int_flag_success != tmp1 && int_flag_success != tmp2);
1608
assert(!weak || flag == CCR0, "weak only supported with CCR0");
1609
assert(size == 1 || size == 2 || size == 4, "unsupported");
1610
1611
if (use_result_reg && preset_result_reg) {
1612
li(int_flag_success, 0); // preset (assume cas failed)
1613
}
1614
1615
// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1616
if (contention_hint) { // Don't try to reserve if cmp fails.
1617
switch (size) {
1618
case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
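// (PPC has no sign-extending byte load, hence lbz followed by extsb;
//  lha below sign-extends the halfword directly.)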
1619
case 2: lha(dest_current_value, 0, addr_base); break;
1620
case 4: lwz(dest_current_value, 0, addr_base); break;
1621
default: ShouldNotReachHere();
1622
}
1623
cmpw(flag, dest_current_value, compare_value);
1624
bne(flag, failed);
1625
}
1626
1627
// release/fence semantics
1628
if (semantics & MemBarRel) {
1629
release();
1630
}
1631
1632
cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1633
retry, failed, cmpxchgx_hint, size);
1634
if (!weak || use_result_reg) {
1635
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1636
bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1637
} else {
1638
bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1639
}
1640
}
1641
// fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1642
1643
// Result in register (must do this at the end because int_flag_success can be the
1644
// same register as one above).
1645
if (use_result_reg) {
1646
li(int_flag_success, 1);
1647
}
1648
1649
if (semantics & MemBarFenceAfter) {
1650
fence();
1651
} else if (semantics & MemBarAcq) {
1652
isync();
1653
}
1654
1655
if (use_result_reg && !preset_result_reg) {
1656
b(done);
1657
}
1658
1659
bind(failed);
1660
if (use_result_reg && !preset_result_reg) {
1661
li(int_flag_success, 0);
1662
}
1663
1664
bind(done);
1665
// (flag == ne) => (dest_current_value != compare_value), (!swapped)
1666
// (flag == eq) => (dest_current_value == compare_value), ( swapped)
1667
}
1668
1669
// Performs an atomic compare-exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare-exchange, the value is tested beforehand.
// Several special cases exist to avoid generating unnecessary information.
//
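// Representative call (sketch only, modeled on the uses further down in this file,
// e.g. biased_locking_enter() and compiler_fast_lock_object(); the names tmp,
// expected, new_val, addr and failed_label are placeholders):
//
//   cmpxchgd(/*flag=*/CCR0, /*current_value=*/tmp, /*compare_value=*/expected,
//            /*exchange_value=*/new_val, /*where=*/addr,
//            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
//            MacroAssembler::cmpxchgx_hint_acquire_lock(),
//            noreg, &failed_label);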
1686
void MacroAssembler::cmpxchgd(ConditionRegister flag,
1687
Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1688
Register addr_base, int semantics, bool cmpxchgx_hint,
1689
Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1690
Label retry;
1691
Label failed_int;
1692
Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1693
Label done;
1694
1695
// Save one branch if result is returned via register and result register is different from the other ones.
1696
bool use_result_reg = (int_flag_success!=noreg);
1697
bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1698
int_flag_success!=exchange_value && int_flag_success!=addr_base);
1699
assert(!weak || flag == CCR0, "weak only supported with CCR0");
1700
assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1701
1702
if (use_result_reg && preset_result_reg) {
1703
li(int_flag_success, 0); // preset (assume cas failed)
1704
}
1705
1706
// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1707
if (contention_hint) { // Don't try to reserve if cmp fails.
1708
ld(dest_current_value, 0, addr_base);
1709
cmpd(flag, compare_value, dest_current_value);
1710
bne(flag, failed);
1711
}
1712
1713
// release/fence semantics
1714
if (semantics & MemBarRel) {
1715
release();
1716
}
1717
1718
// atomic emulation loop
1719
bind(retry);
1720
1721
ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1722
cmpd(flag, compare_value, dest_current_value);
1723
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1724
bne_predict_not_taken(flag, failed);
1725
} else {
1726
bne( flag, failed);
1727
}
1728
1729
stdcx_(exchange_value, addr_base);
1730
if (!weak || use_result_reg || failed_ext) {
1731
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1732
bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1733
} else {
1734
bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1735
}
1736
}
1737
1738
// result in register (must do this at the end because int_flag_success can be the same register as one above)
1739
if (use_result_reg) {
1740
li(int_flag_success, 1);
1741
}
1742
1743
if (semantics & MemBarFenceAfter) {
1744
fence();
1745
} else if (semantics & MemBarAcq) {
1746
isync();
1747
}
1748
1749
if (use_result_reg && !preset_result_reg) {
1750
b(done);
1751
}
1752
1753
bind(failed_int);
1754
if (use_result_reg && !preset_result_reg) {
1755
li(int_flag_success, 0);
1756
}
1757
1758
bind(done);
1759
// (flag == ne) => (dest_current_value != compare_value), (!swapped)
1760
// (flag == eq) => (dest_current_value == compare_value), ( swapped)
1761
}
1762
1763
// Look up the method for a megamorphic invokeinterface call.
1764
// The target method is determined by <intf_klass, itable_index>.
1765
// The receiver klass is in recv_klass.
1766
// On success, the result will be in method_result, and execution falls through.
1767
// On failure, execution transfers to the given label.
1768
void MacroAssembler::lookup_interface_method(Register recv_klass,
1769
Register intf_klass,
1770
RegisterOrConstant itable_index,
1771
Register method_result,
1772
Register scan_temp,
1773
Register temp2,
1774
Label& L_no_such_interface,
1775
bool return_method) {
1776
assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1777
1778
// Compute start of first itableOffsetEntry (which is at the end of the vtable).
1779
int vtable_base = in_bytes(Klass::vtable_start_offset());
1780
int itentry_off = itableMethodEntry::method_offset_in_bytes();
1781
int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1782
int scan_step = itableOffsetEntry::size() * wordSize;
1783
int log_vte_size = exact_log2(vtableEntry::size_in_bytes());
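// The itable starts right after the embedded vtable, so the first itableOffsetEntry
// lives at recv_klass + vtable_base + (vtable_length << log_vte_size); the
// lwz/sldi/addi/add sequence below computes exactly that address into scan_temp.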
1784
1785
lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1786
// %%% We should store the aligned, prescaled offset in the klassoop.
1787
// Then the next several instructions would fold away.
1788
1789
sldi(scan_temp, scan_temp, log_vte_size);
1790
addi(scan_temp, scan_temp, vtable_base);
1791
add(scan_temp, recv_klass, scan_temp);
1792
1793
// Adjust recv_klass by scaled itable_index, so we can free itable_index.
1794
if (return_method) {
1795
if (itable_index.is_register()) {
1796
Register itable_offset = itable_index.as_register();
1797
sldi(method_result, itable_offset, logMEsize);
1798
if (itentry_off) { addi(method_result, method_result, itentry_off); }
1799
add(method_result, method_result, recv_klass);
1800
} else {
1801
long itable_offset = (long)itable_index.as_constant();
1802
// static address, no relocation
1803
add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1804
}
1805
}
1806
1807
// for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1808
// if (scan->interface() == intf) {
1809
// result = (klass + scan->offset() + itable_index);
1810
// }
1811
// }
1812
Label search, found_method;
1813
1814
for (int peel = 1; peel >= 0; peel--) {
1815
// %%%% Could load both offset and interface in one ldx, if they were
1816
// in the opposite order. This would save a load.
1817
ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1818
1819
// Check that this entry is non-null. A null entry means that
1820
// the receiver class doesn't implement the interface, and wasn't the
1821
// same as when the caller was compiled.
1822
cmpd(CCR0, temp2, intf_klass);
1823
1824
if (peel) {
1825
beq(CCR0, found_method);
1826
} else {
1827
bne(CCR0, search);
1828
// (invert the test to fall through to found_method...)
1829
}
1830
1831
if (!peel) break;
1832
1833
bind(search);
1834
1835
cmpdi(CCR0, temp2, 0);
1836
beq(CCR0, L_no_such_interface);
1837
addi(scan_temp, scan_temp, scan_step);
1838
}
1839
1840
bind(found_method);
1841
1842
// Got a hit.
1843
if (return_method) {
1844
int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1845
lwz(scan_temp, ito_offset, scan_temp);
1846
ldx(method_result, scan_temp, method_result);
1847
}
1848
}
1849
1850
// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}
1868
1869
/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1870
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1871
Register super_klass,
1872
Register temp1_reg,
1873
Register temp2_reg,
1874
Label* L_success,
1875
Label* L_failure,
1876
Label* L_slow_path,
1877
RegisterOrConstant super_check_offset) {
1878
1879
const Register check_cache_offset = temp1_reg;
1880
const Register cached_super = temp2_reg;
1881
1882
assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1883
1884
int sco_offset = in_bytes(Klass::super_check_offset_offset());
1885
int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1886
1887
bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1888
bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1889
1890
Label L_fallthrough;
1891
int label_nulls = 0;
1892
if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
1893
if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
1894
if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1895
assert(label_nulls <= 1 ||
1896
(L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1897
"at most one NULL in the batch, usually");
1898
1899
// If the pointers are equal, we are done (e.g., String[] elements).
1900
// This self-check enables sharing of secondary supertype arrays among
1901
// non-primary types such as array-of-interface. Otherwise, each such
1902
// type would need its own customized SSA.
1903
// We move this check to the front of the fast path because many
1904
// type checks are in fact trivially successful in this manner,
1905
// so we get a nicely predicted branch right at the start of the check.
1906
cmpd(CCR0, sub_klass, super_klass);
1907
beq(CCR0, *L_success);
1908
1909
// Check the supertype display:
1910
if (must_load_sco) {
1911
// The super check offset is always positive...
1912
lwz(check_cache_offset, sco_offset, super_klass);
1913
super_check_offset = RegisterOrConstant(check_cache_offset);
1914
// super_check_offset is register.
1915
assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1916
}
1917
// The loaded value is the offset from KlassOopDesc.
1918
1919
ld(cached_super, super_check_offset, sub_klass);
1920
cmpd(CCR0, cached_super, super_klass);
1921
1922
// This check has worked decisively for primary supers.
1923
// Secondary supers are sought in the super_cache ('super_cache_addr').
1924
// (Secondary supers are interfaces and very deeply nested subtypes.)
1925
// This works in the same check above because of a tricky aliasing
1926
// between the super_cache and the primary super display elements.
1927
// (The 'super_check_addr' can address either, as the case requires.)
1928
// Note that the cache is updated below if it does not help us find
1929
// what we need immediately.
1930
// So if it was a primary super, we can just fail immediately.
1931
// Otherwise, it's the slow path for us (no success at this point).
1932
1933
#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1934
1935
if (super_check_offset.is_register()) {
1936
beq(CCR0, *L_success);
1937
cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1938
if (L_failure == &L_fallthrough) {
1939
beq(CCR0, *L_slow_path);
1940
} else {
1941
bne(CCR0, *L_failure);
1942
FINAL_JUMP(*L_slow_path);
1943
}
1944
} else {
1945
if (super_check_offset.as_constant() == sc_offset) {
1946
// Need a slow path; fast failure is impossible.
1947
if (L_slow_path == &L_fallthrough) {
1948
beq(CCR0, *L_success);
1949
} else {
1950
bne(CCR0, *L_slow_path);
1951
FINAL_JUMP(*L_success);
1952
}
1953
} else {
1954
// No slow path; it's a fast decision.
1955
if (L_failure == &L_fallthrough) {
1956
beq(CCR0, *L_success);
1957
} else {
1958
bne(CCR0, *L_failure);
1959
FINAL_JUMP(*L_success);
1960
}
1961
}
1962
}
1963
1964
bind(L_fallthrough);
1965
#undef FINAL_JUMP
1966
}
1967
1968
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1969
Register super_klass,
1970
Register temp1_reg,
1971
Register temp2_reg,
1972
Label* L_success,
1973
Register result_reg) {
1974
const Register array_ptr = temp1_reg; // current value from cache array
1975
const Register temp = temp2_reg;
1976
1977
assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1978
1979
int source_offset = in_bytes(Klass::secondary_supers_offset());
1980
int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1981
1982
int length_offset = Array<Klass*>::length_offset_in_bytes();
1983
int base_offset = Array<Klass*>::base_offset_in_bytes();
1984
1985
Label hit, loop, failure, fallthru;
1986
1987
ld(array_ptr, source_offset, sub_klass);
1988
1989
// TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1990
lwz(temp, length_offset, array_ptr);
1991
cmpwi(CCR0, temp, 0);
1992
beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1993
1994
mtctr(temp); // load ctr
1995
1996
bind(loop);
1997
// Oops in table are NO MORE compressed.
1998
ld(temp, base_offset, array_ptr);
1999
cmpd(CCR0, temp, super_klass);
2000
beq(CCR0, hit);
2001
addi(array_ptr, array_ptr, BytesPerWord);
2002
bdnz(loop);
2003
2004
bind(failure);
2005
if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2006
b(fallthru);
2007
2008
bind(hit);
2009
std(super_klass, target_offset, sub_klass); // save result to cache
2010
if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2011
if (L_success != NULL) { b(*L_success); }
2012
else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2013
2014
bind(fallthru);
2015
}
2016
2017
// Try fast path, then go to slow one if not successful
2018
void MacroAssembler::check_klass_subtype(Register sub_klass,
2019
Register super_klass,
2020
Register temp1_reg,
2021
Register temp2_reg,
2022
Label& L_success) {
2023
Label L_failure;
2024
check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2025
check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2026
bind(L_failure); // Fallthru if not successful.
2027
}
2028
2029
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2030
assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2031
2032
Label L_fallthrough;
2033
if (L_fast_path == NULL) {
2034
L_fast_path = &L_fallthrough;
2035
} else if (L_slow_path == NULL) {
2036
L_slow_path = &L_fallthrough;
2037
}
2038
2039
// Fast path check: class is fully initialized
2040
lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2041
cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2042
beq(CCR0, *L_fast_path);
2043
2044
// Fast path check: current thread is initializer thread
2045
ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2046
cmpd(CCR0, thread, R0);
2047
if (L_slow_path == &L_fallthrough) {
2048
beq(CCR0, *L_fast_path);
2049
} else if (L_fast_path == &L_fallthrough) {
2050
bne(CCR0, *L_slow_path);
2051
} else {
2052
Unimplemented();
2053
}
2054
2055
bind(L_fallthrough);
2056
}
2057
2058
RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2059
Register temp_reg,
2060
int extra_slot_offset) {
2061
// cf. TemplateTable::prepare_invoke(), if (load_receiver).
2062
int stackElementSize = Interpreter::stackElementSize;
2063
int offset = extra_slot_offset * stackElementSize;
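// e.g. assuming an 8-byte interpreter stack element: arg_slot == 2 and
// extra_slot_offset == 1 yield an offset of (2 + 1) * 8 == 24 bytes
// (constant case below).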
2064
if (arg_slot.is_constant()) {
2065
offset += arg_slot.as_constant() * stackElementSize;
2066
return offset;
2067
} else {
2068
assert(temp_reg != noreg, "must specify");
2069
sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2070
if (offset != 0)
2071
addi(temp_reg, temp_reg, offset);
2072
return temp_reg;
2073
}
2074
}
2075
2076
// Supports temp2_reg = R0.
2077
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2078
Register mark_reg, Register temp_reg,
2079
Register temp2_reg, Label& done, Label* slow_case) {
2080
assert(UseBiasedLocking, "why call this otherwise?");
2081
2082
#ifdef ASSERT
2083
assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2084
#endif
2085
2086
Label cas_label;
2087
2088
// Branch to done if fast path fails and no slow_case provided.
2089
Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2090
2091
// Biased locking
2092
// See whether the lock is currently biased toward our thread and
2093
// whether the epoch is still valid
2094
// Note that the runtime guarantees sufficient alignment of JavaThread
2095
// pointers to allow age to be placed into low bits
2096
assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2097
"biased locking makes assumptions about bit layout");
2098
2099
if (PrintBiasedLockingStatistics) {
2100
load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2101
lwzx(temp_reg, temp2_reg);
2102
addi(temp_reg, temp_reg, 1);
2103
stwx(temp_reg, temp2_reg);
2104
}
2105
2106
andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2107
cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2108
bne(cr_reg, cas_label);
2109
2110
load_klass(temp_reg, obj_reg);
2111
2112
load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2113
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2114
orr(temp_reg, R16_thread, temp_reg);
2115
xorr(temp_reg, mark_reg, temp_reg);
2116
andr(temp_reg, temp_reg, temp2_reg);
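// temp_reg is now (mark ^ (thread | prototype_header)) with the age bits masked out;
// it is zero iff the object is already biased to the current thread with the current
// epoch and bias pattern, in which case the beq below takes the fast path to done.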
2117
cmpdi(cr_reg, temp_reg, 0);
2118
if (PrintBiasedLockingStatistics) {
2119
Label l;
2120
bne(cr_reg, l);
2121
load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2122
lwzx(mark_reg, temp2_reg);
2123
addi(mark_reg, mark_reg, 1);
2124
stwx(mark_reg, temp2_reg);
2125
// restore mark_reg
2126
ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2127
bind(l);
2128
}
2129
beq(cr_reg, done);
2130
2131
Label try_revoke_bias;
2132
Label try_rebias;
2133
2134
// At this point we know that the header has the bias pattern and
2135
// that we are not the bias owner in the current epoch. We need to
2136
// figure out more details about the state of the header in order to
2137
// know what operations can be legally performed on the object's
2138
// header.
2139
2140
// If the low three bits in the xor result aren't clear, that means
2141
// the prototype header is no longer biased and we have to revoke
2142
// the bias on this object.
2143
andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2144
cmpwi(cr_reg, temp2_reg, 0);
2145
bne(cr_reg, try_revoke_bias);
2146
2147
// Biasing is still enabled for this data type. See whether the
2148
// epoch of the current bias is still valid, meaning that the epoch
2149
// bits of the mark word are equal to the epoch bits of the
2150
// prototype header. (Note that the prototype header's epoch bits
2151
// only change at a safepoint.) If not, attempt to rebias the object
2152
// toward the current thread. Note that we must be absolutely sure
2153
// that the current epoch is invalid in order to do this because
2154
// otherwise the manipulations it performs on the mark word are
2155
// illegal.
2156
2157
int shift_amount = 64 - markWord::epoch_shift;
2158
// rotate epoch bits to right (little) end and set other bits to 0
2159
// [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2160
rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2161
// branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2162
bne(CCR0, try_rebias);
2163
2164
// The epoch of the current bias is still valid but we know nothing
2165
// about the owner; it might be set or it might be clear. Try to
2166
// acquire the bias of the object using an atomic operation. If this
2167
// fails we will go in to the runtime to revoke the object's bias.
2168
// Note that we first construct the presumed unbiased header so we
2169
// don't accidentally blow away another thread's valid bias.
2170
andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2171
markWord::age_mask_in_place |
2172
markWord::epoch_mask_in_place));
2173
orr(temp_reg, R16_thread, mark_reg);
2174
2175
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2176
2177
// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2178
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2179
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2180
/*where=*/obj_reg,
2181
MacroAssembler::MemBarAcq,
2182
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2183
noreg, slow_case_int); // bail out if failed
2184
2185
// If the biasing toward our thread failed, this means that
2186
// another thread succeeded in biasing it toward itself and we
2187
// need to revoke that bias. The revocation will occur in the
2188
// interpreter runtime in the slow case.
2189
if (PrintBiasedLockingStatistics) {
2190
load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2191
lwzx(temp_reg, temp2_reg);
2192
addi(temp_reg, temp_reg, 1);
2193
stwx(temp_reg, temp2_reg);
2194
}
2195
b(done);
2196
2197
bind(try_rebias);
2198
// At this point we know the epoch has expired, meaning that the
2199
// current "bias owner", if any, is actually invalid. Under these
2200
// circumstances _only_, we are allowed to use the current header's
2201
// value as the comparison value when doing the cas to acquire the
2202
// bias in the current epoch. In other words, we allow transfer of
2203
// the bias from one thread to another directly in this situation.
2204
load_klass(temp_reg, obj_reg);
2205
andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2206
orr(temp2_reg, R16_thread, temp2_reg);
2207
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2208
orr(temp_reg, temp2_reg, temp_reg);
2209
2210
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2211
2212
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2213
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2214
/*where=*/obj_reg,
2215
MacroAssembler::MemBarAcq,
2216
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2217
noreg, slow_case_int); // bail out if failed
2218
2219
// If the biasing toward our thread failed, this means that
2220
// another thread succeeded in biasing it toward itself and we
2221
// need to revoke that bias. The revocation will occur in the
2222
// interpreter runtime in the slow case.
2223
if (PrintBiasedLockingStatistics) {
2224
load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2225
lwzx(temp_reg, temp2_reg);
2226
addi(temp_reg, temp_reg, 1);
2227
stwx(temp_reg, temp2_reg);
2228
}
2229
b(done);
2230
2231
bind(try_revoke_bias);
2232
// The prototype mark in the klass doesn't have the bias bit set any
2233
// more, indicating that objects of this data type are not supposed
2234
// to be biased any more. We are going to try to reset the mark of
2235
// this object to the prototype value and fall through to the
2236
// CAS-based locking scheme. Note that if our CAS fails, it means
2237
// that another thread raced us for the privilege of revoking the
2238
// bias of this particular object, so it's okay to continue in the
2239
// normal locking code.
2240
load_klass(temp_reg, obj_reg);
2241
ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2242
andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2243
orr(temp_reg, temp_reg, temp2_reg);
2244
2245
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2246
2247
// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2248
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2249
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2250
/*where=*/obj_reg,
2251
MacroAssembler::MemBarAcq,
2252
MacroAssembler::cmpxchgx_hint_acquire_lock());
2253
2254
// reload markWord in mark_reg before continuing with lightweight locking
2255
ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2256
2257
// Fall through to the normal CAS-based lock, because no matter what
2258
// the result of the above CAS, some thread must have succeeded in
2259
// removing the bias bit from the object's header.
2260
if (PrintBiasedLockingStatistics) {
2261
Label l;
2262
bne(cr_reg, l);
2263
load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2264
lwzx(temp_reg, temp2_reg);
2265
addi(temp_reg, temp_reg, 1);
2266
stwx(temp_reg, temp2_reg);
2267
bind(l);
2268
}
2269
2270
bind(cas_label);
2271
}
2272
2273
void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2274
// Check for biased locking unlock case, which is a no-op
2275
// Note: we do not have to check the thread ID for two reasons.
2276
// First, the interpreter checks for IllegalMonitorStateException at
2277
// a higher level. Second, if the bias was revoked while we held the
2278
// lock, the object could not be rebiased toward another thread, so
2279
// the bias bit would be clear.
2280
2281
ld(temp_reg, 0, mark_addr);
2282
andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2283
2284
cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2285
beq(cr_reg, done);
2286
}
2287
2288
// allocation (for C1)
2289
void MacroAssembler::eden_allocate(
2290
Register obj, // result: pointer to object after successful allocation
2291
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2292
int con_size_in_bytes, // object size in bytes if known at compile time
2293
Register t1, // temp register
2294
Register t2, // temp register
2295
Label& slow_case // continuation point if fast allocation fails
2296
) {
2297
b(slow_case);
2298
}
2299
2300
void MacroAssembler::tlab_allocate(
2301
Register obj, // result: pointer to object after successful allocation
2302
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2303
int con_size_in_bytes, // object size in bytes if known at compile time
2304
Register t1, // temp register
2305
Label& slow_case // continuation point if fast allocation fails
2306
) {
2307
// make sure arguments make sense
2308
assert_different_registers(obj, var_size_in_bytes, t1);
2309
assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2310
assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2311
2312
const Register new_top = t1;
2313
//verify_tlab(); not implemented
2314
2315
ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2316
ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2317
if (var_size_in_bytes == noreg) {
2318
addi(new_top, obj, con_size_in_bytes);
2319
} else {
2320
add(new_top, obj, var_size_in_bytes);
2321
}
2322
cmpld(CCR0, new_top, R0);
2323
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
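// Classic bump-pointer allocation: obj holds the old TLAB top, new_top = top + size,
// and the unsigned compare above sends us to slow_case when new_top would exceed
// tlab_end; otherwise new_top is published as the new top below.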
2324
2325
#ifdef ASSERT
2326
// make sure new free pointer is properly aligned
2327
{
2328
Label L;
2329
andi_(R0, new_top, MinObjAlignmentInBytesMask);
2330
beq(CCR0, L);
2331
stop("updated TLAB free is not properly aligned");
2332
bind(L);
2333
}
2334
#endif // ASSERT
2335
2336
// update the tlab top pointer
2337
std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2338
//verify_tlab(); not implemented
2339
}
2340
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2341
unimplemented("incr_allocated_bytes");
2342
}
2343
2344
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2345
int insts_call_instruction_offset, Register Rtoc) {
2346
// Start the stub.
2347
address stub = start_a_stub(64);
2348
if (stub == NULL) { return NULL; } // CodeCache full: bail out
2349
2350
// Create a trampoline stub relocation which relates this trampoline stub
2351
// with the call instruction at insts_call_instruction_offset in the
2352
// instructions code-section.
2353
relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2354
const int stub_start_offset = offset();
2355
2356
// For java_to_interp stubs we use R11_scratch1 as scratch register
2357
// and in call trampoline stubs we use R12_scratch2. This way we
2358
// can distinguish them (see is_NativeCallTrampolineStub_at()).
2359
Register reg_scratch = R12_scratch2;
2360
2361
// Now, create the trampoline stub's code:
2362
// - load the TOC
2363
// - load the call target from the constant pool
2364
// - call
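// i.e. roughly the following sequence (the ld may be split into addis + ld
// for large constant-pool offsets):
//   ld    R12, <destination_toc_offset>(TOC)
//   mtctr R12
//   bctr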
2365
if (Rtoc == noreg) {
2366
calculate_address_from_global_toc(reg_scratch, method_toc());
2367
Rtoc = reg_scratch;
2368
}
2369
2370
ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2371
mtctr(reg_scratch);
2372
bctr();
2373
2374
const address stub_start_addr = addr_at(stub_start_offset);
2375
2376
// Assert that the encoded destination_toc_offset can be identified and that it is correct.
2377
assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2378
"encoded offset into the constant pool must match");
2379
// Trampoline_stub_size should be good.
2380
assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2381
assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2382
2383
// End the stub.
2384
end_a_stub();
2385
return stub;
2386
}
2387
2388
// TM on PPC64.
2389
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2390
Label retry;
2391
bind(retry);
2392
ldarx(result, addr, /*hint*/ false);
2393
addi(result, result, simm16);
2394
stdcx_(result, addr);
2395
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2396
bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2397
} else {
2398
bne( CCR0, retry); // stXcx_ sets CCR0
2399
}
2400
}
2401
2402
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2403
Label retry;
2404
bind(retry);
2405
lwarx(result, addr, /*hint*/ false);
2406
ori(result, result, uimm16);
2407
stwcx_(result, addr);
2408
if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2409
bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2410
} else {
2411
bne( CCR0, retry); // stXcx_ sets CCR0
2412
}
2413
}
2414
2415
#if INCLUDE_RTM_OPT
2416
2417
// Update rtm_counters based on abort status
2418
// input: abort_status
2419
// rtm_counters_Reg (RTMLockingCounters*)
2420
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2421
// Mapping to keep PreciseRTMLockingStatistics similar to x86.
2422
// x86 ppc (! means inverted, ? means not the same)
2423
// 0 31 Set if abort caused by XABORT instruction.
2424
// 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2425
// 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2426
// 3 10 Set if an internal buffer overflowed.
2427
// 4 ?12 Set if a debug breakpoint was hit.
2428
// 5 ?32 Set if an abort occurred during execution of a nested transaction.
2429
const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2430
tm_failure_persistent,
2431
tm_non_trans_cf,
2432
tm_trans_cf,
2433
tm_footprint_of,
2434
tm_failure_code,
2435
tm_transaction_level};
2436
2437
const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2438
const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2439
2440
const int bit2counter_map[][num_counters] =
2441
// 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2442
// Inverted logic means that if a bit is set don't count it, or vice-versa.
2443
// Care must be taken when mapping bits to counters as bits for a given
2444
// counter must be mutually exclusive. Otherwise, the counter will be
2445
// incremented more than once.
2446
// counters:
2447
// 0 1 2 3 4 5
2448
// abort , persist, conflict, overflow, debug , nested bits:
2449
{{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort
2450
{ 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent
2451
{ 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf
2452
{ 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf
2453
{ 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of
2454
{ 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4
2455
{ 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1
2456
// ...
2457
2458
// Move abort_status value to R0 and use abort_status register as a
2459
// temporary register because R0 as third operand in ld/std is treated
2460
// as base address zero (value). Likewise, R0 as second operand in addi
2461
// is problematic because it amounts to li.
2462
const Register temp_Reg = abort_status;
2463
const Register abort_status_R0 = R0;
2464
mr(abort_status_R0, abort_status);
2465
2466
// Increment total abort counter.
2467
int counters_offs = RTMLockingCounters::abort_count_offset();
2468
ld(temp_Reg, counters_offs, rtm_counters_Reg);
2469
addi(temp_Reg, temp_Reg, 1);
2470
std(temp_Reg, counters_offs, rtm_counters_Reg);
2471
2472
// Increment specific abort counters.
2473
if (PrintPreciseRTMLockingStatistics) {
2474
2475
// #0 counter offset.
2476
int abortX_offs = RTMLockingCounters::abortX_count_offset();
2477
2478
for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2479
for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2480
if (bit2counter_map[nbit][ncounter] != 0) {
2481
Label check_abort;
2482
int abort_counter_offs = abortX_offs + (ncounter << 3);
2483
2484
if (failure_bit[nbit] == tm_transaction_level) {
2485
// Don't check outer transaction, TL = 1 (bit 63). Hence only
2486
// 11 bits in the TL field are checked to find out if failure
2487
// occurred in a nested transaction. This check also matches
2488
// the case when nesting_of = 1 (nesting overflow).
2489
rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2490
} else if (failure_bit[nbit] == tm_failure_code) {
2491
// Check failure code for trap or illegal caught in TM.
2492
// Bits 0:7 are tested as bit 7 (persistent) is copied from
2493
// tabort or treclaim source operand.
2494
// On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2495
rldicl(temp_Reg, abort_status_R0, 8, 56);
2496
cmpdi(CCR0, temp_Reg, 0xD4);
2497
} else {
2498
rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2499
}
2500
2501
if (bit2counter_map[nbit][ncounter] == 1) {
2502
beq(CCR0, check_abort);
2503
} else {
2504
bne(CCR0, check_abort);
2505
}
2506
2507
// We don't increment atomically.
2508
ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2509
addi(temp_Reg, temp_Reg, 1);
2510
std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2511
2512
bind(check_abort);
2513
}
2514
}
2515
}
2516
}
2517
// Restore abort_status.
2518
mr(abort_status, abort_status_R0);
2519
}
2520
2521
// Branch if (random & (count-1) != 0), count is 2^n
2522
// tmp and CR0 are killed
2523
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2524
mftb(tmp);
2525
andi_(tmp, tmp, count-1);
2526
bne(CCR0, brLabel);
2527
}
2528
2529
// Perform abort ratio calculation, set no_rtm bit if high ratio.
2530
// input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2531
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2532
RTMLockingCounters* rtm_counters,
2533
Metadata* method_data) {
2534
Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2535
2536
if (RTMLockingCalculationDelay > 0) {
2537
// Delay calculation.
2538
ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2539
cmpdi(CCR0, rtm_counters_Reg, 0);
2540
beq(CCR0, L_done);
2541
load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2542
}
2543
// Abort ratio calculation only if abort_count > RTMAbortThreshold.
2544
// Aborted transactions = abort_count * 100
2545
// All transactions = total_count * RTMTotalCountIncrRate
2546
// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
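// Illustrative numbers (not the flag defaults): with RTMTotalCountIncrRate == 1 and
// RTMAbortRatio == 50, no_rtm gets set once abort_count * 100 >= total_count * 50,
// i.e. once at least half of the sampled transactions aborted.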
2547
ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2548
if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only.
2549
cmpdi(CCR0, R0, RTMAbortThreshold);
2550
blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary
2551
} else {
2552
load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2553
cmpd(CCR0, R0, rtm_counters_Reg);
2554
blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required
2555
}
2556
mulli(R0, R0, 100);
2557
2558
const Register tmpReg = rtm_counters_Reg;
2559
ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2560
mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2561
mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16
2562
cmpd(CCR0, R0, tmpReg);
2563
blt(CCR0, L_check_always_rtm1); // jump to reload
2564
if (method_data != NULL) {
2565
// Set rtm_state to "no rtm" in MDO.
2566
// Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2567
// (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2568
load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2569
atomic_ori_int(R0, tmpReg, NoRTM);
2570
}
2571
b(L_done);
2572
2573
bind(L_check_always_rtm1);
2574
load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2575
bind(L_check_always_rtm2);
2576
ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2577
int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2578
if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only.
2579
cmpdi(CCR0, tmpReg, thresholdValue);
2580
} else {
2581
load_const_optimized(R0, thresholdValue);
2582
cmpd(CCR0, tmpReg, R0);
2583
}
2584
blt(CCR0, L_done);
2585
if (method_data != NULL) {
2586
// Set rtm_state to "always rtm" in MDO.
2587
// Not using a metadata relocation. See above.
2588
load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2589
atomic_ori_int(R0, tmpReg, UseRTM);
2590
}
2591
bind(L_done);
2592
}
2593
2594
// Update counters and perform abort ratio calculation.
2595
// input: abort_status_Reg
2596
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2597
RTMLockingCounters* rtm_counters,
2598
Metadata* method_data,
2599
bool profile_rtm) {
2600
2601
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2602
// Update rtm counters based on state at abort.
2603
// Reads abort_status_Reg, updates flags.
2604
assert_different_registers(abort_status_Reg, temp_Reg);
2605
load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2606
rtm_counters_update(abort_status_Reg, temp_Reg);
2607
if (profile_rtm) {
2608
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2609
rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2610
}
2611
}
2612
2613
// Retry on abort if abort's status indicates non-persistent failure.
2614
// inputs: retry_count_Reg
2615
// : abort_status_Reg
2616
// output: retry_count_Reg decremented by 1
2617
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2618
Label& retryLabel, Label* checkRetry) {
2619
Label doneRetry;
2620
2621
// Don't retry if failure is persistent.
2622
// The persistent bit is set when a (A) Disallowed operation is performed in
2623
// transactional state, like for instance trying to write the TFHAR after a
2624
// transaction is started; or when there is (B) a Nesting Overflow (too many
2625
// nested transactions); or when (C) the Footprint overflows (too many
2626
// addresses touched in TM state so there is no more space in the footprint
2627
// area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2628
// store is performed to a given address in TM state, then once in suspended
2629
// state the same address is accessed. Failure (A) is very unlikely to occur
2630
// in the JVM. Failure (D) will never occur because Suspended state is never
2631
// used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2632
// Overflow will set the persistent bit.
2633
rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2634
bne(CCR0, doneRetry);
2635
2636
// Don't retry if transaction was deliberately aborted, i.e. caused by a
2637
// tabort instruction.
2638
rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2639
bne(CCR0, doneRetry);
2640
2641
// Retry if transaction aborted due to a conflict with another thread.
2642
if (checkRetry) { bind(*checkRetry); }
2643
addic_(retry_count_Reg, retry_count_Reg, -1);
2644
blt(CCR0, doneRetry);
2645
b(retryLabel);
2646
bind(doneRetry);
2647
}
2648
2649
// Spin and retry if lock is busy.
2650
// inputs: owner_addr_Reg (monitor address)
2651
// : retry_count_Reg
2652
// output: retry_count_Reg decremented by 1
2653
// CTR is killed
2654
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2655
Label SpinLoop, doneRetry, doRetry;
2656
addic_(retry_count_Reg, retry_count_Reg, -1);
2657
blt(CCR0, doneRetry);
2658
2659
if (RTMSpinLoopCount > 1) {
2660
li(R0, RTMSpinLoopCount);
2661
mtctr(R0);
2662
}
2663
2664
// low thread priority
2665
smt_prio_low();
2666
bind(SpinLoop);
2667
2668
if (RTMSpinLoopCount > 1) {
2669
bdz(doRetry);
2670
ld(R0, 0, owner_addr_Reg);
2671
cmpdi(CCR0, R0, 0);
2672
bne(CCR0, SpinLoop);
2673
}
2674
2675
bind(doRetry);
2676
2677
// restore thread priority to default in userspace
2678
#ifdef LINUX
2679
smt_prio_medium_low();
2680
#else
2681
smt_prio_medium();
2682
#endif
2683
2684
b(retryLabel);
2685
2686
bind(doneRetry);
2687
}
2688
2689
// Use RTM for normal stack locks.
2690
// Input: objReg (object to lock)
2691
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2692
Register obj, Register mark_word, Register tmp,
2693
Register retry_on_abort_count_Reg,
2694
RTMLockingCounters* stack_rtm_counters,
2695
Metadata* method_data, bool profile_rtm,
2696
Label& DONE_LABEL, Label& IsInflated) {
2697
assert(UseRTMForStackLocks, "why call this otherwise?");
2698
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2699
Label L_rtm_retry, L_decrement_retry, L_on_abort;
2700
2701
if (RTMRetryCount > 0) {
2702
load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2703
bind(L_rtm_retry);
2704
}
2705
andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
2706
bne(CCR0, IsInflated);
2707
2708
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2709
Label L_noincrement;
2710
if (RTMTotalCountIncrRate > 1) {
2711
branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2712
}
2713
assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2714
load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2715
//atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2716
ldx(mark_word, tmp);
2717
addi(mark_word, mark_word, 1);
2718
stdx(mark_word, tmp);
2719
bind(L_noincrement);
2720
}
2721
tbegin_();
2722
beq(CCR0, L_on_abort);
2723
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.
2724
andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2725
cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked
2726
beq(flag, DONE_LABEL); // all done if unlocked
2727
2728
if (UseRTMXendForLockBusy) {
2729
tend_();
2730
b(L_decrement_retry);
2731
} else {
2732
tabort_();
2733
}
2734
bind(L_on_abort);
2735
const Register abort_status_Reg = tmp;
2736
mftexasr(abort_status_Reg);
2737
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2738
rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2739
}
2740
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2741
if (RTMRetryCount > 0) {
2742
// Retry on lock abort if abort status is not permanent.
2743
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2744
} else {
2745
bind(L_decrement_retry);
2746
}
2747
}
2748
2749
// Use RTM for inflating locks
2750
// inputs: obj (object to lock)
2751
// mark_word (current header - KILLED)
2752
// boxReg (on-stack box address (displaced header location) - KILLED)
2753
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2754
Register obj, Register mark_word, Register boxReg,
2755
Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2756
RTMLockingCounters* rtm_counters,
2757
Metadata* method_data, bool profile_rtm,
2758
Label& DONE_LABEL) {
2759
assert(UseRTMLocking, "why call this otherwise?");
2760
Label L_rtm_retry, L_decrement_retry, L_on_abort;
2761
// Clean monitor_value bit to get valid pointer.
2762
int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
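// (mark_word of an inflated object is the ObjectMonitor* with the monitor_value tag
//  set; folding -monitor_value into the offset lets the addi below produce the
//  untagged address of the _owner field without a separate clear.)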
2763
2764
// Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2765
std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2766
const Register tmpReg = boxReg;
2767
const Register owner_addr_Reg = mark_word;
2768
addi(owner_addr_Reg, mark_word, owner_offset);
2769
2770
if (RTMRetryCount > 0) {
2771
load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.
2772
load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2773
bind(L_rtm_retry);
2774
}
2775
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2776
Label L_noincrement;
2777
if (RTMTotalCountIncrRate > 1) {
2778
branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2779
}
2780
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2781
load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2782
//atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2783
ldx(tmpReg, R0);
2784
addi(tmpReg, tmpReg, 1);
2785
stdx(tmpReg, R0);
2786
bind(L_noincrement);
2787
}
2788
tbegin_();
2789
beq(CCR0, L_on_abort);
2790
// We don't reload mark word. Will only be reset at safepoint.
2791
ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2792
cmpdi(flag, R0, 0);
2793
beq(flag, DONE_LABEL);
2794
2795
if (UseRTMXendForLockBusy) {
2796
tend_();
2797
b(L_decrement_retry);
2798
} else {
2799
tabort_();
2800
}
2801
bind(L_on_abort);
2802
const Register abort_status_Reg = tmpReg;
2803
mftexasr(abort_status_Reg);
2804
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2805
rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2806
// Restore owner_addr_Reg
2807
ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2808
#ifdef ASSERT
2809
andi_(R0, mark_word, markWord::monitor_value);
2810
asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2811
#endif
2812
addi(owner_addr_Reg, mark_word, owner_offset);
2813
}
2814
if (RTMRetryCount > 0) {
2815
// Retry on lock abort if abort status is not permanent.
2816
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2817
}
2818
2819
// Appears unlocked - try to swing _owner from null to non-null.
2820
cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2821
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2822
MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2823
2824
if (RTMRetryCount > 0) {
2825
// success done else retry
2826
b(DONE_LABEL);
2827
bind(L_decrement_retry);
2828
// Spin and retry if lock is busy.
2829
rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2830
} else {
2831
bind(L_decrement_retry);
2832
}
2833
}
2834
2835
#endif // INCLUDE_RTM_OPT
2836
2837
// "The box" is the space on the stack where we copy the object mark.
2838
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2839
Register temp, Register displaced_header, Register current_header,
2840
bool try_bias,
2841
RTMLockingCounters* rtm_counters,
2842
RTMLockingCounters* stack_rtm_counters,
2843
Metadata* method_data,
2844
bool use_rtm, bool profile_rtm) {
2845
assert_different_registers(oop, box, temp, displaced_header, current_header);
2846
assert(flag != CCR0, "bad condition register");
2847
Label cont;
2848
Label object_has_monitor;
2849
Label cas_failed;
2850
2851
// Load markWord from object into displaced_header.
2852
ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2853
2854
if (DiagnoseSyncOnValueBasedClasses != 0) {
2855
load_klass(temp, oop);
2856
lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2857
testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2858
bne(flag, cont);
2859
}
2860
2861
if (try_bias) {
2862
biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2863
}
2864
2865
#if INCLUDE_RTM_OPT
2866
if (UseRTMForStackLocks && use_rtm) {
2867
rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2868
stack_rtm_counters, method_data, profile_rtm,
2869
cont, object_has_monitor);
2870
}
2871
#endif // INCLUDE_RTM_OPT
2872
2873
// Handle existing monitor.
2874
// The object has an existing monitor iff (mark & monitor_value) != 0.
2875
andi_(temp, displaced_header, markWord::monitor_value);
2876
bne(CCR0, object_has_monitor);
2877
2878
// Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2879
ori(displaced_header, displaced_header, markWord::unlocked_value);
2880
2881
// Load Compare Value application register.
2882
2883
// Initialize the box. (Must happen before we update the object mark!)
2884
std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2885
2886
// Must fence, otherwise, preceding store(s) may float below cmpxchg.
2887
// Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2888
cmpxchgd(/*flag=*/flag,
2889
/*current_value=*/current_header,
2890
/*compare_value=*/displaced_header,
2891
/*exchange_value=*/box,
2892
/*where=*/oop,
2893
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2894
MacroAssembler::cmpxchgx_hint_acquire_lock(),
2895
noreg,
2896
&cas_failed,
2897
/*check without membar and ldarx first*/true);
2898
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2899
2900
// If the compare-and-exchange succeeded, then we found an unlocked
2901
// object and we have now locked it.
2902
b(cont);
2903
2904
bind(cas_failed);
2905
// We did not see an unlocked object so try the fast recursive case.
2906
2907
// Check if the owner is self by comparing the value in the markWord of object
2908
// (current_header) with the stack pointer.
2909
sub(current_header, current_header, R1_SP);
2910
load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2911
2912
and_(R0/*==0?*/, current_header, temp);
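// R0 is zero iff (mark - SP) fits within one page and the low lock bits are clear,
// i.e. the displaced mark is a BasicLock address in the current thread's own frame:
// a recursive stack lock, recorded by storing 0 into the box below.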
2913
// If the condition is true we are done (flag is EQ) and hence we can store 0 as the
2914
// displaced header in the box, which indicates that it is a recursive lock.
2915
mcrf(flag, CCR0);
2916
std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2917
2918
// Handle existing monitor.
2919
b(cont);
2920
2921
bind(object_has_monitor);
2922
// The object's monitor m is unlocked iff m->owner == NULL,
2923
// otherwise m->owner may contain a thread or a stack address.
2924
2925
#if INCLUDE_RTM_OPT
2926
// Use the same RTM locking code in 32- and 64-bit VM.
2927
if (use_rtm) {
2928
rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2929
rtm_counters, method_data, profile_rtm, cont);
2930
} else {
2931
#endif // INCLUDE_RTM_OPT
2932
2933
// Try to CAS m->owner from NULL to current thread.
2934
addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2935
cmpxchgd(/*flag=*/flag,
2936
/*current_value=*/current_header,
2937
/*compare_value=*/(intptr_t)0,
2938
/*exchange_value=*/R16_thread,
2939
/*where=*/temp,
2940
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2941
MacroAssembler::cmpxchgx_hint_acquire_lock());
2942
2943
// Store a non-null value into the box.
2944
std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2945
2946
# ifdef ASSERT
2947
bne(flag, cont);
2948
// We have acquired the monitor, check some invariants.
2949
addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2950
// Invariant 1: _recursions should be 0.
2951
//assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2952
asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2953
"monitor->_recursions should be 0");
2954
# endif
2955
2956
#if INCLUDE_RTM_OPT
2957
} // use_rtm()
2958
#endif
2959
2960
bind(cont);
2961
// flag == EQ indicates success
2962
// flag == NE indicates failure
2963
}
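
// Illustrative sketch, not emitted code: the stack-lock fast path generated above is
// roughly equivalent to the following pseudo-C (names are those used in this function):
//   markWord mark = obj->mark() | unlocked_value;
//   box->displaced_header = mark;
//   if (CAS(&obj->mark, mark, box)) goto cont;                       // locked it
//   if (((obj->mark() - SP) & (~(page_size - 1) | lock_mask)) == 0) { // mark points into our stack
//     box->displaced_header = 0;                                     // recursive stack lock
//     goto cont;
//   }
//   // otherwise fall through to the inflated-monitor / RTM path above.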

void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
Register temp, Register displaced_header, Register current_header,
bool try_bias, bool use_rtm) {
assert_different_registers(oop, box, temp, displaced_header, current_header);
assert(flag != CCR0, "bad condition register");
Label cont;
Label object_has_monitor;

if (try_bias) {
biased_locking_exit(flag, oop, current_header, cont);
}

#if INCLUDE_RTM_OPT
if (UseRTMForStackLocks && use_rtm) {
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
Label L_regular_unlock;
ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked
bne(flag, L_regular_unlock); // else RegularLock
tend_(); // otherwise end...
b(cont); // ... and we're done
bind(L_regular_unlock);
}
#endif

// Find the lock address and load the displaced header from the stack.
ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

// If the displaced header is 0, we have a recursive unlock.
cmpdi(flag, displaced_header, 0);
beq(flag, cont);

// Handle existing monitor.
// The object has an existing monitor iff (mark & monitor_value) != 0.
RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
andi_(R0, current_header, markWord::monitor_value);
bne(CCR0, object_has_monitor);

// Check if it is still a lightweight lock. This is true if we see
// the stack address of the basicLock in the markWord of the object.
// Cmpxchg sets flag to cmpd(current_header, box).
cmpxchgd(/*flag=*/flag,
/*current_value=*/current_header,
/*compare_value=*/box,
/*exchange_value=*/displaced_header,
/*where=*/oop,
MacroAssembler::MemBarRel,
MacroAssembler::cmpxchgx_hint_release_lock(),
noreg,
&cont);

assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

// Handle existing monitor.
b(cont);

bind(object_has_monitor);
STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

// It's inflated.
#if INCLUDE_RTM_OPT
if (use_rtm) {
Label L_regular_inflated_unlock;
// Clean monitor_value bit to get valid pointer
cmpdi(flag, temp, 0);
bne(flag, L_regular_inflated_unlock);
tend_();
b(cont);
bind(L_regular_inflated_unlock);
}
#endif

ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.
orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
cmpdi(flag, temp, 0);
bne(flag, cont);

ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
orr(temp, temp, displaced_header); // Will be 0 if both are 0.
cmpdi(flag, temp, 0);
bne(flag, cont);
release();
std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

bind(cont);
// flag == EQ indicates success
// flag == NE indicates failure
}
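
// Hedged sketch of the unlock decision flow generated above (pseudo-C, names illustrative):
//   if (box->displaced_header == 0) goto cont;                          // recursive stack lock
//   if (CAS(&obj->mark, box, box->displaced_header)) goto cont;         // plain stack lock released
//   // inflated: only release if we own it, recursions == 0, and no waiters are queued
//   if (m->owner == self && m->recursions == 0 && m->EntryList == NULL && m->cxq == NULL) {
//     release_store(&m->owner, NULL);                                   // hand the monitor back
//   }
//   // flag carries EQ on success, NE sends the caller to the slow path.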

void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);

if (at_return) {
if (in_nmethod) {
if (UseSIGTRAP) {
// Use Signal Handler.
relocate(relocInfo::poll_return_type);
td(traptoGreaterThanUnsigned, R1_SP, temp);
} else {
cmpld(CCR0, R1_SP, temp);
// Stub may be out of range for short conditional branch.
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
}
} else { // Not in nmethod.
// Frame still on stack, need to get fp.
Register fp = R0;
ld(fp, _abi0(callers_sp), R1_SP);
cmpld(CCR0, fp, temp);
bgt(CCR0, slow_path);
}
} else { // Normal safepoint poll. Not at return.
assert(!in_nmethod, "should use load_from_polling_page");
andi_(temp, temp, SafepointMechanism::poll_bit());
bne(CCR0, slow_path);
}
}
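
// Minimal sketch of the poll semantics implemented above (assumes the layout used here):
//   word = thread->polling_word;
//   at_return:  if (SP_or_FP > word) goto slow_path;   // armed word is low => comparison trips
//   otherwise:  if (word & poll_bit)  goto slow_path;  // per-thread poll bit set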

void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
MacroAssembler::PreservationLevel preservation_level) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
}

// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
// Always set last_Java_pc and flags first because once last_Java_sp
// is visible has_last_Java_frame is true and users will look at the
// rest of the fields. (Note: flags should always be zero before we
// get here so doesn't need to be set.)

// Verify that last_Java_pc was zeroed on return to Java
asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
"last_Java_pc not zeroed before leaving Java");

// When returning from calling out from Java mode the frame anchor's
// last_Java_pc will always be set to NULL. It is set here so that
// if we are doing a call to native (not VM) that we capture the
// known pc and don't have to rely on the native call having a
// standard frame linkage where we can find the pc.
if (last_Java_pc != noreg)
std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

// Set last_Java_sp last.
std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}

void MacroAssembler::reset_last_Java_frame(void) {
asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
R16_thread, "SP was not set, still zero");

BLOCK_COMMENT("reset_last_Java_frame {");
li(R0, 0);

// _last_Java_sp = 0
std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

// _last_Java_pc = 0
std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
BLOCK_COMMENT("} reset_last_Java_frame");
}

void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
assert_different_registers(sp, tmp1);

// sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
// TOP_IJAVA_FRAME_ABI.
// FIXME: assert that we really have a TOP_IJAVA_FRAME here!
address entry = pc();
load_const_optimized(tmp1, entry);

set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}

void MacroAssembler::get_vm_result(Register oop_result) {
// Read:
// R16_thread
// R16_thread->in_bytes(JavaThread::vm_result_offset())
//
// Updated:
// oop_result
// R16_thread->in_bytes(JavaThread::vm_result_offset())

verify_thread();

ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
li(R0, 0);
std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

verify_oop(oop_result, FILE_AND_LINE);
}

void MacroAssembler::get_vm_result_2(Register metadata_result) {
// Read:
// R16_thread
// R16_thread->in_bytes(JavaThread::vm_result_2_offset())
//
// Updated:
// metadata_result
// R16_thread->in_bytes(JavaThread::vm_result_2_offset())

ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
li(R0, 0);
std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}

Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
if (CompressedKlassPointers::base() != 0) {
// Use dst as temp if it is free.
sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
current = dst;
}
if (CompressedKlassPointers::shift() != 0) {
srdi(dst, current, CompressedKlassPointers::shift());
current = dst;
}
return current;
}
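
// The compression applied above is, in effect (illustrative sketch, not emitted code):
//   narrow_klass = (uintptr_t(klass) - CompressedKlassPointers::base())
//                  >> CompressedKlassPointers::shift();
// decode_klass_not_null() further down performs the inverse: (narrow << shift) + base.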

void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
if (UseCompressedClassPointers) {
Register compressedKlass = encode_klass_not_null(ck, klass);
stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
} else {
std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
}
}

void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
if (UseCompressedClassPointers) {
if (val == noreg) {
val = R0;
li(val, 0);
}
stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
}
}

int MacroAssembler::instr_size_for_decode_klass_not_null() {
static int computed_size = -1;

// Not yet computed?
if (computed_size == -1) {

if (!UseCompressedClassPointers) {
computed_size = 0;
} else {
// Determine by scratch emit.
ResourceMark rm;
int code_size = 8 * BytesPerInstWord;
CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
MacroAssembler* a = new MacroAssembler(&cb);
a->decode_klass_not_null(R11_scratch1);
computed_size = a->offset();
}
}

return computed_size;
}

void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
if (src == noreg) src = dst;
Register shifted_src = src;
if (CompressedKlassPointers::shift() != 0 ||
CompressedKlassPointers::base() == 0 && src != dst) { // Move required.
shifted_src = dst;
sldi(shifted_src, src, CompressedKlassPointers::shift());
}
if (CompressedKlassPointers::base() != 0) {
add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
}
}

void MacroAssembler::load_klass(Register dst, Register src) {
if (UseCompressedClassPointers) {
lwz(dst, oopDesc::klass_offset_in_bytes(), src);
// Attention: no null check here!
decode_klass_not_null(dst, dst);
} else {
ld(dst, oopDesc::klass_offset_in_bytes(), src);
}
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
MacroAssembler::PreservationLevel preservation_level) {
access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
}

void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
MacroAssembler::PreservationLevel preservation_level) {
Label resolved;

// A null weak handle resolves to null.
cmpdi(CCR0, result, 0);
beq(CCR0, resolved);

access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
preservation_level);
bind(resolved);
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
ld(holder, in_bytes(Method::const_offset()), method);
ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
}

// Clear Array
// For very short arrays. tmp == R0 is allowed.
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
if (cnt_dwords > 0) { li(tmp, 0); }
for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
}

// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
if (cnt_dwords < 8) {
clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
return;
}

Label loop;
const long loopcnt = cnt_dwords >> 1,
remainder = cnt_dwords & 1;

li(tmp, loopcnt);
mtctr(tmp);
li(tmp, 0);
bind(loop);
std(tmp, 0, base_ptr);
std(tmp, 8, base_ptr);
addi(base_ptr, base_ptr, 16);
bdnz(loop);
if (remainder) { std(tmp, 0, base_ptr); }
}

// Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
// Procedure for large arrays (uses data cache block zero instruction).
Label startloop, fast, fastloop, small_rest, restloop, done;
const int cl_size = VM_Version::L1_data_cache_line_size(),
cl_dwords = cl_size >> 3,
cl_dw_addr_bits = exact_log2(cl_dwords),
dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

if (const_cnt >= 0) {
// Constant case.
if (const_cnt < min_cnt) {
clear_memory_constlen(base_ptr, const_cnt, tmp);
return;
}
load_const_optimized(cnt_dwords, const_cnt, tmp);
} else {
// cnt_dwords already loaded in register. Need to check size.
cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
blt(CCR1, small_rest);
}
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
beq(CCR0, fast); // Already 128byte aligned.

subfic(tmp, tmp, cl_dwords);
mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
subf(cnt_dwords, tmp, cnt_dwords); // rest.
li(tmp, 0);

bind(startloop); // Clear at the beginning to reach 128byte boundary.
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
addi(base_ptr, base_ptr, 8);
bdnz(startloop);

bind(fast); // Clear 128byte blocks.
srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
mtctr(tmp); // Load counter.

bind(fastloop);
dcbz(base_ptr); // Clear 128byte aligned block.
addi(base_ptr, base_ptr, cl_size);
bdnz(fastloop);

bind(small_rest);
cmpdi(CCR0, cnt_dwords, 0); // size 0?
beq(CCR0, done); // rest == 0
li(tmp, 0);
mtctr(cnt_dwords); // Load counter.

bind(restloop); // Clear rest.
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
addi(base_ptr, base_ptr, 8);
bdnz(restloop);

bind(done);
}
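
// Rough structure of the routine above (illustrative pseudo-C, not emitted code):
//   while (base not cache-line aligned) { *base++ = 0; cnt--; }          // startloop
//   for (i = 0; i < cnt / cl_dwords; i++) { dcbz(base); base += cl_size; } // fastloop
//   while (cnt % cl_dwords)             { *base++ = 0; }                 // restloop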

/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////

// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
// src: 0x44556677
// dst: 0x77665544
// Three steps to obtain the result:
// 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
// into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
// This value initializes dst.
// 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
// byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
// This value is mask inserted into dst with a [0..23] mask of 1s.
// 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
// This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
assert_different_registers(dst, src);

rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
}

// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");

// Point to 4 byte folding tables (byte-reversed version for Big Endian)
// Layout: See StubRoutines::ppc::generate_crc_constants.
#ifdef VM_LITTLE_ENDIAN
const int ix0 = 3 * CRC32_TABLE_SIZE;
const int ix1 = 2 * CRC32_TABLE_SIZE;
const int ix2 = 1 * CRC32_TABLE_SIZE;
const int ix3 = 0 * CRC32_TABLE_SIZE;
#else
const int ix0 = 1 * CRC32_TABLE_SIZE;
const int ix1 = 2 * CRC32_TABLE_SIZE;
const int ix2 = 3 * CRC32_TABLE_SIZE;
const int ix3 = 4 * CRC32_TABLE_SIZE;
#endif
assert_different_registers(table, tc0, tc1, tc2);
assert(table == tc3, "must be!");

addi(tc0, table, ix0);
addi(tc1, table, ix1);
addi(tc2, table, ix2);
if (ix3 != 0) addi(tc3, table, ix3);

return ix3;
}

/**
* uint32_t crc;
* table[crc & 0xFF] ^ (crc >> 8);
*/
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
assert_different_registers(crc, table, tmp);
assert_different_registers(val, table);

if (crc == val) { // Must rotate first to use the unmodified value.
rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
// As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
} else {
srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
}
lwzx(tmp, table, tmp);
xorr(crc, crc, tmp);
}

/**
* Emits code to update CRC-32 with a byte value according to constants in table.
*
* @param [in,out]crc Register containing the crc.
* @param [in]val Register containing the byte to fold into the CRC.
* @param [in]table Register containing the table of crc constants.
*
* uint32_t crc;
* val = crc_table[(val ^ crc) & 0xFF];
* crc = val ^ (crc >> 8);
*/
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
BLOCK_COMMENT("update_byte_crc32:");
xorr(val, val, crc);
fold_byte_crc32(crc, val, table, val);
}
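
// For reference, the byte-at-a-time update emitted above corresponds to the classic
// table-driven C loop (sketch; 'crc_table' stands for the single table column used here):
//   crc = crc_table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);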

/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
* @param table register pointing to CRC table
*/
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
Register data, bool loopAlignment) {
assert_different_registers(crc, buf, len, table, data);

Label L_mainLoop, L_done;
const int mainLoop_stepping = 1;
const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

// Process all bytes in a single-byte loop.
clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
beq(CCR0, L_done);

mtctr(len);
align(mainLoop_alignment);
BIND(L_mainLoop);
lbz(data, 0, buf); // Byte from buffer, zero-extended.
addi(buf, buf, mainLoop_stepping); // Advance buffer position.
update_byte_crc32(crc, data, table);
bdnz(L_mainLoop); // Iterate.

bind(L_done);
}

/**
* Emits code to update CRC-32 with a 4-byte value according to constants in table
* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
*/
// A note on the lookup table address(es):
// The implementation uses 4 table columns (byte-reversed versions for Big Endian).
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3) {
assert_different_registers(crc, t3);

// XOR crc with next four bytes of buffer.
lwz(t3, bufDisp, buf);
if (bufInc != 0) {
addi(buf, buf, bufInc);
}
xorr(t3, t3, crc);

// Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2
rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2
rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2

// Use the pre-calculated column addresses.
// Load pre-calculated table values.
lwzx(t0, tc0, t0);
lwzx(t1, tc1, t1);
lwzx(t2, tc2, t2);
lwzx(t3, tc3, t3);

// Calculate new crc from table values.
xorr(t0, t0, t1);
xorr(t2, t2, t3);
xorr(crc, t0, t2); // Now crc contains the final checksum value.
}
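
// This is the "slicing-by-4" scheme; per 4-byte word the update above is roughly (sketch):
//   w   = crc ^ load_32(buf);
//   crc = tc0[w & 0xff] ^ tc1[(w >> 8) & 0xff] ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];
// where tc0..tc3 address the four (endianness-dependent) table columns set up in
// crc32_table_columns() above.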

/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
* @param table register pointing to CRC table
*
* Uses R9..R12 as work registers. Must be saved/restored by caller!
*/
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3,
bool invertCRC) {
assert_different_registers(crc, buf, len, table);

Label L_mainLoop, L_tail;
Register tmp = t0;
Register data = t0;
Register tmp2 = t1;
const int mainLoop_stepping = 4;
const int tailLoop_stepping = 1;
const int log_stepping = exact_log2(mainLoop_stepping);
const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
const int complexThreshold = 2*mainLoop_stepping;

// Don't test for len <= 0 here. This pathological case should not occur anyway.
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
// for all well-behaved cases. The situation itself is detected and handled correctly
// within update_byteLoop_crc32.
assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

BLOCK_COMMENT("kernel_crc32_1word {");

if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}

// Check for short (<mainLoop_stepping) buffer.
cmpdi(CCR0, len, complexThreshold);
blt(CCR0, L_tail);

// Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
// We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
{
// Align buf addr to mainLoop_stepping boundary.
neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

if (complexThreshold > mainLoop_stepping) {
sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
} else {
sub(tmp, len, tmp2); // Remaining bytes for main loop.
cmpdi(CCR0, tmp, mainLoop_stepping);
blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
}
update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
}

srdi(tmp2, len, log_stepping); // #iterations for mainLoop
andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
Register crc_rv = crc;
#else
Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
// Occupies tmp, but frees up crc.
load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
tmp = crc;
#endif

int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
BIND(L_mainLoop);
update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
tmp = crc_rv; // Tmp uses its original register again.
#endif

// Restore original table address for tailLoop.
if (reconstructTableOffset != 0) {
addi(table, table, -reconstructTableOffset);
}

// Process last few (<complexThreshold) bytes of buffer.
BIND(L_tail);
update_byteLoop_crc32(crc, buf, len, table, data, false);

if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes
* @param constants register pointing to precomputed constants
* @param t0-t6 temp registers
*/
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
Register t0, Register t1, Register t2, Register t3,
Register t4, Register t5, Register t6, bool invertCRC) {
assert_different_registers(crc, buf, len, constants);

Label L_tail;

BLOCK_COMMENT("kernel_crc32_vpmsum {");

if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}

// Enforce 32 bit.
clrldi(len, len, 32);

// Align if we have enough bytes for the fast version.
const int alignment = 16,
threshold = 32;
Register prealign = t0;

neg(prealign, buf);
addi(t1, len, -threshold);
andi(prealign, prealign, alignment - 1);
cmpw(CCR0, t1, prealign);
blt(CCR0, L_tail); // len - prealign < threshold?

subf(len, prealign, len);
update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);

// Calculate from first aligned address as far as possible.
addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.

// Remaining bytes.
BIND(L_tail);
update_byteLoop_crc32(crc, buf, len, constants, t2, false);

if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}

BLOCK_COMMENT("} kernel_crc32_vpmsum");
}

/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param t0-t6 temp registers
*/
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {

// Save non-volatile vector registers (frameless).
Register offset = t1;
int offsetInt = 0;
offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; std(R14, offsetInt, R1_SP);
offsetInt -= 8; std(R15, offsetInt, R1_SP);

// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
// bytes per iteration. The basic scheme is:
// lvx: load vector (Big Endian needs reversal)
// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
// vxor: xor partial results together to get unroll_factor2 vectors

// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
const int unroll_factor = CRC32_UNROLL_FACTOR,
unroll_factor2 = CRC32_UNROLL_FACTOR2;

const int outer_consts_size = (unroll_factor2 - 1) * 16,
inner_consts_size = (unroll_factor / unroll_factor2) * 16;

// Support registers.
Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
Register num_bytes = R14,
loop_count = R15,
cur_const = crc; // will live in VCRC
// Constant array for outer loop: unroll_factor2 - 1 registers,
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
consts1[] = { VR23, VR24 };
// Data register arrays: 2 arrays with unroll_factor2 registers.
VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };

VectorRegister VCRC = data0[0];
VectorRegister Vc = VR25;
VectorRegister swap_bytes = VR26; // Only for Big Endian.

// We have at least 1 iteration (ensured by caller).
Label L_outer_loop, L_inner_loop, L_last;

// If supported set DSCR pre-fetch to deepest.
if (VM_Version::has_mfdscr()) {
load_const_optimized(t0, VM_Version::_dscr_val | 7);
mtdscr(t0);
}

mtvrwz(VCRC, crc); // crc lives in VCRC, now

for (int i = 1; i < unroll_factor2; ++i) {
li(offs[i], 16 * i);
}

// Load consts for outer loop
lvx(consts0[0], constants);
for (int i = 1; i < unroll_factor2 - 1; ++i) {
lvx(consts0[i], offs[i], constants);
}

load_const_optimized(num_bytes, 16 * unroll_factor);

// Reuse data registers outside of the loop.
VectorRegister Vtmp = data1[0];
VectorRegister Vtmp2 = data1[1];
VectorRegister zeroes = data1[2];

vspltisb(Vtmp, 0);
vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.

// Load vector for vpermxor (to xor both 64 bit parts together)
lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
vspltisb(Vc, 4);
vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f

#ifdef VM_LITTLE_ENDIAN
#define BE_swap_bytes(x)
#else
vspltisb(Vtmp2, 0xf);
vxor(swap_bytes, Vtmp, Vtmp2);
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
#endif

cmpd(CCR0, len, num_bytes);
blt(CCR0, L_last);

addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

// ********** Main loop start **********
align(32);
bind(L_outer_loop);

// Begin of unrolled first iteration (no xor).
lvx(data1[0], buf);
for (int i = 1; i < unroll_factor2 / 2; ++i) {
lvx(data1[i], offs[i], buf);
}
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
lvx(consts1[0], cur_const);
mtctr(loop_count);
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i]);
if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
vpmsumw(data0[i], data1[i], consts1[0]);
}
addi(buf, buf, 16 * unroll_factor2);
subf(len, num_bytes, len);
lvx(consts1[1], offs[1], cur_const);
addi(cur_const, cur_const, 32);
// Begin of unrolled second iteration (head).
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
}
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i]);
lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
vpmsumw(data1[i], data1[i], consts1[1]);
}
addi(buf, buf, 16 * unroll_factor2);

// Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
// Double-iteration allows using the 2 constant registers alternatingly.
align(32);
bind(L_inner_loop);
for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
if (j & 1) {
lvx(consts1[0], cur_const);
} else {
lvx(consts1[1], offs[1], cur_const);
addi(cur_const, cur_const, 32);
}
for (int i = 0; i < unroll_factor2; ++i) {
int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
BE_swap_bytes(data1[idx]);
vxor(data0[i], data0[i], data1[i]);
if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
}
addi(buf, buf, 16 * unroll_factor2);
}
bdnz(L_inner_loop);

addi(cur_const, constants, outer_consts_size); // Reset

// Tail of last iteration (no loads).
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
vxor(data0[i], data0[i], data1[i]);
vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
}
for (int i = 0; i < unroll_factor2 / 2; ++i) {
vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
}

// Last data register is ok, other ones need fixup shift.
for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
}

// Combine to 128 bit result vector VCRC = data0[0].
for (int i = 1; i < unroll_factor2; i<<=1) {
for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
vxor(data0[j], data0[j], data0[j+i]);
}
}
cmpd(CCR0, len, num_bytes);
bge(CCR0, L_outer_loop);

// Last chance with lower num_bytes.
bind(L_last);
srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
// Point behind last const for inner loop.
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
subf(cur_const, R0, cur_const); // Point to constant to be used first.

addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
bgt(CCR0, L_outer_loop);
// ********** Main loop end **********

// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
load_const_optimized(t0, VM_Version::_dscr_val);
mtdscr(t0);
}

// ********** Simple loop for remaining 16 byte blocks **********
{
Label L_loop, L_done;

srdi_(t0, len, 4); // 16 bytes per iteration
clrldi(len, len, 64-4);
beq(CCR0, L_done);

// Point to const (same as last const for inner loop).
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
mtctr(t0);
lvx(Vtmp2, cur_const);

align(32);
bind(L_loop);

lvx(Vtmp, buf);
addi(buf, buf, 16);
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
BE_swap_bytes(Vtmp);
vxor(VCRC, VCRC, Vtmp);
vpmsumw(VCRC, VCRC, Vtmp2);
bdnz(L_loop);

bind(L_done);
}
// ********** Simple loop end **********
#undef BE_swap_bytes

// Point to Barrett constants
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);

vspltisb(zeroes, 0);

// Combine to 64 bit result.
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

// Reduce to 32 bit CRC: Remainder by multiply-high.
lvx(Vtmp, cur_const);
vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
vsldoi(Vtmp, zeroes, Vtmp, 8);
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.

// Move result. len is already updated.
vsldoi(VCRC, VCRC, zeroes, 8);
mfvrd(crc, VCRC);

// Restore non-volatile Vector registers (frameless).
offsetInt = 0;
offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
}

void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
: StubRoutines::crc_table_addr() , R0);

if (VM_Version::has_vpmsumb()) {
kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
} else {
kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
}
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
assert_different_registers(crc, val, table);

BLOCK_COMMENT("kernel_crc32_singleByteReg:");
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}

update_byte_crc32(crc, val, table);

if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
Register dest_lo,
Register src1, Register src2) {
li(R0, 0);
addc(dest_lo, dest_lo, src1);
adde(dest_hi, dest_hi, R0);
addc(dest_lo, dest_lo, src2);
adde(dest_hi, dest_hi, R0);
}
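
// Semantics, sketched: treat (dest_hi:dest_lo) as a 128-bit accumulator and add the two
// 64-bit operands, propagating each carry into the high word:
//   (dest_hi:dest_lo) += (0:src1);
//   (dest_hi:dest_lo) += (0:src2);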

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
Register x_xstart,
Register y, Register y_idx,
Register z,
Register carry,
Register product_high, Register product,
Register idx, Register kdx,
Register tmp) {
// jlong carry, x[], y[], z[];
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
// huge_128 product = y[idx] * x[xstart] + carry;
// z[kdx] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }
// z[xstart] = carry;

Label L_first_loop, L_first_loop_exit;
Label L_one_x, L_one_y, L_multiply;

addic_(xstart, xstart, -1);
blt(CCR0, L_one_x); // Special case: length of x is 1.

// Load next two integers of x.
sldi(tmp, xstart, LogBytesPerInt);
ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(x_xstart, x_xstart, 32, 0);
#endif

align(32, 16);
bind(L_first_loop);

cmpdi(CCR0, idx, 1);
blt(CCR0, L_first_loop_exit);
addi(idx, idx, -2);
beq(CCR0, L_one_y);

// Load next two integers of y.
sldi(tmp, idx, LogBytesPerInt);
ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(y_idx, y_idx, 32, 0);
#endif


bind(L_multiply);
multiply64(product_high, product, x_xstart, y_idx);

li(tmp, 0);
addc(product, product, carry); // Add carry to result.
adde(product_high, product_high, tmp); // Add carry of the last addition.
addi(kdx, kdx, -2);

// Store result.
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
sldi(tmp, kdx, LogBytesPerInt);
stdx(product, z, tmp);
mr_if_needed(carry, product_high);
b(L_first_loop);


bind(L_one_y); // Load one 32 bit portion of y as (0,value).

lwz(y_idx, 0, y);
b(L_multiply);


bind(L_one_x); // Load one 32 bit portion of x as (0,value).

lwz(x_xstart, 0, x);
b(L_first_loop);

bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
Register z, Register yz_idx,
Register idx, Register carry,
Register product_high, Register product,
Register tmp, int offset) {

// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
// z[kdx] = (jlong)product;

sldi(tmp, idx, LogBytesPerInt);
if (offset) {
addi(tmp, tmp, offset);
}
ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
#endif

multiply64(product_high, product, x_xstart, yz_idx);
ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
#endif

add2_with_carry(product_high, product, carry, yz_idx);

sldi(tmp, idx, LogBytesPerInt);
if (offset) {
addi(tmp, tmp, offset);
}
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
Register y, Register z,
Register yz_idx, Register idx, Register carry,
Register product_high, Register product,
Register carry2, Register tmp) {

// jlong carry, x[], y[], z[];
// int kdx = ystart+1;
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
// z[kdx+idx+1] = (jlong)product;
// jlong carry2 = (jlong)(product >>> 64);
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
// z[kdx+idx] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }
// idx += 2;
// if (idx > 0) {
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
// z[kdx+idx] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }

Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
const Register jdx = R0;

// Scale the index.
srdi_(jdx, idx, 2);
beq(CCR0, L_third_loop_exit);
mtctr(jdx);

align(32, 16);
bind(L_third_loop);

addi(idx, idx, -4);

multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
mr_if_needed(carry2, product_high);

multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
mr_if_needed(carry, product_high);
bdnz(L_third_loop);

bind(L_third_loop_exit); // Handle any left-over operand parts.

andi_(idx, idx, 0x3);
beq(CCR0, L_post_third_loop_done);

Label L_check_1;

addic_(idx, idx, -2);
blt(CCR0, L_check_1);

multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
mr_if_needed(carry, product_high);

bind(L_check_1);

addi(idx, idx, 0x2);
andi_(idx, idx, 0x1);
addic_(idx, idx, -1);
blt(CCR0, L_post_third_loop_done);

sldi(tmp, idx, LogBytesPerInt);
lwzx(yz_idx, y, tmp);
multiply64(product_high, product, x_xstart, yz_idx);
lwzx(yz_idx, z, tmp);

add2_with_carry(product_high, product, yz_idx, carry);

sldi(tmp, idx, LogBytesPerInt);
stwx(product, z, tmp);
srdi(product, product, 32);

sldi(product_high, product_high, 32);
orr(product, product, product_high);
mr_if_needed(carry, product);

bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
Register offset, Register len, Register k,
Register tmp1, Register tmp2, Register carry) {

// Labels
Label LOOP, SKIP;

// Make sure length is positive.
cmpdi (CCR0, len, 0);

// Prepare variables
subi (offset, offset, 4);
li (carry, 0);
ble (CCR0, SKIP);

mtctr (len);
subi (len, len, 1 );
sldi (len, len, 2 );

// Main loop
bind(LOOP);
lwzx (tmp1, len, in );
lwzx (tmp2, offset, out );
mulld (tmp1, tmp1, k );
add (tmp2, carry, tmp2 );
add (tmp2, tmp1, tmp2 );
stwx (tmp2, offset, out );
srdi (carry, tmp2, 32 );
subi (offset, offset, 4 );
subi (len, len, 4 );
bdnz (LOOP);
bind(SKIP);
}
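
// Java-style sketch of what the loop above computes (mirroring BigInteger.implMulAdd;
// register and byte-offset handling above is adjusted for the 4-byte element stride):
//   long carry = 0;
//   for (int j = len - 1, i = offset_index - 1; j >= 0; j--, i--) {
//     long p = (in[j] & 0xffffffffL) * k + (out[i] & 0xffffffffL) + carry;
//     out[i] = (int)p;
//     carry  = p >>> 32;
//   }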

void MacroAssembler::multiply_to_len(Register x, Register xlen,
Register y, Register ylen,
Register z, Register zlen,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
Register tmp5, Register tmp6,
Register tmp7, Register tmp8,
Register tmp9, Register tmp10,
Register tmp11, Register tmp12,
Register tmp13) {

ShortBranchVerifier sbv(this);

assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

const Register idx = tmp1;
const Register kdx = tmp2;
const Register xstart = tmp3;

const Register y_idx = tmp4;
const Register carry = tmp5;
const Register product = tmp6;
const Register product_high = tmp7;
const Register x_xstart = tmp8;
const Register tmp = tmp9;

// First Loop.
//
// final static long LONG_MASK = 0xffffffffL;
// int xstart = xlen - 1;
// int ystart = ylen - 1;
// long carry = 0;
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
// z[kdx] = (int)product;
// carry = product >>> 32;
// }
// z[xstart] = (int)carry;

mr_if_needed(idx, ylen); // idx = ylen
mr_if_needed(kdx, zlen); // kdx = xlen + ylen
li(carry, 0); // carry = 0

Label L_done;

addic_(xstart, xlen, -1);
blt(CCR0, L_done);

multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
carry, product_high, product, idx, kdx, tmp);

Label L_second_loop;

cmpdi(CCR0, kdx, 0);
beq(CCR0, L_second_loop);

Label L_carry;

addic_(kdx, kdx, -1);
beq(CCR0, L_carry);

// Store lower 32 bits of carry.
sldi(tmp, kdx, LogBytesPerInt);
stwx(carry, z, tmp);
srdi(carry, carry, 32);
addi(kdx, kdx, -1);


bind(L_carry);

// Store upper 32 bits of carry.
sldi(tmp, kdx, LogBytesPerInt);
stwx(carry, z, tmp);

// Second and third (nested) loops.
//
// for (int i = xstart-1; i >= 0; i--) { // Second loop
// carry = 0;
// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
// (z[k] & LONG_MASK) + carry;
// z[k] = (int)product;
// carry = product >>> 32;
// }
// z[i] = (int)carry;
// }
//
// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

bind(L_second_loop);

li(carry, 0); // carry = 0;

addic_(xstart, xstart, -1); // i = xstart-1;
blt(CCR0, L_done);

Register zsave = tmp10;

mr(zsave, z);


Label L_last_x;

sldi(tmp, xstart, LogBytesPerInt);
add(z, z, tmp); // z = z + k - j
addi(z, z, 4);
addic_(xstart, xstart, -1); // i = xstart-1;
blt(CCR0, L_last_x);

sldi(tmp, xstart, LogBytesPerInt);
ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(x_xstart, x_xstart, 32, 0);
#endif


Label L_third_loop_prologue;

bind(L_third_loop_prologue);

Register xsave = tmp11;
Register xlensave = tmp12;
Register ylensave = tmp13;

mr(xsave, x);
mr(xlensave, xstart);
mr(ylensave, ylen);


multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
carry, product_high, product, x, tmp);

mr(z, zsave);
mr(x, xsave);
mr(xlen, xlensave); // This is the decrement of the loop counter!
mr(ylen, ylensave);

addi(tmp3, xlen, 1);
sldi(tmp, tmp3, LogBytesPerInt);
stwx(carry, z, tmp);
addic_(tmp3, tmp3, -1);
blt(CCR0, L_done);

srdi(carry, carry, 32);
sldi(tmp, tmp3, LogBytesPerInt);
stwx(carry, z, tmp);
b(L_second_loop);

// Next infrequent code is moved outside loops.
bind(L_last_x);

lwz(x_xstart, 0, x);
b(L_third_loop_prologue);

bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
#ifdef ASSERT
Label ok;
if (check_equal) {
beq(CCR0, ok);
} else {
bne(CCR0, ok);
}
stop(msg);
bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
Register mem_base, const char* msg) {
#ifdef ASSERT
switch (size) {
case 4:
lwz(R0, mem_offset, mem_base);
cmpwi(CCR0, R0, 0);
break;
case 8:
ld(R0, mem_offset, mem_base);
cmpdi(CCR0, R0, 0);
break;
default:
ShouldNotReachHere();
}
asm_assert(check_equal, msg);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
if (VerifyThread) {
unimplemented("'VerifyThread' currently not implemented on PPC");
}
}

void MacroAssembler::verify_coop(Register coop, const char* msg) {
if (!VerifyOops) { return; }
if (UseCompressedOops) { decode_heap_oop(coop); }
verify_oop(coop, msg);
if (UseCompressedOops) { encode_heap_oop(coop, coop); }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
if (!VerifyOops) {
return;
}

address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
const Register tmp = R11; // Will be preserved.
const int nbytes_save = MacroAssembler::num_volatile_regs * 8;

BLOCK_COMMENT("verify_oop {");

save_volatile_gprs(R1_SP, -nbytes_save); // except R0

mr_if_needed(R4_ARG2, oop);
save_LR_CR(tmp); // save in old frame
push_frame_reg_args(nbytes_save, tmp);
// load FunctionDescriptor** / entry_address *
load_const_optimized(tmp, fd, R0);
// load FunctionDescriptor* / entry_address
ld(tmp, 0, tmp);
load_const_optimized(R3_ARG1, (address)msg, R0);
// Call destination for its side effect.
call_c(tmp);

pop_frame();
restore_LR_CR(tmp);
restore_volatile_gprs(R1_SP, -nbytes_save); // except R0

BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
if (!VerifyOops) {
return;
}

address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
const Register tmp = R11; // Will be preserved.
const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
save_volatile_gprs(R1_SP, -nbytes_save); // except R0

ld(R4_ARG2, offs, base);
save_LR_CR(tmp); // save in old frame
push_frame_reg_args(nbytes_save, tmp);
// load FunctionDescriptor** / entry_address *
load_const_optimized(tmp, fd, R0);
// load FunctionDescriptor* / entry_address
ld(tmp, 0, tmp);
load_const_optimized(R3_ARG1, (address)msg, R0);
// Call destination for its side effect.
call_c(tmp);

pop_frame();
restore_LR_CR(tmp);
restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Call a C-function that prints output.
void MacroAssembler::stop(int type, const char* msg) {
bool msg_present = (msg != NULL);

#ifndef PRODUCT
block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
#else
block_comment("stop {");
#endif

if (msg_present) {
type |= stop_msg_present;
}
tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
if (msg_present) {
emit_int64((uintptr_t)msg);
}

block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
if (!ZapMemory) return;

assert_different_registers(low, val);

BLOCK_COMMENT("zap memory region {");
load_const_optimized(val, 0x0101010101010101);
int size = before + after;
if (low == high && size < 5 && size > 0) {
int offset = -before*BytesPerWord;
for (int i = 0; i < size; ++i) {
std(val, offset, low);
offset += (1*BytesPerWord);
}
} else {
addi(addr, low, -before*BytesPerWord);
assert_different_registers(high, val);
if (after) addi(high, high, after * BytesPerWord);
Label loop;
bind(loop);
std(val, 0, addr);
addi(addr, addr, 8);
cmpd(CCR6, addr, high);
ble(CCR6, loop);
if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
}
BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
const bool* flag_addr, Label& label) {
int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
assert(sizeof(bool) == 1, "PowerPC ABI");
masm->lbz(temp, simm16_offset, temp);
masm->cmpwi(CCR0, temp, 0);
masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
_masm->bind(_label);
}

void MacroAssembler::cache_wb(Address line) {
assert(line.index() == noreg, "index should be noreg");
assert(line.disp() == 0, "displacement should be 0");
assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
// Data Cache Store, not really a flush, so it works like a sync of cache
// line and persistent mem, i.e. copying the cache line to persistent whilst
// not invalidating the cache line.
dcbst(line.base());
}

void MacroAssembler::cache_wbsync(bool is_presync) {
assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
// We only need a post sync barrier. Post means _after_ a cache line flush or
// store instruction; pre means a barrier emitted before such an instruction.
if (!is_presync) {
fence();
}
}
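
// Assumed usage pattern (mirroring how other ports drive these hooks): cache_wbsync(true)
// before a series of cache_wb(line) calls over the region, then cache_wbsync(false) once
// the writebacks have been issued; on PPC only the post-sync actually emits a fence.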