Path: blob/master/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
64440 views
/*1* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.2* Copyright (c) 2012, 2022 SAP SE. All rights reserved.3* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.4*5* This code is free software; you can redistribute it and/or modify it6* under the terms of the GNU General Public License version 2 only, as7* published by the Free Software Foundation.8*9* This code is distributed in the hope that it will be useful, but WITHOUT10* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or11* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License12* version 2 for more details (a copy is included in the LICENSE file that13* accompanied this code).14*15* You should have received a copy of the GNU General Public License version16* 2 along with this work; if not, write to the Free Software Foundation,17* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.18*19* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA20* or visit www.oracle.com if you need additional information or have any21* questions.22*23*/2425#include "precompiled.hpp"26#include "asm/macroAssembler.inline.hpp"27#include "compiler/disassembler.hpp"28#include "gc/shared/collectedHeap.inline.hpp"29#include "gc/shared/barrierSet.hpp"30#include "gc/shared/barrierSetAssembler.hpp"31#include "interpreter/interpreter.hpp"32#include "memory/resourceArea.hpp"33#include "nativeInst_ppc.hpp"34#include "oops/klass.inline.hpp"35#include "oops/methodData.hpp"36#include "prims/methodHandles.hpp"37#include "runtime/biasedLocking.hpp"38#include "runtime/icache.hpp"39#include "runtime/interfaceSupport.inline.hpp"40#include "runtime/objectMonitor.hpp"41#include "runtime/os.hpp"42#include "runtime/safepoint.hpp"43#include "runtime/safepointMechanism.hpp"44#include "runtime/sharedRuntime.hpp"45#include "runtime/stubRoutines.hpp"46#include "runtime/vm_version.hpp"47#include "utilities/macros.hpp"48#include "utilities/powerOfTwo.hpp"4950#ifdef PRODUCT51#define BLOCK_COMMENT(str) // nothing52#else53#define BLOCK_COMMENT(str) block_comment(str)54#endif55#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")5657#ifdef ASSERT58// On RISC, there's no benefit to verifying instruction boundaries.59bool AbstractAssembler::pd_check_instruction_mark() { return false; }60#endif6162void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {63assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");64if (Assembler::is_simm(si31, 16)) {65ld(d, si31, a);66if (emit_filler_nop) nop();67} else {68const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);69const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);70addis(d, a, hi);71ld(d, lo, d);72}73}7475void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {76assert_different_registers(d, a);77ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);78}7980void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,81size_t size_in_bytes, bool is_signed) {82switch (size_in_bytes) {83case 8: ld(dst, offs, base); break;84case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;85case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break;86case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(87default: ShouldNotReachHere();88}89}9091void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,92size_t size_in_bytes) {93switch (size_in_bytes) {94case 8: std(dst, offs, base); break;95case 4: stw(dst, offs, base); break;96case 2: sth(dst, offs, base); break;97case 1: stb(dst, offs, base); break;98default: ShouldNotReachHere();99}100}101102void MacroAssembler::align(int modulus, int max, int rem) {103int padding = (rem + modulus - (offset() % modulus)) % modulus;104if (padding > max) return;105for (int c = (padding >> 2); c > 0; --c) { nop(); }106}107108// Issue instructions that calculate given TOC from global TOC.109void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,110bool add_relocation, bool emit_dummy_addr) {111int offset = -1;112if (emit_dummy_addr) {113offset = -128; // dummy address114} else if (addr != (address)(intptr_t)-1) {115offset = MacroAssembler::offset_to_global_toc(addr);116}117118if (hi16) {119addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));120}121if (lo16) {122if (add_relocation) {123// Relocate at the addi to avoid confusion with a load from the method's TOC.124relocate(internal_word_Relocation::spec(addr));125}126addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));127}128}129130address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {131const int offset = MacroAssembler::offset_to_global_toc(addr);132133const address inst2_addr = a;134const int inst2 = *(int *)inst2_addr;135136// The relocation points to the second instruction, the addi,137// and the addi reads and writes the same register dst.138const int dst = inv_rt_field(inst2);139assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");140141// Now, find the preceding addis which writes to dst.142int inst1 = 0;143address inst1_addr = inst2_addr - BytesPerInstWord;144while (inst1_addr >= bound) {145inst1 = *(int *) inst1_addr;146if (is_addis(inst1) && inv_rt_field(inst1) == dst) {147// Stop, found the addis which writes dst.148break;149}150inst1_addr -= BytesPerInstWord;151}152153assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");154set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));155set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));156return inst1_addr;157}158159address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {160const address inst2_addr = a;161const int inst2 = *(int *)inst2_addr;162163// The relocation points to the second instruction, the addi,164// and the addi reads and writes the same register dst.165const int dst = inv_rt_field(inst2);166assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");167168// Now, find the preceding addis which writes to dst.169int inst1 = 0;170address inst1_addr = inst2_addr - BytesPerInstWord;171while (inst1_addr >= bound) {172inst1 = *(int *) inst1_addr;173if (is_addis(inst1) && inv_rt_field(inst1) == dst) {174// stop, found the addis which writes dst175break;176}177inst1_addr -= BytesPerInstWord;178}179180assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");181182int offset = (get_imm(inst1_addr, 0) 
<< 16) + get_imm(inst2_addr, 0);183// -1 is a special case184if (offset == -1) {185return (address)(intptr_t)-1;186} else {187return global_toc() + offset;188}189}190191#ifdef _LP64192// Patch compressed oops or klass constants.193// Assembler sequence is194// 1) compressed oops:195// lis rx = const.hi196// ori rx = rx | const.lo197// 2) compressed klass:198// lis rx = const.hi199// clrldi rx = rx & 0xFFFFffff // clearMS32b, optional200// ori rx = rx | const.lo201// Clrldi will be passed by.202address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {203assert(UseCompressedOops, "Should only patch compressed oops");204205const address inst2_addr = a;206const int inst2 = *(int *)inst2_addr;207208// The relocation points to the second instruction, the ori,209// and the ori reads and writes the same register dst.210const int dst = inv_rta_field(inst2);211assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");212// Now, find the preceding addis which writes to dst.213int inst1 = 0;214address inst1_addr = inst2_addr - BytesPerInstWord;215bool inst1_found = false;216while (inst1_addr >= bound) {217inst1 = *(int *)inst1_addr;218if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }219inst1_addr -= BytesPerInstWord;220}221assert(inst1_found, "inst is not lis");222223uint32_t data_value = CompressedOops::narrow_oop_value(data);224int xc = (data_value >> 16) & 0xffff;225int xd = (data_value >> 0) & 0xffff;226227set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo228set_imm((int *)inst2_addr, (xd)); // unsigned int229return inst1_addr;230}231232// Get compressed oop or klass constant.233narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {234assert(UseCompressedOops, "Should only patch compressed oops");235236const address inst2_addr = a;237const int inst2 = *(int *)inst2_addr;238239// The relocation points to the second instruction, the ori,240// and the ori reads and writes the same register dst.241const int dst = inv_rta_field(inst2);242assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");243// Now, find the preceding lis which writes to dst.244int inst1 = 0;245address inst1_addr = inst2_addr - BytesPerInstWord;246bool inst1_found = false;247248while (inst1_addr >= bound) {249inst1 = *(int *) inst1_addr;250if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}251inst1_addr -= BytesPerInstWord;252}253assert(inst1_found, "inst is not lis");254255uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));256uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);257258return CompressedOops::narrow_oop_cast(xl | xh);259}260#endif // _LP64261262// Returns true if successful.263bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,264Register toc, bool fixed_size) {265int toc_offset = 0;266// Use RelocationHolder::none for the constant pool entry, otherwise267// we will end up with a failing NativeCall::verify(x) where x is268// the address of the constant pool entry.269// FIXME: We should insert relocation information for oops at the constant270// pool entries instead of inserting it at the loads; patching of a constant271// pool entry should be less expensive.272address const_address = address_constant((address)a.value(), RelocationHolder::none);273if (const_address == NULL) { return false; } // allocation failure274// Relocate at the pc of the load.275relocate(a.rspec());276toc_offset = 
(int)(const_address - code()->consts()->start());277ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);278return true;279}280281bool MacroAssembler::is_load_const_from_method_toc_at(address a) {282const address inst1_addr = a;283const int inst1 = *(int *)inst1_addr;284285// The relocation points to the ld or the addis.286return (is_ld(inst1)) ||287(is_addis(inst1) && inv_ra_field(inst1) != 0);288}289290int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {291assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");292293const address inst1_addr = a;294const int inst1 = *(int *)inst1_addr;295296if (is_ld(inst1)) {297return inv_d1_field(inst1);298} else if (is_addis(inst1)) {299const int dst = inv_rt_field(inst1);300301// Now, find the succeeding ld which reads and writes to dst.302address inst2_addr = inst1_addr + BytesPerInstWord;303int inst2 = 0;304while (true) {305inst2 = *(int *) inst2_addr;306if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {307// Stop, found the ld which reads and writes dst.308break;309}310inst2_addr += BytesPerInstWord;311}312return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);313}314ShouldNotReachHere();315return 0;316}317318// Get the constant from a `load_const' sequence.319long MacroAssembler::get_const(address a) {320assert(is_load_const_at(a), "not a load of a constant");321const int *p = (const int*) a;322unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);323if (is_ori(*(p+1))) {324x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);325x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);326x |= (((unsigned long) (get_imm(a,4) & 0xffff)));327} else if (is_lis(*(p+1))) {328x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);329x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);330x |= (((unsigned long) (get_imm(a,3) & 0xffff)));331} else {332ShouldNotReachHere();333return (long) 0;334}335return (long) x;336}337338// Patch the 64 bit constant of a `load_const' sequence. This is a low339// level procedure. 
It neither flushes the instruction cache nor is it340// mt safe.341void MacroAssembler::patch_const(address a, long x) {342assert(is_load_const_at(a), "not a load of a constant");343int *p = (int*) a;344if (is_ori(*(p+1))) {345set_imm(0 + p, (x >> 48) & 0xffff);346set_imm(1 + p, (x >> 32) & 0xffff);347set_imm(3 + p, (x >> 16) & 0xffff);348set_imm(4 + p, x & 0xffff);349} else if (is_lis(*(p+1))) {350set_imm(0 + p, (x >> 48) & 0xffff);351set_imm(2 + p, (x >> 32) & 0xffff);352set_imm(1 + p, (x >> 16) & 0xffff);353set_imm(3 + p, x & 0xffff);354} else {355ShouldNotReachHere();356}357}358359AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {360assert(oop_recorder() != NULL, "this assembler needs a Recorder");361int index = oop_recorder()->allocate_metadata_index(obj);362RelocationHolder rspec = metadata_Relocation::spec(index);363return AddressLiteral((address)obj, rspec);364}365366AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {367assert(oop_recorder() != NULL, "this assembler needs a Recorder");368int index = oop_recorder()->find_index(obj);369RelocationHolder rspec = metadata_Relocation::spec(index);370return AddressLiteral((address)obj, rspec);371}372373AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {374assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");375int oop_index = oop_recorder()->allocate_oop_index(obj);376return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));377}378379AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {380assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");381int oop_index = oop_recorder()->find_index(obj);382return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));383}384385#ifndef PRODUCT386void MacroAssembler::pd_print_patched_instruction(address branch) {387Unimplemented(); // TODO: PPC port388}389#endif // ndef PRODUCT390391// Conditional far branch for destinations encodable in 24+2 bits.392void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {393394// If requested by flag optimize, relocate the bc_far as a395// runtime_call and prepare for optimizing it when the code gets396// relocated.397if (optimize == bc_far_optimize_on_relocate) {398relocate(relocInfo::runtime_call_type);399}400401// variant 2:402//403// b!cxx SKIP404// bxx DEST405// SKIP:406//407408const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),409opposite_bcond(inv_boint_bcond(boint)));410411// We emit two branches.412// First, a conditional branch which jumps around the far branch.413const address not_taken_pc = pc() + 2 * BytesPerInstWord;414const address bc_pc = pc();415bc(opposite_boint, biint, not_taken_pc);416417const int bc_instr = *(int*)bc_pc;418assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");419assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");420assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),421opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),422"postcondition");423assert(biint == inv_bi_field(bc_instr), "postcondition");424425// Second, an unconditional far branch which jumps to dest.426// Note: target(dest) remembers the current pc (see CodeSection::target)427// and returns the current pc if the label is not bound yet; when428// the label gets bound, the unconditional far branch will be patched.429const address target_pc = target(dest);430const address b_pc = 
pc();431b(target_pc);432433assert(not_taken_pc == pc(), "postcondition");434assert(dest.is_bound() || target_pc == b_pc, "postcondition");435}436437// 1 or 2 instructions438void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {439if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {440bc(boint, biint, dest);441} else {442bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);443}444}445446bool MacroAssembler::is_bc_far_at(address instruction_addr) {447return is_bc_far_variant1_at(instruction_addr) ||448is_bc_far_variant2_at(instruction_addr) ||449is_bc_far_variant3_at(instruction_addr);450}451452address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {453if (is_bc_far_variant1_at(instruction_addr)) {454const address instruction_1_addr = instruction_addr;455const int instruction_1 = *(int*)instruction_1_addr;456return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);457} else if (is_bc_far_variant2_at(instruction_addr)) {458const address instruction_2_addr = instruction_addr + 4;459return bxx_destination(instruction_2_addr);460} else if (is_bc_far_variant3_at(instruction_addr)) {461return instruction_addr + 8;462}463// variant 4 ???464ShouldNotReachHere();465return NULL;466}467void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {468469if (is_bc_far_variant3_at(instruction_addr)) {470// variant 3, far cond branch to the next instruction, already patched to nops:471//472// nop473// endgroup474// SKIP/DEST:475//476return;477}478479// first, extract boint and biint from the current branch480int boint = 0;481int biint = 0;482483ResourceMark rm;484const int code_size = 2 * BytesPerInstWord;485CodeBuffer buf(instruction_addr, code_size);486MacroAssembler masm(&buf);487if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {488// Far branch to next instruction: Optimize it by patching nops (produce variant 3).489masm.nop();490masm.endgroup();491} else {492if (is_bc_far_variant1_at(instruction_addr)) {493// variant 1, the 1st instruction contains the destination address:494//495// bcxx DEST496// nop497//498const int instruction_1 = *(int*)(instruction_addr);499boint = inv_bo_field(instruction_1);500biint = inv_bi_field(instruction_1);501} else if (is_bc_far_variant2_at(instruction_addr)) {502// variant 2, the 2nd instruction contains the destination address:503//504// b!cxx SKIP505// bxx DEST506// SKIP:507//508const int instruction_1 = *(int*)(instruction_addr);509boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),510opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));511biint = inv_bi_field(instruction_1);512} else {513// variant 4???514ShouldNotReachHere();515}516517// second, set the new branch destination and optimize the code518if (dest != instruction_addr + 4 && // the bc_far is still unbound!519masm.is_within_range_of_bcxx(dest, instruction_addr)) {520// variant 1:521//522// bcxx DEST523// nop524//525masm.bc(boint, biint, dest);526masm.nop();527} else {528// variant 2:529//530// b!cxx SKIP531// bxx DEST532// SKIP:533//534const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),535opposite_bcond(inv_boint_bcond(boint)));536const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;537masm.bc(opposite_boint, biint, not_taken_pc);538masm.b(dest);539}540}541ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);542}543544// Emit a NOT mt-safe patchable 64 bit absolute 
call/jump.545void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {546// get current pc547uint64_t start_pc = (uint64_t) pc();548549const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last550const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first551552// relocate here553if (rt != relocInfo::none) {554relocate(rt);555}556557if ( ReoptimizeCallSequences &&558(( link && is_within_range_of_b(dest, pc_of_bl)) ||559(!link && is_within_range_of_b(dest, pc_of_b)))) {560// variant 2:561// Emit an optimized, pc-relative call/jump.562563if (link) {564// some padding565nop();566nop();567nop();568nop();569nop();570nop();571572// do the call573assert(pc() == pc_of_bl, "just checking");574bl(dest, relocInfo::none);575} else {576// do the jump577assert(pc() == pc_of_b, "just checking");578b(dest, relocInfo::none);579580// some padding581nop();582nop();583nop();584nop();585nop();586nop();587}588589// Assert that we can identify the emitted call/jump.590assert(is_bxx64_patchable_variant2_at((address)start_pc, link),591"can't identify emitted call");592} else {593// variant 1:594mr(R0, R11); // spill R11 -> R0.595596// Load the destination address into CTR,597// calculate destination relative to global toc.598calculate_address_from_global_toc(R11, dest, true, true, false);599600mtctr(R11);601mr(R11, R0); // spill R11 <- R0.602nop();603604// do the call/jump605if (link) {606bctrl();607} else{608bctr();609}610// Assert that we can identify the emitted call/jump.611assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),612"can't identify emitted call");613}614615// Assert that we can identify the emitted call/jump.616assert(is_bxx64_patchable_at((address)start_pc, link),617"can't identify emitted call");618assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,619"wrong encoding of dest address");620}621622// Identify a bxx64_patchable instruction.623bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {624return is_bxx64_patchable_variant1b_at(instruction_addr, link)625//|| is_bxx64_patchable_variant1_at(instruction_addr, link)626|| is_bxx64_patchable_variant2_at(instruction_addr, link);627}628629// Does the call64_patchable instruction use a pc-relative encoding of630// the call destination?631bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {632// variant 2 is pc-relative633return is_bxx64_patchable_variant2_at(instruction_addr, link);634}635636// Identify variant 1.637bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {638unsigned int* instr = (unsigned int*) instruction_addr;639return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]640&& is_mtctr(instr[5]) // mtctr641&& is_load_const_at(instruction_addr);642}643644// Identify variant 1b: load destination relative to global toc.645bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {646unsigned int* instr = (unsigned int*) instruction_addr;647return (link ? 
is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]648&& is_mtctr(instr[3]) // mtctr649&& is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);650}651652// Identify variant 2.653bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {654unsigned int* instr = (unsigned int*) instruction_addr;655if (link) {656return is_bl (instr[6]) // bl dest is last657&& is_nop(instr[0]) // nop658&& is_nop(instr[1]) // nop659&& is_nop(instr[2]) // nop660&& is_nop(instr[3]) // nop661&& is_nop(instr[4]) // nop662&& is_nop(instr[5]); // nop663} else {664return is_b (instr[0]) // b dest is first665&& is_nop(instr[1]) // nop666&& is_nop(instr[2]) // nop667&& is_nop(instr[3]) // nop668&& is_nop(instr[4]) // nop669&& is_nop(instr[5]) // nop670&& is_nop(instr[6]); // nop671}672}673674// Set dest address of a bxx64_patchable instruction.675void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {676ResourceMark rm;677int code_size = MacroAssembler::bxx64_patchable_size;678CodeBuffer buf(instruction_addr, code_size);679MacroAssembler masm(&buf);680masm.bxx64_patchable(dest, relocInfo::none, link);681ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);682}683684// Get dest address of a bxx64_patchable instruction.685address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {686if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {687return (address) (unsigned long) get_const(instruction_addr);688} else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {689unsigned int* instr = (unsigned int*) instruction_addr;690if (link) {691const int instr_idx = 6; // bl is last692int branchoffset = branch_destination(instr[instr_idx], 0);693return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;694} else {695const int instr_idx = 0; // b is first696int branchoffset = branch_destination(instr[instr_idx], 0);697return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;698}699// Load dest relative to global toc.700} else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {701return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,702instruction_addr);703} else {704ShouldNotReachHere();705return NULL;706}707}708709void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {710const int magic_number = 0x42;711712// Preserve stack pointer register (R1_SP) and system thread id register (R13);713// although they're technically volatile714for (int i = 2; i < 13; i++) {715Register reg = as_Register(i);716if (reg == excluded_register) {717continue;718}719720li(reg, magic_number);721}722}723724void MacroAssembler::clobber_carg_stack_slots(Register tmp) {725const int magic_number = 0x43;726727li(tmp, magic_number);728for (int m = 0; m <= 7; m++) {729std(tmp, frame::abi_minframe_size + m * 8, R1_SP);730}731}732733// Uses ordering which corresponds to ABI:734// _savegpr0_14: std r14,-144(r1)735// _savegpr0_15: std r15,-136(r1)736// _savegpr0_16: std r16,-128(r1)737void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {738std(R14, offset, dst); offset += 8;739std(R15, offset, dst); offset += 8;740std(R16, offset, dst); offset += 8;741std(R17, offset, dst); offset += 8;742std(R18, offset, dst); offset += 8;743std(R19, offset, dst); offset += 8;744std(R20, offset, dst); offset += 8;745std(R21, offset, dst); offset += 8;746std(R22, offset, dst); offset += 8;747std(R23, 
offset, dst); offset += 8;748std(R24, offset, dst); offset += 8;749std(R25, offset, dst); offset += 8;750std(R26, offset, dst); offset += 8;751std(R27, offset, dst); offset += 8;752std(R28, offset, dst); offset += 8;753std(R29, offset, dst); offset += 8;754std(R30, offset, dst); offset += 8;755std(R31, offset, dst); offset += 8;756757stfd(F14, offset, dst); offset += 8;758stfd(F15, offset, dst); offset += 8;759stfd(F16, offset, dst); offset += 8;760stfd(F17, offset, dst); offset += 8;761stfd(F18, offset, dst); offset += 8;762stfd(F19, offset, dst); offset += 8;763stfd(F20, offset, dst); offset += 8;764stfd(F21, offset, dst); offset += 8;765stfd(F22, offset, dst); offset += 8;766stfd(F23, offset, dst); offset += 8;767stfd(F24, offset, dst); offset += 8;768stfd(F25, offset, dst); offset += 8;769stfd(F26, offset, dst); offset += 8;770stfd(F27, offset, dst); offset += 8;771stfd(F28, offset, dst); offset += 8;772stfd(F29, offset, dst); offset += 8;773stfd(F30, offset, dst); offset += 8;774stfd(F31, offset, dst);775}776777// Uses ordering which corresponds to ABI:778// _restgpr0_14: ld r14,-144(r1)779// _restgpr0_15: ld r15,-136(r1)780// _restgpr0_16: ld r16,-128(r1)781void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {782ld(R14, offset, src); offset += 8;783ld(R15, offset, src); offset += 8;784ld(R16, offset, src); offset += 8;785ld(R17, offset, src); offset += 8;786ld(R18, offset, src); offset += 8;787ld(R19, offset, src); offset += 8;788ld(R20, offset, src); offset += 8;789ld(R21, offset, src); offset += 8;790ld(R22, offset, src); offset += 8;791ld(R23, offset, src); offset += 8;792ld(R24, offset, src); offset += 8;793ld(R25, offset, src); offset += 8;794ld(R26, offset, src); offset += 8;795ld(R27, offset, src); offset += 8;796ld(R28, offset, src); offset += 8;797ld(R29, offset, src); offset += 8;798ld(R30, offset, src); offset += 8;799ld(R31, offset, src); offset += 8;800801// FP registers802lfd(F14, offset, src); offset += 8;803lfd(F15, offset, src); offset += 8;804lfd(F16, offset, src); offset += 8;805lfd(F17, offset, src); offset += 8;806lfd(F18, offset, src); offset += 8;807lfd(F19, offset, src); offset += 8;808lfd(F20, offset, src); offset += 8;809lfd(F21, offset, src); offset += 8;810lfd(F22, offset, src); offset += 8;811lfd(F23, offset, src); offset += 8;812lfd(F24, offset, src); offset += 8;813lfd(F25, offset, src); offset += 8;814lfd(F26, offset, src); offset += 8;815lfd(F27, offset, src); offset += 8;816lfd(F28, offset, src); offset += 8;817lfd(F29, offset, src); offset += 8;818lfd(F30, offset, src); offset += 8;819lfd(F31, offset, src);820}821822// For verify_oops.823void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {824std(R2, offset, dst); offset += 8;825if (include_R3_RET_reg) {826std(R3, offset, dst); offset += 8;827}828std(R4, offset, dst); offset += 8;829std(R5, offset, dst); offset += 8;830std(R6, offset, dst); offset += 8;831std(R7, offset, dst); offset += 8;832std(R8, offset, dst); offset += 8;833std(R9, offset, dst); offset += 8;834std(R10, offset, dst); offset += 8;835std(R11, offset, dst); offset += 8;836std(R12, offset, dst); offset += 8;837838if (include_fp_regs) {839stfd(F0, offset, dst); offset += 8;840stfd(F1, offset, dst); offset += 8;841stfd(F2, offset, dst); offset += 8;842stfd(F3, offset, dst); offset += 8;843stfd(F4, offset, dst); offset += 8;844stfd(F5, offset, dst); offset += 8;845stfd(F6, offset, dst); offset += 8;846stfd(F7, offset, dst); offset += 8;847stfd(F8, offset, 
dst); offset += 8;848stfd(F9, offset, dst); offset += 8;849stfd(F10, offset, dst); offset += 8;850stfd(F11, offset, dst); offset += 8;851stfd(F12, offset, dst); offset += 8;852stfd(F13, offset, dst);853}854}855856// For verify_oops.857void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {858ld(R2, offset, src); offset += 8;859if (include_R3_RET_reg) {860ld(R3, offset, src); offset += 8;861}862ld(R4, offset, src); offset += 8;863ld(R5, offset, src); offset += 8;864ld(R6, offset, src); offset += 8;865ld(R7, offset, src); offset += 8;866ld(R8, offset, src); offset += 8;867ld(R9, offset, src); offset += 8;868ld(R10, offset, src); offset += 8;869ld(R11, offset, src); offset += 8;870ld(R12, offset, src); offset += 8;871872if (include_fp_regs) {873lfd(F0, offset, src); offset += 8;874lfd(F1, offset, src); offset += 8;875lfd(F2, offset, src); offset += 8;876lfd(F3, offset, src); offset += 8;877lfd(F4, offset, src); offset += 8;878lfd(F5, offset, src); offset += 8;879lfd(F6, offset, src); offset += 8;880lfd(F7, offset, src); offset += 8;881lfd(F8, offset, src); offset += 8;882lfd(F9, offset, src); offset += 8;883lfd(F10, offset, src); offset += 8;884lfd(F11, offset, src); offset += 8;885lfd(F12, offset, src); offset += 8;886lfd(F13, offset, src);887}888}889890void MacroAssembler::save_LR_CR(Register tmp) {891mfcr(tmp);892std(tmp, _abi0(cr), R1_SP);893mflr(tmp);894std(tmp, _abi0(lr), R1_SP);895// Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)896}897898void MacroAssembler::restore_LR_CR(Register tmp) {899assert(tmp != R1_SP, "must be distinct");900ld(tmp, _abi0(lr), R1_SP);901mtlr(tmp);902ld(tmp, _abi0(cr), R1_SP);903mtcr(tmp);904}905906address MacroAssembler::get_PC_trash_LR(Register result) {907Label L;908bl(L);909bind(L);910address lr_pc = pc();911mflr(result);912return lr_pc;913}914915void MacroAssembler::resize_frame(Register offset, Register tmp) {916#ifdef ASSERT917assert_different_registers(offset, tmp, R1_SP);918andi_(tmp, offset, frame::alignment_in_bytes-1);919asm_assert_eq("resize_frame: unaligned");920#endif921922// tmp <- *(SP)923ld(tmp, _abi0(callers_sp), R1_SP);924// addr <- SP + offset;925// *(addr) <- tmp;926// SP <- addr927stdux(tmp, R1_SP, offset);928}929930void MacroAssembler::resize_frame(int offset, Register tmp) {931assert(is_simm(offset, 16), "too big an offset");932assert_different_registers(tmp, R1_SP);933assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");934// tmp <- *(SP)935ld(tmp, _abi0(callers_sp), R1_SP);936// addr <- SP + offset;937// *(addr) <- tmp;938// SP <- addr939stdu(tmp, offset, R1_SP);940}941942void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {943// (addr == tmp1) || (addr == tmp2) is allowed here!944assert(tmp1 != tmp2, "must be distinct");945946// compute offset w.r.t. 
current stack pointer947// tmp_1 <- addr - SP (!)948subf(tmp1, R1_SP, addr);949950// atomically update SP keeping back link.951resize_frame(tmp1/* offset */, tmp2/* tmp */);952}953954void MacroAssembler::push_frame(Register bytes, Register tmp) {955#ifdef ASSERT956assert(bytes != R0, "r0 not allowed here");957andi_(R0, bytes, frame::alignment_in_bytes-1);958asm_assert_eq("push_frame(Reg, Reg): unaligned");959#endif960neg(tmp, bytes);961stdux(R1_SP, R1_SP, tmp);962}963964// Push a frame of size `bytes'.965void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {966long offset = align_addr(bytes, frame::alignment_in_bytes);967if (is_simm(-offset, 16)) {968stdu(R1_SP, -offset, R1_SP);969} else {970load_const_optimized(tmp, -offset);971stdux(R1_SP, R1_SP, tmp);972}973}974975// Push a frame of size `bytes' plus abi_reg_args on top.976void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {977push_frame(bytes + frame::abi_reg_args_size, tmp);978}979980// Setup up a new C frame with a spill area for non-volatile GPRs and981// additional space for local variables.982void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,983Register tmp) {984push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);985}986987// Pop current C frame.988void MacroAssembler::pop_frame() {989ld(R1_SP, _abi0(callers_sp), R1_SP);990}991992#if defined(ABI_ELFv2)993address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {994// TODO(asmundak): make sure the caller uses R12 as function descriptor995// most of the times.996if (R12 != r_function_entry) {997mr(R12, r_function_entry);998}999mtctr(R12);1000// Do a call or a branch.1001if (and_link) {1002bctrl();1003} else {1004bctr();1005}1006_last_calls_return_pc = pc();10071008return _last_calls_return_pc;1009}10101011// Call a C function via a function descriptor and use full C1012// calling conventions. 
Updates and returns _last_calls_return_pc.1013address MacroAssembler::call_c(Register r_function_entry) {1014return branch_to(r_function_entry, /*and_link=*/true);1015}10161017// For tail calls: only branch, don't link, so callee returns to caller of this function.1018address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {1019return branch_to(r_function_entry, /*and_link=*/false);1020}10211022address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {1023load_const(R12, function_entry, R0);1024return branch_to(R12, /*and_link=*/true);1025}10261027#else1028// Generic version of a call to C function via a function descriptor1029// with variable support for C calling conventions (TOC, ENV, etc.).1030// Updates and returns _last_calls_return_pc.1031address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,1032bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {1033// we emit standard ptrgl glue code here1034assert((function_descriptor != R0), "function_descriptor cannot be R0");10351036// retrieve necessary entries from the function descriptor1037ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);1038mtctr(R0);10391040if (load_toc_of_callee) {1041ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);1042}1043if (load_env_of_callee) {1044ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);1045} else if (load_toc_of_callee) {1046li(R11, 0);1047}10481049// do a call or a branch1050if (and_link) {1051bctrl();1052} else {1053bctr();1054}1055_last_calls_return_pc = pc();10561057return _last_calls_return_pc;1058}10591060// Call a C function via a function descriptor and use full C calling1061// conventions.1062// We don't use the TOC in generated code, so there is no need to save1063// and restore its value.1064address MacroAssembler::call_c(Register fd) {1065return branch_to(fd, /*and_link=*/true,1066/*save toc=*/false,1067/*restore toc=*/false,1068/*load toc=*/true,1069/*load env=*/true);1070}10711072address MacroAssembler::call_c_and_return_to_caller(Register fd) {1073return branch_to(fd, /*and_link=*/false,1074/*save toc=*/false,1075/*restore toc=*/false,1076/*load toc=*/true,1077/*load env=*/true);1078}10791080address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {1081if (rt != relocInfo::none) {1082// this call needs to be relocatable1083if (!ReoptimizeCallSequences1084|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)1085|| fd == NULL // support code-size estimation1086|| !fd->is_friend_function()1087|| fd->entry() == NULL) {1088// it's not a friend function as defined by class FunctionDescriptor,1089// so do a full call-c here.1090load_const(R11, (address)fd, R0);10911092bool has_env = (fd != NULL && fd->env() != NULL);1093return branch_to(R11, /*and_link=*/true,1094/*save toc=*/false,1095/*restore toc=*/false,1096/*load toc=*/true,1097/*load env=*/has_env);1098} else {1099// It's a friend function. Load the entry point and don't care about1100// toc and env. 
Use an optimizable call instruction, but ensure the1101// same code-size as in the case of a non-friend function.1102nop();1103nop();1104nop();1105bl64_patchable(fd->entry(), rt);1106_last_calls_return_pc = pc();1107return _last_calls_return_pc;1108}1109} else {1110// This call does not need to be relocatable, do more aggressive1111// optimizations.1112if (!ReoptimizeCallSequences1113|| !fd->is_friend_function()) {1114// It's not a friend function as defined by class FunctionDescriptor,1115// so do a full call-c here.1116load_const(R11, (address)fd, R0);1117return branch_to(R11, /*and_link=*/true,1118/*save toc=*/false,1119/*restore toc=*/false,1120/*load toc=*/true,1121/*load env=*/true);1122} else {1123// it's a friend function, load the entry point and don't care about1124// toc and env.1125address dest = fd->entry();1126if (is_within_range_of_b(dest, pc())) {1127bl(dest);1128} else {1129bl64_patchable(dest, rt);1130}1131_last_calls_return_pc = pc();1132return _last_calls_return_pc;1133}1134}1135}11361137// Call a C function. All constants needed reside in TOC.1138//1139// Read the address to call from the TOC.1140// Read env from TOC, if fd specifies an env.1141// Read new TOC from TOC.1142address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,1143relocInfo::relocType rt, Register toc) {1144if (!ReoptimizeCallSequences1145|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)1146|| !fd->is_friend_function()) {1147// It's not a friend function as defined by class FunctionDescriptor,1148// so do a full call-c here.1149assert(fd->entry() != NULL, "function must be linked");11501151AddressLiteral fd_entry(fd->entry());1152bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);1153mtctr(R11);1154if (fd->env() == NULL) {1155li(R11, 0);1156nop();1157} else {1158AddressLiteral fd_env(fd->env());1159success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);1160}1161AddressLiteral fd_toc(fd->toc());1162// Set R2_TOC (load from toc)1163success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);1164bctrl();1165_last_calls_return_pc = pc();1166if (!success) { return NULL; }1167} else {1168// It's a friend function, load the entry point and don't care about1169// toc and env. 
Use an optimizable call instruction, but ensure the1170// same code-size as in the case of a non-friend function.1171nop();1172bl64_patchable(fd->entry(), rt);1173_last_calls_return_pc = pc();1174}1175return _last_calls_return_pc;1176}1177#endif // ABI_ELFv211781179void MacroAssembler::call_VM_base(Register oop_result,1180Register last_java_sp,1181address entry_point,1182bool check_exceptions) {1183BLOCK_COMMENT("call_VM {");1184// Determine last_java_sp register.1185if (!last_java_sp->is_valid()) {1186last_java_sp = R1_SP;1187}1188set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);11891190// ARG1 must hold thread address.1191mr(R3_ARG1, R16_thread);1192#if defined(ABI_ELFv2)1193address return_pc = call_c(entry_point, relocInfo::none);1194#else1195address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);1196#endif11971198reset_last_Java_frame();11991200// Check for pending exceptions.1201if (check_exceptions) {1202// We don't check for exceptions here.1203ShouldNotReachHere();1204}12051206// Get oop result if there is one and reset the value in the thread.1207if (oop_result->is_valid()) {1208get_vm_result(oop_result);1209}12101211_last_calls_return_pc = return_pc;1212BLOCK_COMMENT("} call_VM");1213}12141215void MacroAssembler::call_VM_leaf_base(address entry_point) {1216BLOCK_COMMENT("call_VM_leaf {");1217#if defined(ABI_ELFv2)1218call_c(entry_point, relocInfo::none);1219#else1220call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);1221#endif1222BLOCK_COMMENT("} call_VM_leaf");1223}12241225void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {1226call_VM_base(oop_result, noreg, entry_point, check_exceptions);1227}12281229void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,1230bool check_exceptions) {1231// R3_ARG1 is reserved for the thread.1232mr_if_needed(R4_ARG2, arg_1);1233call_VM(oop_result, entry_point, check_exceptions);1234}12351236void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,1237bool check_exceptions) {1238// R3_ARG1 is reserved for the thread1239mr_if_needed(R4_ARG2, arg_1);1240assert(arg_2 != R4_ARG2, "smashed argument");1241mr_if_needed(R5_ARG3, arg_2);1242call_VM(oop_result, entry_point, check_exceptions);1243}12441245void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,1246bool check_exceptions) {1247// R3_ARG1 is reserved for the thread1248mr_if_needed(R4_ARG2, arg_1);1249assert(arg_2 != R4_ARG2, "smashed argument");1250mr_if_needed(R5_ARG3, arg_2);1251mr_if_needed(R6_ARG4, arg_3);1252call_VM(oop_result, entry_point, check_exceptions);1253}12541255void MacroAssembler::call_VM_leaf(address entry_point) {1256call_VM_leaf_base(entry_point);1257}12581259void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {1260mr_if_needed(R3_ARG1, arg_1);1261call_VM_leaf(entry_point);1262}12631264void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {1265mr_if_needed(R3_ARG1, arg_1);1266assert(arg_2 != R3_ARG1, "smashed argument");1267mr_if_needed(R4_ARG2, arg_2);1268call_VM_leaf(entry_point);1269}12701271void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {1272mr_if_needed(R3_ARG1, arg_1);1273assert(arg_2 != R3_ARG1, "smashed argument");1274mr_if_needed(R4_ARG2, arg_2);1275assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed 
argument");1276mr_if_needed(R5_ARG3, arg_3);1277call_VM_leaf(entry_point);1278}12791280// Check whether instruction is a read access to the polling page1281// which was emitted by load_from_polling_page(..).1282bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,1283address* polling_address_ptr) {1284if (!is_ld(instruction))1285return false; // It's not a ld. Fail.12861287int rt = inv_rt_field(instruction);1288int ra = inv_ra_field(instruction);1289int ds = inv_ds_field(instruction);1290if (!(ds == 0 && ra != 0 && rt == 0)) {1291return false; // It's not a ld(r0, X, ra). Fail.1292}12931294if (!ucontext) {1295// Set polling address.1296if (polling_address_ptr != NULL) {1297*polling_address_ptr = NULL;1298}1299return true; // No ucontext given. Can't check value of ra. Assume true.1300}13011302#ifdef LINUX1303// Ucontext given. Check that register ra contains the address of1304// the safepoing polling page.1305ucontext_t* uc = (ucontext_t*) ucontext;1306// Set polling address.1307address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;1308if (polling_address_ptr != NULL) {1309*polling_address_ptr = addr;1310}1311return SafepointMechanism::is_poll_address(addr);1312#else1313// Not on Linux, ucontext must be NULL.1314ShouldNotReachHere();1315return false;1316#endif1317}13181319void MacroAssembler::bang_stack_with_offset(int offset) {1320// When increasing the stack, the old stack pointer will be written1321// to the new top of stack according to the PPC64 abi.1322// Therefore, stack banging is not necessary when increasing1323// the stack by <= os::vm_page_size() bytes.1324// When increasing the stack by a larger amount, this method is1325// called repeatedly to bang the intermediate pages.13261327// Stack grows down, caller passes positive offset.1328assert(offset > 0, "must bang with positive offset");13291330long stdoffset = -offset;13311332if (is_simm(stdoffset, 16)) {1333// Signed 16 bit offset, a simple std is ok.1334if (UseLoadInstructionsForStackBangingPPC64) {1335ld(R0, (int)(signed short)stdoffset, R1_SP);1336} else {1337std(R0,(int)(signed short)stdoffset, R1_SP);1338}1339} else if (is_simm(stdoffset, 31)) {1340const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);1341const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);13421343Register tmp = R11;1344addis(tmp, R1_SP, hi);1345if (UseLoadInstructionsForStackBangingPPC64) {1346ld(R0, lo, tmp);1347} else {1348std(R0, lo, tmp);1349}1350} else {1351ShouldNotReachHere();1352}1353}13541355// If instruction is a stack bang of the form1356// std R0, x(Ry), (see bang_stack_with_offset())1357// stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())1358// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())1359// return the banged address. 
Otherwise, return 0.1360address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {1361#ifdef LINUX1362ucontext_t* uc = (ucontext_t*) ucontext;1363int rs = inv_rs_field(instruction);1364int ra = inv_ra_field(instruction);1365if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)1366|| (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)1367|| (is_stdu(instruction) && rs == 1)) {1368int ds = inv_ds_field(instruction);1369// return banged address1370return ds+(address)uc->uc_mcontext.regs->gpr[ra];1371} else if (is_stdux(instruction) && rs == 1) {1372int rb = inv_rb_field(instruction);1373address sp = (address)uc->uc_mcontext.regs->gpr[1];1374long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];1375return ra != 1 || rb_val >= 0 ? NULL // not a stack bang1376: sp + rb_val; // banged address1377}1378return NULL; // not a stack bang1379#else1380// workaround not needed on !LINUX :-)1381ShouldNotCallThis();1382return NULL;1383#endif1384}13851386void MacroAssembler::reserved_stack_check(Register return_pc) {1387// Test if reserved zone needs to be enabled.1388Label no_reserved_zone_enabling;13891390ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);1391cmpld(CCR0, R1_SP, R0);1392blt_predict_taken(CCR0, no_reserved_zone_enabling);13931394// Enable reserved zone again, throw stack overflow exception.1395push_frame_reg_args(0, R0);1396call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);1397pop_frame();1398mtlr(return_pc);1399load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());1400mtctr(R0);1401bctr();14021403should_not_reach_here();14041405bind(no_reserved_zone_enabling);1406}14071408void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,1409bool cmpxchgx_hint) {1410Label retry;1411bind(retry);1412ldarx(dest_current_value, addr_base, cmpxchgx_hint);1413stdcx_(exchange_value, addr_base);1414if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1415bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.1416} else {1417bne( CCR0, retry); // StXcx_ sets CCR0.1418}1419}14201421void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,1422Register tmp, bool cmpxchgx_hint) {1423Label retry;1424bind(retry);1425ldarx(dest_current_value, addr_base, cmpxchgx_hint);1426add(tmp, dest_current_value, inc_value);1427stdcx_(tmp, addr_base);1428if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1429bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.1430} else {1431bne( CCR0, retry); // StXcx_ sets CCR0.1432}1433}14341435// Word/sub-word atomic helper functions14361437// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.1438// Only signed types are supported with size < 4.1439// Atomic add always kills tmp1.1440void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,1441Register addr_base, Register tmp1, Register tmp2, Register tmp3,1442bool cmpxchgx_hint, bool is_add, int size) {1443// Sub-word instructions are available since Power 8.1444// For older processors, instruction_type != size holds, and we1445// emulate the sub-word instructions by constructing a 4-byte value1446// that leaves the other bytes unchanged.1447const int instruction_type = VM_Version::has_lqarx() ? 
size : 4;14481449Label retry;1450Register shift_amount = noreg,1451val32 = dest_current_value,1452modval = is_add ? tmp1 : exchange_value;14531454if (instruction_type != size) {1455assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);1456modval = tmp1;1457shift_amount = tmp2;1458val32 = tmp3;1459// Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.1460#ifdef VM_LITTLE_ENDIAN1461rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;1462clrrdi(addr_base, addr_base, 2);1463#else1464xori(shift_amount, addr_base, (size == 1) ? 3 : 2);1465clrrdi(addr_base, addr_base, 2);1466rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;1467#endif1468}14691470// atomic emulation loop1471bind(retry);14721473switch (instruction_type) {1474case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;1475case 2: lharx(val32, addr_base, cmpxchgx_hint); break;1476case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;1477default: ShouldNotReachHere();1478}14791480if (instruction_type != size) {1481srw(dest_current_value, val32, shift_amount);1482}14831484if (is_add) { add(modval, dest_current_value, exchange_value); }14851486if (instruction_type != size) {1487// Transform exchange value such that the replacement can be done by one xor instruction.1488xorr(modval, dest_current_value, is_add ? modval : exchange_value);1489clrldi(modval, modval, (size == 1) ? 56 : 48);1490slw(modval, modval, shift_amount);1491xorr(modval, val32, modval);1492}14931494switch (instruction_type) {1495case 4: stwcx_(modval, addr_base); break;1496case 2: sthcx_(modval, addr_base); break;1497case 1: stbcx_(modval, addr_base); break;1498default: ShouldNotReachHere();1499}15001501if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1502bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.1503} else {1504bne( CCR0, retry); // StXcx_ sets CCR0.1505}15061507// l?arx zero-extends, but Java wants byte/short values sign-extended.1508if (size == 1) {1509extsb(dest_current_value, dest_current_value);1510} else if (size == 2) {1511extsh(dest_current_value, dest_current_value);1512};1513}15141515// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.1516// Only signed types are supported with size < 4.1517void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,1518Register compare_value, Register exchange_value,1519Register addr_base, Register tmp1, Register tmp2,1520Label &retry, Label &failed, bool cmpxchgx_hint, int size) {1521// Sub-word instructions are available since Power 8.1522// For older processors, instruction_type != size holds, and we1523// emulate the sub-word instructions by constructing a 4-byte value1524// that leaves the other bytes unchanged.1525const int instruction_type = VM_Version::has_lqarx() ? size : 4;15261527Register shift_amount = noreg,1528val32 = dest_current_value,1529modval = exchange_value;15301531if (instruction_type != size) {1532assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);1533shift_amount = tmp1;1534val32 = tmp2;1535modval = tmp2;1536// Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.1537#ifdef VM_LITTLE_ENDIAN1538rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;1539clrrdi(addr_base, addr_base, 2);1540#else1541xori(shift_amount, addr_base, (size == 1) ? 
3 : 2);1542clrrdi(addr_base, addr_base, 2);1543rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;1544#endif1545// Transform exchange value such that the replacement can be done by one xor instruction.1546xorr(exchange_value, compare_value, exchange_value);1547clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);1548slw(exchange_value, exchange_value, shift_amount);1549}15501551// atomic emulation loop1552bind(retry);15531554switch (instruction_type) {1555case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;1556case 2: lharx(val32, addr_base, cmpxchgx_hint); break;1557case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;1558default: ShouldNotReachHere();1559}15601561if (instruction_type != size) {1562srw(dest_current_value, val32, shift_amount);1563}1564if (size == 1) {1565extsb(dest_current_value, dest_current_value);1566} else if (size == 2) {1567extsh(dest_current_value, dest_current_value);1568};15691570cmpw(flag, dest_current_value, compare_value);1571if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1572bne_predict_not_taken(flag, failed);1573} else {1574bne( flag, failed);1575}1576// branch to done => (flag == ne), (dest_current_value != compare_value)1577// fall through => (flag == eq), (dest_current_value == compare_value)15781579if (instruction_type != size) {1580xorr(modval, val32, exchange_value);1581}15821583switch (instruction_type) {1584case 4: stwcx_(modval, addr_base); break;1585case 2: sthcx_(modval, addr_base); break;1586case 1: stbcx_(modval, addr_base); break;1587default: ShouldNotReachHere();1588}1589}15901591// CmpxchgX sets condition register to cmpX(current, compare).1592void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,1593Register compare_value, Register exchange_value,1594Register addr_base, Register tmp1, Register tmp2,1595int semantics, bool cmpxchgx_hint,1596Register int_flag_success, bool contention_hint, bool weak, int size) {1597Label retry;1598Label failed;1599Label done;16001601// Save one branch if result is returned via register and1602// result register is different from the other ones.1603bool use_result_reg = (int_flag_success != noreg);1604bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&1605int_flag_success != exchange_value && int_flag_success != addr_base &&1606int_flag_success != tmp1 && int_flag_success != tmp2);1607assert(!weak || flag == CCR0, "weak only supported with CCR0");1608assert(size == 1 || size == 2 || size == 4, "unsupported");16091610if (use_result_reg && preset_result_reg) {1611li(int_flag_success, 0); // preset (assume cas failed)1612}16131614// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).1615if (contention_hint) { // Don't try to reserve if cmp fails.1616switch (size) {1617case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;1618case 2: lha(dest_current_value, 0, addr_base); break;1619case 4: lwz(dest_current_value, 0, addr_base); break;1620default: ShouldNotReachHere();1621}1622cmpw(flag, dest_current_value, compare_value);1623bne(flag, failed);1624}16251626// release/fence semantics1627if (semantics & MemBarRel) {1628release();1629}16301631cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,1632retry, failed, cmpxchgx_hint, size);1633if (!weak || use_result_reg) {1634if (UseStaticBranchPredictionInCompareAndSwapPPC64) 
{
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag        = cmp(compare_value, *addr_base)
// Register dest_current_value   = *addr_base
// Register compare_value        Used to compare with value in memory
// Register exchange_value       Written to memory if compare_value == *addr_base
// Register addr_base            The memory location to compareXChange
// Register int_flag_success     Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid generating unnecessary code.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg = (int_flag_success!=noreg);
  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne( flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ?
failed : retry); // stXcx_ sets CCR01732} else {1733bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR01734}1735}17361737// result in register (must do this at the end because int_flag_success can be the same register as one above)1738if (use_result_reg) {1739li(int_flag_success, 1);1740}17411742if (semantics & MemBarFenceAfter) {1743fence();1744} else if (semantics & MemBarAcq) {1745isync();1746}17471748if (use_result_reg && !preset_result_reg) {1749b(done);1750}17511752bind(failed_int);1753if (use_result_reg && !preset_result_reg) {1754li(int_flag_success, 0);1755}17561757bind(done);1758// (flag == ne) => (dest_current_value != compare_value), (!swapped)1759// (flag == eq) => (dest_current_value == compare_value), ( swapped)1760}17611762// Look up the method for a megamorphic invokeinterface call.1763// The target method is determined by <intf_klass, itable_index>.1764// The receiver klass is in recv_klass.1765// On success, the result will be in method_result, and execution falls through.1766// On failure, execution transfers to the given label.1767void MacroAssembler::lookup_interface_method(Register recv_klass,1768Register intf_klass,1769RegisterOrConstant itable_index,1770Register method_result,1771Register scan_temp,1772Register temp2,1773Label& L_no_such_interface,1774bool return_method) {1775assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);17761777// Compute start of first itableOffsetEntry (which is at the end of the vtable).1778int vtable_base = in_bytes(Klass::vtable_start_offset());1779int itentry_off = itableMethodEntry::method_offset_in_bytes();1780int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);1781int scan_step = itableOffsetEntry::size() * wordSize;1782int log_vte_size= exact_log2(vtableEntry::size_in_bytes());17831784lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);1785// %%% We should store the aligned, prescaled offset in the klassoop.1786// Then the next several instructions would fold away.17871788sldi(scan_temp, scan_temp, log_vte_size);1789addi(scan_temp, scan_temp, vtable_base);1790add(scan_temp, recv_klass, scan_temp);17911792// Adjust recv_klass by scaled itable_index, so we can free itable_index.1793if (return_method) {1794if (itable_index.is_register()) {1795Register itable_offset = itable_index.as_register();1796sldi(method_result, itable_offset, logMEsize);1797if (itentry_off) { addi(method_result, method_result, itentry_off); }1798add(method_result, method_result, recv_klass);1799} else {1800long itable_offset = (long)itable_index.as_constant();1801// static address, no relocation1802add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);1803}1804}18051806// for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {1807// if (scan->interface() == intf) {1808// result = (klass + scan->offset() + itable_index);1809// }1810// }1811Label search, found_method;18121813for (int peel = 1; peel >= 0; peel--) {1814// %%%% Could load both offset and interface in one ldx, if they were1815// in the opposite order. This would save a load.1816ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);18171818// Check that this entry is non-null. 
A null entry means that1819// the receiver class doesn't implement the interface, and wasn't the1820// same as when the caller was compiled.1821cmpd(CCR0, temp2, intf_klass);18221823if (peel) {1824beq(CCR0, found_method);1825} else {1826bne(CCR0, search);1827// (invert the test to fall through to found_method...)1828}18291830if (!peel) break;18311832bind(search);18331834cmpdi(CCR0, temp2, 0);1835beq(CCR0, L_no_such_interface);1836addi(scan_temp, scan_temp, scan_step);1837}18381839bind(found_method);18401841// Got a hit.1842if (return_method) {1843int ito_offset = itableOffsetEntry::offset_offset_in_bytes();1844lwz(scan_temp, ito_offset, scan_temp);1845ldx(method_result, scan_temp, method_result);1846}1847}18481849// virtual method calling1850void MacroAssembler::lookup_virtual_method(Register recv_klass,1851RegisterOrConstant vtable_index,1852Register method_result) {18531854assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());18551856const int base = in_bytes(Klass::vtable_start_offset());1857assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");18581859if (vtable_index.is_register()) {1860sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);1861add(recv_klass, vtable_index.as_register(), recv_klass);1862} else {1863addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);1864}1865ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);1866}18671868/////////////////////////////////////////// subtype checking ////////////////////////////////////////////1869void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,1870Register super_klass,1871Register temp1_reg,1872Register temp2_reg,1873Label* L_success,1874Label* L_failure,1875Label* L_slow_path,1876RegisterOrConstant super_check_offset) {18771878const Register check_cache_offset = temp1_reg;1879const Register cached_super = temp2_reg;18801881assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);18821883int sco_offset = in_bytes(Klass::super_check_offset_offset());1884int sc_offset = in_bytes(Klass::secondary_super_cache_offset());18851886bool must_load_sco = (super_check_offset.constant_or_zero() == -1);1887bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);18881889Label L_fallthrough;1890int label_nulls = 0;1891if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }1892if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }1893if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }1894assert(label_nulls <= 1 ||1895(L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),1896"at most one NULL in the batch, usually");18971898// If the pointers are equal, we are done (e.g., String[] elements).1899// This self-check enables sharing of secondary supertype arrays among1900// non-primary types such as array-of-interface. 
Otherwise, each such1901// type would need its own customized SSA.1902// We move this check to the front of the fast path because many1903// type checks are in fact trivially successful in this manner,1904// so we get a nicely predicted branch right at the start of the check.1905cmpd(CCR0, sub_klass, super_klass);1906beq(CCR0, *L_success);19071908// Check the supertype display:1909if (must_load_sco) {1910// The super check offset is always positive...1911lwz(check_cache_offset, sco_offset, super_klass);1912super_check_offset = RegisterOrConstant(check_cache_offset);1913// super_check_offset is register.1914assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());1915}1916// The loaded value is the offset from KlassOopDesc.19171918ld(cached_super, super_check_offset, sub_klass);1919cmpd(CCR0, cached_super, super_klass);19201921// This check has worked decisively for primary supers.1922// Secondary supers are sought in the super_cache ('super_cache_addr').1923// (Secondary supers are interfaces and very deeply nested subtypes.)1924// This works in the same check above because of a tricky aliasing1925// between the super_cache and the primary super display elements.1926// (The 'super_check_addr' can address either, as the case requires.)1927// Note that the cache is updated below if it does not help us find1928// what we need immediately.1929// So if it was a primary super, we can just fail immediately.1930// Otherwise, it's the slow path for us (no success at this point).19311932#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }19331934if (super_check_offset.is_register()) {1935beq(CCR0, *L_success);1936cmpwi(CCR0, super_check_offset.as_register(), sc_offset);1937if (L_failure == &L_fallthrough) {1938beq(CCR0, *L_slow_path);1939} else {1940bne(CCR0, *L_failure);1941FINAL_JUMP(*L_slow_path);1942}1943} else {1944if (super_check_offset.as_constant() == sc_offset) {1945// Need a slow path; fast failure is impossible.1946if (L_slow_path == &L_fallthrough) {1947beq(CCR0, *L_success);1948} else {1949bne(CCR0, *L_slow_path);1950FINAL_JUMP(*L_success);1951}1952} else {1953// No slow path; it's a fast decision.1954if (L_failure == &L_fallthrough) {1955beq(CCR0, *L_success);1956} else {1957bne(CCR0, *L_failure);1958FINAL_JUMP(*L_success);1959}1960}1961}19621963bind(L_fallthrough);1964#undef FINAL_JUMP1965}19661967void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,1968Register super_klass,1969Register temp1_reg,1970Register temp2_reg,1971Label* L_success,1972Register result_reg) {1973const Register array_ptr = temp1_reg; // current value from cache array1974const Register temp = temp2_reg;19751976assert_different_registers(sub_klass, super_klass, array_ptr, temp);19771978int source_offset = in_bytes(Klass::secondary_supers_offset());1979int target_offset = in_bytes(Klass::secondary_super_cache_offset());19801981int length_offset = Array<Klass*>::length_offset_in_bytes();1982int base_offset = Array<Klass*>::base_offset_in_bytes();19831984Label hit, loop, failure, fallthru;19851986ld(array_ptr, source_offset, sub_klass);19871988// TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");1989lwz(temp, length_offset, array_ptr);1990cmpwi(CCR0, temp, 0);1991beq(CCR0, result_reg!=noreg ? 
failure : fallthru); // length 019921993mtctr(temp); // load ctr19941995bind(loop);1996// Oops in table are NO MORE compressed.1997ld(temp, base_offset, array_ptr);1998cmpd(CCR0, temp, super_klass);1999beq(CCR0, hit);2000addi(array_ptr, array_ptr, BytesPerWord);2001bdnz(loop);20022003bind(failure);2004if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)2005b(fallthru);20062007bind(hit);2008std(super_klass, target_offset, sub_klass); // save result to cache2009if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)2010if (L_success != NULL) { b(*L_success); }2011else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided20122013bind(fallthru);2014}20152016// Try fast path, then go to slow one if not successful2017void MacroAssembler::check_klass_subtype(Register sub_klass,2018Register super_klass,2019Register temp1_reg,2020Register temp2_reg,2021Label& L_success) {2022Label L_failure;2023check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);2024check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);2025bind(L_failure); // Fallthru if not successful.2026}20272028void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {2029assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");20302031Label L_fallthrough;2032if (L_fast_path == NULL) {2033L_fast_path = &L_fallthrough;2034} else if (L_slow_path == NULL) {2035L_slow_path = &L_fallthrough;2036}20372038// Fast path check: class is fully initialized2039lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);2040cmpwi(CCR0, R0, InstanceKlass::fully_initialized);2041beq(CCR0, *L_fast_path);20422043// Fast path check: current thread is initializer thread2044ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);2045cmpd(CCR0, thread, R0);2046if (L_slow_path == &L_fallthrough) {2047beq(CCR0, *L_fast_path);2048} else if (L_fast_path == &L_fallthrough) {2049bne(CCR0, *L_slow_path);2050} else {2051Unimplemented();2052}20532054bind(L_fallthrough);2055}20562057RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,2058Register temp_reg,2059int extra_slot_offset) {2060// cf. TemplateTable::prepare_invoke(), if (load_receiver).2061int stackElementSize = Interpreter::stackElementSize;2062int offset = extra_slot_offset * stackElementSize;2063if (arg_slot.is_constant()) {2064offset += arg_slot.as_constant() * stackElementSize;2065return offset;2066} else {2067assert(temp_reg != noreg, "must specify");2068sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));2069if (offset != 0)2070addi(temp_reg, temp_reg, offset);2071return temp_reg;2072}2073}20742075// Supports temp2_reg = R0.2076void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,2077Register mark_reg, Register temp_reg,2078Register temp2_reg, Label& done, Label* slow_case) {2079assert(UseBiasedLocking, "why call this otherwise?");20802081#ifdef ASSERT2082assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);2083#endif20842085Label cas_label;20862087// Branch to done if fast path fails and no slow_case provided.2088Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done;20892090// Biased locking2091// See whether the lock is currently biased toward our thread and2092// whether the epoch is still valid2093// Note that the runtime guarantees sufficient alignment of JavaThread2094// pointers to allow age to be placed into low bits2095assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,2096"biased locking makes assumptions about bit layout");20972098if (PrintBiasedLockingStatistics) {2099load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);2100lwzx(temp_reg, temp2_reg);2101addi(temp_reg, temp_reg, 1);2102stwx(temp_reg, temp2_reg);2103}21042105andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);2106cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);2107bne(cr_reg, cas_label);21082109load_klass(temp_reg, obj_reg);21102111load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));2112ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2113orr(temp_reg, R16_thread, temp_reg);2114xorr(temp_reg, mark_reg, temp_reg);2115andr(temp_reg, temp_reg, temp2_reg);2116cmpdi(cr_reg, temp_reg, 0);2117if (PrintBiasedLockingStatistics) {2118Label l;2119bne(cr_reg, l);2120load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());2121lwzx(mark_reg, temp2_reg);2122addi(mark_reg, mark_reg, 1);2123stwx(mark_reg, temp2_reg);2124// restore mark_reg2125ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);2126bind(l);2127}2128beq(cr_reg, done);21292130Label try_revoke_bias;2131Label try_rebias;21322133// At this point we know that the header has the bias pattern and2134// that we are not the bias owner in the current epoch. We need to2135// figure out more details about the state of the header in order to2136// know what operations can be legally performed on the object's2137// header.21382139// If the low three bits in the xor result aren't clear, that means2140// the prototype header is no longer biased and we have to revoke2141// the bias on this object.2142andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);2143cmpwi(cr_reg, temp2_reg, 0);2144bne(cr_reg, try_revoke_bias);21452146// Biasing is still enabled for this data type. See whether the2147// epoch of the current bias is still valid, meaning that the epoch2148// bits of the mark word are equal to the epoch bits of the2149// prototype header. (Note that the prototype header's epoch bits2150// only change at a safepoint.) If not, attempt to rebias the object2151// toward the current thread. Note that we must be absolutely sure2152// that the current epoch is invalid in order to do this because2153// otherwise the manipulations it performs on the mark word are2154// illegal.21552156int shift_amount = 64 - markWord::epoch_shift;2157// rotate epoch bits to right (little) end and set other bits to 02158// [ big part | epoch | little part ] -> [ 0..0 | epoch ]2159rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);2160// branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented2161bne(CCR0, try_rebias);21622163// The epoch of the current bias is still valid but we know nothing2164// about the owner; it might be set or it might be clear. Try to2165// acquire the bias of the object using an atomic operation. 
If this2166// fails we will go in to the runtime to revoke the object's bias.2167// Note that we first construct the presumed unbiased header so we2168// don't accidentally blow away another thread's valid bias.2169andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |2170markWord::age_mask_in_place |2171markWord::epoch_mask_in_place));2172orr(temp_reg, R16_thread, mark_reg);21732174assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");21752176// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).2177cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2178/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2179/*where=*/obj_reg,2180MacroAssembler::MemBarAcq,2181MacroAssembler::cmpxchgx_hint_acquire_lock(),2182noreg, slow_case_int); // bail out if failed21832184// If the biasing toward our thread failed, this means that2185// another thread succeeded in biasing it toward itself and we2186// need to revoke that bias. The revocation will occur in the2187// interpreter runtime in the slow case.2188if (PrintBiasedLockingStatistics) {2189load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);2190lwzx(temp_reg, temp2_reg);2191addi(temp_reg, temp_reg, 1);2192stwx(temp_reg, temp2_reg);2193}2194b(done);21952196bind(try_rebias);2197// At this point we know the epoch has expired, meaning that the2198// current "bias owner", if any, is actually invalid. Under these2199// circumstances _only_, we are allowed to use the current header's2200// value as the comparison value when doing the cas to acquire the2201// bias in the current epoch. In other words, we allow transfer of2202// the bias from one thread to another directly in this situation.2203load_klass(temp_reg, obj_reg);2204andi(temp2_reg, mark_reg, markWord::age_mask_in_place);2205orr(temp2_reg, R16_thread, temp2_reg);2206ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2207orr(temp_reg, temp2_reg, temp_reg);22082209assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");22102211cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2212/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2213/*where=*/obj_reg,2214MacroAssembler::MemBarAcq,2215MacroAssembler::cmpxchgx_hint_acquire_lock(),2216noreg, slow_case_int); // bail out if failed22172218// If the biasing toward our thread failed, this means that2219// another thread succeeded in biasing it toward itself and we2220// need to revoke that bias. The revocation will occur in the2221// interpreter runtime in the slow case.2222if (PrintBiasedLockingStatistics) {2223load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);2224lwzx(temp_reg, temp2_reg);2225addi(temp_reg, temp_reg, 1);2226stwx(temp_reg, temp2_reg);2227}2228b(done);22292230bind(try_revoke_bias);2231// The prototype mark in the klass doesn't have the bias bit set any2232// more, indicating that objects of this data type are not supposed2233// to be biased any more. We are going to try to reset the mark of2234// this object to the prototype value and fall through to the2235// CAS-based locking scheme. 
Note that if our CAS fails, it means2236// that another thread raced us for the privilege of revoking the2237// bias of this particular object, so it's okay to continue in the2238// normal locking code.2239load_klass(temp_reg, obj_reg);2240ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2241andi(temp2_reg, mark_reg, markWord::age_mask_in_place);2242orr(temp_reg, temp_reg, temp2_reg);22432244assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");22452246// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).2247cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2248/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2249/*where=*/obj_reg,2250MacroAssembler::MemBarAcq,2251MacroAssembler::cmpxchgx_hint_acquire_lock());22522253// reload markWord in mark_reg before continuing with lightweight locking2254ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);22552256// Fall through to the normal CAS-based lock, because no matter what2257// the result of the above CAS, some thread must have succeeded in2258// removing the bias bit from the object's header.2259if (PrintBiasedLockingStatistics) {2260Label l;2261bne(cr_reg, l);2262load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);2263lwzx(temp_reg, temp2_reg);2264addi(temp_reg, temp_reg, 1);2265stwx(temp_reg, temp2_reg);2266bind(l);2267}22682269bind(cas_label);2270}22712272void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {2273// Check for biased locking unlock case, which is a no-op2274// Note: we do not have to check the thread ID for two reasons.2275// First, the interpreter checks for IllegalMonitorStateException at2276// a higher level. Second, if the bias was revoked while we held the2277// lock, the object could not be rebiased toward another thread, so2278// the bias bit would be clear.22792280ld(temp_reg, 0, mark_addr);2281andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);22822283cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);2284beq(cr_reg, done);2285}22862287// allocation (for C1)2288void MacroAssembler::eden_allocate(2289Register obj, // result: pointer to object after successful allocation2290Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise2291int con_size_in_bytes, // object size in bytes if known at compile time2292Register t1, // temp register2293Register t2, // temp register2294Label& slow_case // continuation point if fast allocation fails2295) {2296b(slow_case);2297}22982299void MacroAssembler::tlab_allocate(2300Register obj, // result: pointer to object after successful allocation2301Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise2302int con_size_in_bytes, // object size in bytes if known at compile time2303Register t1, // temp register2304Label& slow_case // continuation point if fast allocation fails2305) {2306// make sure arguments make sense2307assert_different_registers(obj, var_size_in_bytes, t1);2308assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");2309assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");23102311const Register new_top = t1;2312//verify_tlab(); not implemented23132314ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);2315ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);2316if (var_size_in_bytes == noreg) {2317addi(new_top, obj, 
con_size_in_bytes);2318} else {2319add(new_top, obj, var_size_in_bytes);2320}2321cmpld(CCR0, new_top, R0);2322bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);23232324#ifdef ASSERT2325// make sure new free pointer is properly aligned2326{2327Label L;2328andi_(R0, new_top, MinObjAlignmentInBytesMask);2329beq(CCR0, L);2330stop("updated TLAB free is not properly aligned");2331bind(L);2332}2333#endif // ASSERT23342335// update the tlab top pointer2336std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);2337//verify_tlab(); not implemented2338}2339void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {2340unimplemented("incr_allocated_bytes");2341}23422343address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,2344int insts_call_instruction_offset, Register Rtoc) {2345// Start the stub.2346address stub = start_a_stub(64);2347if (stub == NULL) { return NULL; } // CodeCache full: bail out23482349// Create a trampoline stub relocation which relates this trampoline stub2350// with the call instruction at insts_call_instruction_offset in the2351// instructions code-section.2352relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));2353const int stub_start_offset = offset();23542355// For java_to_interp stubs we use R11_scratch1 as scratch register2356// and in call trampoline stubs we use R12_scratch2. This way we2357// can distinguish them (see is_NativeCallTrampolineStub_at()).2358Register reg_scratch = R12_scratch2;23592360// Now, create the trampoline stub's code:2361// - load the TOC2362// - load the call target from the constant pool2363// - call2364if (Rtoc == noreg) {2365calculate_address_from_global_toc(reg_scratch, method_toc());2366Rtoc = reg_scratch;2367}23682369ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);2370mtctr(reg_scratch);2371bctr();23722373const address stub_start_addr = addr_at(stub_start_offset);23742375// Assert that the encoded destination_toc_offset can be identified and that it is correct.2376assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),2377"encoded offset into the constant pool must match");2378// Trampoline_stub_size should be good.2379assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");2380assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");23812382// End the stub.2383end_a_stub();2384return stub;2385}23862387// TM on PPC64.2388void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {2389Label retry;2390bind(retry);2391ldarx(result, addr, /*hint*/ false);2392addi(result, result, simm16);2393stdcx_(result, addr);2394if (UseStaticBranchPredictionInCompareAndSwapPPC64) {2395bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR02396} else {2397bne( CCR0, retry); // stXcx_ sets CCR02398}2399}24002401void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {2402Label retry;2403bind(retry);2404lwarx(result, addr, /*hint*/ false);2405ori(result, result, uimm16);2406stwcx_(result, addr);2407if (UseStaticBranchPredictionInCompareAndSwapPPC64) {2408bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR02409} else {2410bne( CCR0, retry); // stXcx_ sets CCR02411}2412}24132414#if INCLUDE_RTM_OPT24152416// Update rtm_counters based on abort status2417// input: abort_status2418// rtm_counters_Reg 
(RTMLockingCounters*)2419void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {2420// Mapping to keep PreciseRTMLockingStatistics similar to x86.2421// x86 ppc (! means inverted, ? means not the same)2422// 0 31 Set if abort caused by XABORT instruction.2423// 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.2424// 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.2425// 3 10 Set if an internal buffer overflowed.2426// 4 ?12 Set if a debug breakpoint was hit.2427// 5 ?32 Set if an abort occurred during execution of a nested transaction.2428const int failure_bit[] = {tm_tabort, // Signal handler will set this too.2429tm_failure_persistent,2430tm_non_trans_cf,2431tm_trans_cf,2432tm_footprint_of,2433tm_failure_code,2434tm_transaction_level};24352436const int num_failure_bits = sizeof(failure_bit) / sizeof(int);2437const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;24382439const int bit2counter_map[][num_counters] =2440// 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic2441// Inverted logic means that if a bit is set don't count it, or vice-versa.2442// Care must be taken when mapping bits to counters as bits for a given2443// counter must be mutually exclusive. Otherwise, the counter will be2444// incremented more than once.2445// counters:2446// 0 1 2 3 4 52447// abort , persist, conflict, overflow, debug , nested bits:2448{{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort2449{ 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent2450{ 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf2451{ 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf2452{ 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of2453{ 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD42454{ 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 12455// ...24562457// Move abort_status value to R0 and use abort_status register as a2458// temporary register because R0 as third operand in ld/std is treated2459// as base address zero (value). Likewise, R0 as second operand in addi2460// is problematic because it amounts to li.2461const Register temp_Reg = abort_status;2462const Register abort_status_R0 = R0;2463mr(abort_status_R0, abort_status);24642465// Increment total abort counter.2466int counters_offs = RTMLockingCounters::abort_count_offset();2467ld(temp_Reg, counters_offs, rtm_counters_Reg);2468addi(temp_Reg, temp_Reg, 1);2469std(temp_Reg, counters_offs, rtm_counters_Reg);24702471// Increment specific abort counters.2472if (PrintPreciseRTMLockingStatistics) {24732474// #0 counter offset.2475int abortX_offs = RTMLockingCounters::abortX_count_offset();24762477for (int nbit = 0; nbit < num_failure_bits; nbit++) {2478for (int ncounter = 0; ncounter < num_counters; ncounter++) {2479if (bit2counter_map[nbit][ncounter] != 0) {2480Label check_abort;2481int abort_counter_offs = abortX_offs + (ncounter << 3);24822483if (failure_bit[nbit] == tm_transaction_level) {2484// Don't check outer transaction, TL = 1 (bit 63). Hence only2485// 11 bits in the TL field are checked to find out if failure2486// occured in a nested transaction. 
This check also matches2487// the case when nesting_of = 1 (nesting overflow).2488rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);2489} else if (failure_bit[nbit] == tm_failure_code) {2490// Check failure code for trap or illegal caught in TM.2491// Bits 0:7 are tested as bit 7 (persistent) is copied from2492// tabort or treclaim source operand.2493// On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).2494rldicl(temp_Reg, abort_status_R0, 8, 56);2495cmpdi(CCR0, temp_Reg, 0xD4);2496} else {2497rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);2498}24992500if (bit2counter_map[nbit][ncounter] == 1) {2501beq(CCR0, check_abort);2502} else {2503bne(CCR0, check_abort);2504}25052506// We don't increment atomically.2507ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);2508addi(temp_Reg, temp_Reg, 1);2509std(temp_Reg, abort_counter_offs, rtm_counters_Reg);25102511bind(check_abort);2512}2513}2514}2515}2516// Restore abort_status.2517mr(abort_status, abort_status_R0);2518}25192520// Branch if (random & (count-1) != 0), count is 2^n2521// tmp and CR0 are killed2522void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {2523mftb(tmp);2524andi_(tmp, tmp, count-1);2525bne(CCR0, brLabel);2526}25272528// Perform abort ratio calculation, set no_rtm bit if high ratio.2529// input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED2530void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,2531RTMLockingCounters* rtm_counters,2532Metadata* method_data) {2533Label L_done, L_check_always_rtm1, L_check_always_rtm2;25342535if (RTMLockingCalculationDelay > 0) {2536// Delay calculation.2537ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());2538cmpdi(CCR0, rtm_counters_Reg, 0);2539beq(CCR0, L_done);2540load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload2541}2542// Abort ratio calculation only if abort_count > RTMAbortThreshold.2543// Aborted transactions = abort_count * 1002544// All transactions = total_count * RTMTotalCountIncrRate2545// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)2546ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);2547if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only.2548cmpdi(CCR0, R0, RTMAbortThreshold);2549blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary2550} else {2551load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);2552cmpd(CCR0, R0, rtm_counters_Reg);2553blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required2554}2555mulli(R0, R0, 100);25562557const Register tmpReg = rtm_counters_Reg;2558ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);2559mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int162560mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int162561cmpd(CCR0, R0, tmpReg);2562blt(CCR0, L_check_always_rtm1); // jump to reload2563if (method_data != NULL) {2564// Set rtm_state to "no rtm" in MDO.2565// Not using a metadata relocation. 
Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
  if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, tmpReg, thresholdValue);
  } else {
    load_const_optimized(R0, thresholdValue);
    cmpd(CCR0, tmpReg, R0);
  }
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation.
// input: abort_status_Reg
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}

// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;

  // Don't retry if failure is persistent.
  // The persistent bit is set when a (A) Disallowed operation is performed in
  // transactional state, like for instance trying to write the TFHAR after a
  // transaction is started; or when there is (B) a Nesting Overflow (too many
  // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
  // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
  // store is performed to a given address in TM state, then once in suspended
  // state the same address is accessed. Failure (A) is very unlikely to occur
  // in the JVM. Failure (D) will never occur because Suspended state is never
  // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
  // Overflow will set the persistent bit.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);

  // Don't retry if transaction was deliberately aborted, i.e.
caused by a2636// tabort instruction.2637rldicr_(R0, abort_status_Reg, tm_tabort, 0);2638bne(CCR0, doneRetry);26392640// Retry if transaction aborted due to a conflict with another thread.2641if (checkRetry) { bind(*checkRetry); }2642addic_(retry_count_Reg, retry_count_Reg, -1);2643blt(CCR0, doneRetry);2644b(retryLabel);2645bind(doneRetry);2646}26472648// Spin and retry if lock is busy.2649// inputs: owner_addr_Reg (monitor address)2650// : retry_count_Reg2651// output: retry_count_Reg decremented by 12652// CTR is killed2653void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {2654Label SpinLoop, doneRetry, doRetry;2655addic_(retry_count_Reg, retry_count_Reg, -1);2656blt(CCR0, doneRetry);26572658if (RTMSpinLoopCount > 1) {2659li(R0, RTMSpinLoopCount);2660mtctr(R0);2661}26622663// low thread priority2664smt_prio_low();2665bind(SpinLoop);26662667if (RTMSpinLoopCount > 1) {2668bdz(doRetry);2669ld(R0, 0, owner_addr_Reg);2670cmpdi(CCR0, R0, 0);2671bne(CCR0, SpinLoop);2672}26732674bind(doRetry);26752676// restore thread priority to default in userspace2677#ifdef LINUX2678smt_prio_medium_low();2679#else2680smt_prio_medium();2681#endif26822683b(retryLabel);26842685bind(doneRetry);2686}26872688// Use RTM for normal stack locks.2689// Input: objReg (object to lock)2690void MacroAssembler::rtm_stack_locking(ConditionRegister flag,2691Register obj, Register mark_word, Register tmp,2692Register retry_on_abort_count_Reg,2693RTMLockingCounters* stack_rtm_counters,2694Metadata* method_data, bool profile_rtm,2695Label& DONE_LABEL, Label& IsInflated) {2696assert(UseRTMForStackLocks, "why call this otherwise?");2697assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");2698Label L_rtm_retry, L_decrement_retry, L_on_abort;26992700if (RTMRetryCount > 0) {2701load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort2702bind(L_rtm_retry);2703}2704andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased2705bne(CCR0, IsInflated);27062707if (PrintPreciseRTMLockingStatistics || profile_rtm) {2708Label L_noincrement;2709if (RTMTotalCountIncrRate > 1) {2710branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);2711}2712assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");2713load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);2714//atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically2715ldx(mark_word, tmp);2716addi(mark_word, mark_word, 1);2717stdx(mark_word, tmp);2718bind(L_noincrement);2719}2720tbegin_();2721beq(CCR0, L_on_abort);2722ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.2723andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits2724cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked2725beq(flag, DONE_LABEL); // all done if unlocked27262727if (UseRTMXendForLockBusy) {2728tend_();2729b(L_decrement_retry);2730} else {2731tabort_();2732}2733bind(L_on_abort);2734const Register abort_status_Reg = tmp;2735mftexasr(abort_status_Reg);2736if (PrintPreciseRTMLockingStatistics || profile_rtm) {2737rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);2738}2739ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload2740if (RTMRetryCount > 0) {2741// Retry on lock abort if abort status is not 
permanent.2742rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);2743} else {2744bind(L_decrement_retry);2745}2746}27472748// Use RTM for inflating locks2749// inputs: obj (object to lock)2750// mark_word (current header - KILLED)2751// boxReg (on-stack box address (displaced header location) - KILLED)2752void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,2753Register obj, Register mark_word, Register boxReg,2754Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,2755RTMLockingCounters* rtm_counters,2756Metadata* method_data, bool profile_rtm,2757Label& DONE_LABEL) {2758assert(UseRTMLocking, "why call this otherwise?");2759Label L_rtm_retry, L_decrement_retry, L_on_abort;2760// Clean monitor_value bit to get valid pointer.2761int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;27622763// Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().2764std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);2765const Register tmpReg = boxReg;2766const Register owner_addr_Reg = mark_word;2767addi(owner_addr_Reg, mark_word, owner_offset);27682769if (RTMRetryCount > 0) {2770load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.2771load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.2772bind(L_rtm_retry);2773}2774if (PrintPreciseRTMLockingStatistics || profile_rtm) {2775Label L_noincrement;2776if (RTMTotalCountIncrRate > 1) {2777branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);2778}2779assert(rtm_counters != NULL, "should not be NULL when profiling RTM");2780load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);2781//atomic_inc_ptr(R0, tmpReg); We don't increment atomically2782ldx(tmpReg, R0);2783addi(tmpReg, tmpReg, 1);2784stdx(tmpReg, R0);2785bind(L_noincrement);2786}2787tbegin_();2788beq(CCR0, L_on_abort);2789// We don't reload mark word. 
Will only be reset at safepoint.2790ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.2791cmpdi(flag, R0, 0);2792beq(flag, DONE_LABEL);27932794if (UseRTMXendForLockBusy) {2795tend_();2796b(L_decrement_retry);2797} else {2798tabort_();2799}2800bind(L_on_abort);2801const Register abort_status_Reg = tmpReg;2802mftexasr(abort_status_Reg);2803if (PrintPreciseRTMLockingStatistics || profile_rtm) {2804rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);2805// Restore owner_addr_Reg2806ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);2807#ifdef ASSERT2808andi_(R0, mark_word, markWord::monitor_value);2809asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.2810#endif2811addi(owner_addr_Reg, mark_word, owner_offset);2812}2813if (RTMRetryCount > 0) {2814// Retry on lock abort if abort status is not permanent.2815rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);2816}28172818// Appears unlocked - try to swing _owner from null to non-null.2819cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,2820MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2821MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);28222823if (RTMRetryCount > 0) {2824// success done else retry2825b(DONE_LABEL);2826bind(L_decrement_retry);2827// Spin and retry if lock is busy.2828rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);2829} else {2830bind(L_decrement_retry);2831}2832}28332834#endif // INCLUDE_RTM_OPT28352836// "The box" is the space on the stack where we copy the object mark.2837void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,2838Register temp, Register displaced_header, Register current_header,2839bool try_bias,2840RTMLockingCounters* rtm_counters,2841RTMLockingCounters* stack_rtm_counters,2842Metadata* method_data,2843bool use_rtm, bool profile_rtm) {2844assert_different_registers(oop, box, temp, displaced_header, current_header);2845assert(flag != CCR0, "bad condition register");2846Label cont;2847Label object_has_monitor;2848Label cas_failed;28492850// Load markWord from object into displaced_header.2851ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);28522853if (DiagnoseSyncOnValueBasedClasses != 0) {2854load_klass(temp, oop);2855lwz(temp, in_bytes(Klass::access_flags_offset()), temp);2856testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));2857bne(flag, cont);2858}28592860if (try_bias) {2861biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);2862}28632864#if INCLUDE_RTM_OPT2865if (UseRTMForStackLocks && use_rtm) {2866rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,2867stack_rtm_counters, method_data, profile_rtm,2868cont, object_has_monitor);2869}2870#endif // INCLUDE_RTM_OPT28712872// Handle existing monitor.2873// The object has an existing monitor iff (mark & monitor_value) != 0.2874andi_(temp, displaced_header, markWord::monitor_value);2875bne(CCR0, object_has_monitor);28762877// Set displaced_header to be (markWord of object | UNLOCK_VALUE).2878ori(displaced_header, displaced_header, markWord::unlocked_value);28792880// Load Compare Value application register.28812882// Initialize the box. 
(Must happen before we update the object mark!)2883std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);28842885// Must fence, otherwise, preceding store(s) may float below cmpxchg.2886// Compare object markWord with mark and if equal exchange scratch1 with object markWord.2887cmpxchgd(/*flag=*/flag,2888/*current_value=*/current_header,2889/*compare_value=*/displaced_header,2890/*exchange_value=*/box,2891/*where=*/oop,2892MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2893MacroAssembler::cmpxchgx_hint_acquire_lock(),2894noreg,2895&cas_failed,2896/*check without membar and ldarx first*/true);2897assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");28982899// If the compare-and-exchange succeeded, then we found an unlocked2900// object and we have now locked it.2901b(cont);29022903bind(cas_failed);2904// We did not see an unlocked object so try the fast recursive case.29052906// Check if the owner is self by comparing the value in the markWord of object2907// (current_header) with the stack pointer.2908sub(current_header, current_header, R1_SP);2909load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);29102911and_(R0/*==0?*/, current_header, temp);2912// If condition is true we are cont and hence we can store 0 as the2913// displaced header in the box, which indicates that it is a recursive lock.2914mcrf(flag,CCR0);2915std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);29162917// Handle existing monitor.2918b(cont);29192920bind(object_has_monitor);2921// The object's monitor m is unlocked iff m->owner == NULL,2922// otherwise m->owner may contain a thread or a stack address.29232924#if INCLUDE_RTM_OPT2925// Use the same RTM locking code in 32- and 64-bit VM.2926if (use_rtm) {2927rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,2928rtm_counters, method_data, profile_rtm, cont);2929} else {2930#endif // INCLUDE_RTM_OPT29312932// Try to CAS m->owner from NULL to current thread.2933addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);2934cmpxchgd(/*flag=*/flag,2935/*current_value=*/current_header,2936/*compare_value=*/(intptr_t)0,2937/*exchange_value=*/R16_thread,2938/*where=*/temp,2939MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2940MacroAssembler::cmpxchgx_hint_acquire_lock());29412942// Store a non-null value into the box.2943std(box, BasicLock::displaced_header_offset_in_bytes(), box);2944beq(flag, cont);29452946// Check for recursive locking.2947cmpd(flag, current_header, R16_thread);2948bne(flag, cont);29492950// Current thread already owns the lock. 
Just increment recursions.
  Register recursions = displaced_header;
  ld(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);
  addi(recursions, recursions, 1);
  std(recursions, ObjectMonitor::recursions_offset_in_bytes()-ObjectMonitor::owner_offset_in_bytes(), temp);

#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont, object_has_monitor, notRecursive;

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
    andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                   // else RegularLock
    tend_();                                                       // otherwise end...
    b(cont);                                                       // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  // The object has an existing monitor iff (mark & monitor_value) != 0.
  RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
  ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
  andi_(R0, current_header, markWord::monitor_value);
  bne(CCR0, object_has_monitor);

  // Check if it is still a lightweight lock; this is true if we see
  // the stack address of the basicLock in the markWord of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  b(cont);

  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
  ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    // Clean monitor_value bit to get valid pointer
    cmpdi(flag, temp, 0);
    bne(flag, L_regular_inflated_unlock);
    tend_();
    b(cont);
    bind(L_regular_inflated_unlock);
  }
#endif

  ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);

  cmpd(flag, temp, R16_thread);
  bne(flag, cont);

  addic_(displaced_header, displaced_header, -1);
  blt(CCR0, notRecursive); // Not recursive if negative after
decrement.3047std(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);3048b(cont); // flag is already EQ here.30493050bind(notRecursive);3051ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);3052ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);3053orr(temp, temp, displaced_header); // Will be 0 if both are 0.3054cmpdi(flag, temp, 0);3055bne(flag, cont);3056release();3057std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);30583059bind(cont);3060// flag == EQ indicates success3061// flag == NE indicates failure3062}30633064void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {3065ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);30663067if (at_return) {3068if (in_nmethod) {3069if (UseSIGTRAP) {3070// Use Signal Handler.3071relocate(relocInfo::poll_return_type);3072td(traptoGreaterThanUnsigned, R1_SP, temp);3073} else {3074cmpld(CCR0, R1_SP, temp);3075// Stub may be out of range for short conditional branch.3076bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);3077}3078} else { // Not in nmethod.3079// Frame still on stack, need to get fp.3080Register fp = R0;3081ld(fp, _abi0(callers_sp), R1_SP);3082cmpld(CCR0, fp, temp);3083bgt(CCR0, slow_path);3084}3085} else { // Normal safepoint poll. Not at return.3086assert(!in_nmethod, "should use load_from_polling_page");3087andi_(temp, temp, SafepointMechanism::poll_bit());3088bne(CCR0, slow_path);3089}3090}30913092void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,3093MacroAssembler::PreservationLevel preservation_level) {3094BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();3095bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);3096}30973098// Values for last_Java_pc, and last_Java_sp must comply to the rules3099// in frame_ppc.hpp.3100void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {3101// Always set last_Java_pc and flags first because once last_Java_sp3102// is visible has_last_Java_frame is true and users will look at the3103// rest of the fields. (Note: flags should always be zero before we3104// get here so doesn't need to be set.)31053106// Verify that last_Java_pc was zeroed on return to Java3107asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,3108"last_Java_pc not zeroed before leaving Java");31093110// When returning from calling out from Java mode the frame anchor's3111// last_Java_pc will always be set to NULL. 
It is set here so that3112// if we are doing a call to native (not VM) that we capture the3113// known pc and don't have to rely on the native call having a3114// standard frame linkage where we can find the pc.3115if (last_Java_pc != noreg)3116std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);31173118// Set last_Java_sp last.3119std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);3120}31213122void MacroAssembler::reset_last_Java_frame(void) {3123asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),3124R16_thread, "SP was not set, still zero");31253126BLOCK_COMMENT("reset_last_Java_frame {");3127li(R0, 0);31283129// _last_Java_sp = 03130std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);31313132// _last_Java_pc = 03133std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);3134BLOCK_COMMENT("} reset_last_Java_frame");3135}31363137void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {3138assert_different_registers(sp, tmp1);31393140// sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via3141// TOP_IJAVA_FRAME_ABI.3142// FIXME: assert that we really have a TOP_IJAVA_FRAME here!3143address entry = pc();3144load_const_optimized(tmp1, entry);31453146set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);3147}31483149void MacroAssembler::get_vm_result(Register oop_result) {3150// Read:3151// R16_thread3152// R16_thread->in_bytes(JavaThread::vm_result_offset())3153//3154// Updated:3155// oop_result3156// R16_thread->in_bytes(JavaThread::vm_result_offset())31573158verify_thread();31593160ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);3161li(R0, 0);3162std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);31633164verify_oop(oop_result, FILE_AND_LINE);3165}31663167void MacroAssembler::get_vm_result_2(Register metadata_result) {3168// Read:3169// R16_thread3170// R16_thread->in_bytes(JavaThread::vm_result_2_offset())3171//3172// Updated:3173// metadata_result3174// R16_thread->in_bytes(JavaThread::vm_result_2_offset())31753176ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);3177li(R0, 0);3178std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);3179}31803181Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {3182Register current = (src != noreg) ? 
src : dst; // Klass is in dst if no src provided.3183if (CompressedKlassPointers::base() != 0) {3184// Use dst as temp if it is free.3185sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);3186current = dst;3187}3188if (CompressedKlassPointers::shift() != 0) {3189srdi(dst, current, CompressedKlassPointers::shift());3190current = dst;3191}3192return current;3193}31943195void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {3196if (UseCompressedClassPointers) {3197Register compressedKlass = encode_klass_not_null(ck, klass);3198stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);3199} else {3200std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);3201}3202}32033204void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {3205if (UseCompressedClassPointers) {3206if (val == noreg) {3207val = R0;3208li(val, 0);3209}3210stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed3211}3212}32133214int MacroAssembler::instr_size_for_decode_klass_not_null() {3215static int computed_size = -1;32163217// Not yet computed?3218if (computed_size == -1) {32193220if (!UseCompressedClassPointers) {3221computed_size = 0;3222} else {3223// Determine by scratch emit.3224ResourceMark rm;3225int code_size = 8 * BytesPerInstWord;3226CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);3227MacroAssembler* a = new MacroAssembler(&cb);3228a->decode_klass_not_null(R11_scratch1);3229computed_size = a->offset();3230}3231}32323233return computed_size;3234}32353236void MacroAssembler::decode_klass_not_null(Register dst, Register src) {3237assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");3238if (src == noreg) src = dst;3239Register shifted_src = src;3240if (CompressedKlassPointers::shift() != 0 ||3241CompressedKlassPointers::base() == 0 && src != dst) { // Move required.3242shifted_src = dst;3243sldi(shifted_src, src, CompressedKlassPointers::shift());3244}3245if (CompressedKlassPointers::base() != 0) {3246add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);3247}3248}32493250void MacroAssembler::load_klass(Register dst, Register src) {3251if (UseCompressedClassPointers) {3252lwz(dst, oopDesc::klass_offset_in_bytes(), src);3253// Attention: no null check here!3254decode_klass_not_null(dst, dst);3255} else {3256ld(dst, oopDesc::klass_offset_in_bytes(), src);3257}3258}32593260// ((OopHandle)result).resolve();3261void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,3262MacroAssembler::PreservationLevel preservation_level) {3263access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);3264}32653266void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,3267MacroAssembler::PreservationLevel preservation_level) {3268Label resolved;32693270// A null weak handle resolves to null.3271cmpdi(CCR0, result, 0);3272beq(CCR0, resolved);32733274access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,3275preservation_level);3276bind(resolved);3277}32783279void MacroAssembler::load_method_holder(Register holder, Register method) {3280ld(holder, in_bytes(Method::const_offset()), method);3281ld(holder, in_bytes(ConstMethod::constants_offset()), holder);3282ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);3283}32843285// Clear Array3286// For very short arrays. 
tmp == R0 is allowed.3287void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {3288if (cnt_dwords > 0) { li(tmp, 0); }3289for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }3290}32913292// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.3293void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {3294if (cnt_dwords < 8) {3295clear_memory_unrolled(base_ptr, cnt_dwords, tmp);3296return;3297}32983299Label loop;3300const long loopcnt = cnt_dwords >> 1,3301remainder = cnt_dwords & 1;33023303li(tmp, loopcnt);3304mtctr(tmp);3305li(tmp, 0);3306bind(loop);3307std(tmp, 0, base_ptr);3308std(tmp, 8, base_ptr);3309addi(base_ptr, base_ptr, 16);3310bdnz(loop);3311if (remainder) { std(tmp, 0, base_ptr); }3312}33133314// Kills both input registers. tmp == R0 is allowed.3315void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {3316// Procedure for large arrays (uses data cache block zero instruction).3317Label startloop, fast, fastloop, small_rest, restloop, done;3318const int cl_size = VM_Version::L1_data_cache_line_size(),3319cl_dwords = cl_size >> 3,3320cl_dw_addr_bits = exact_log2(cl_dwords),3321dcbz_min = 1, // Min count of dcbz executions, needs to be >0.3322min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;33233324if (const_cnt >= 0) {3325// Constant case.3326if (const_cnt < min_cnt) {3327clear_memory_constlen(base_ptr, const_cnt, tmp);3328return;3329}3330load_const_optimized(cnt_dwords, const_cnt, tmp);3331} else {3332// cnt_dwords already loaded in register. Need to check size.3333cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).3334blt(CCR1, small_rest);3335}3336rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.3337beq(CCR0, fast); // Already 128byte aligned.33383339subfic(tmp, tmp, cl_dwords);3340mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).3341subf(cnt_dwords, tmp, cnt_dwords); // rest.3342li(tmp, 0);33433344bind(startloop); // Clear at the beginning to reach 128byte boundary.3345std(tmp, 0, base_ptr); // Clear 8byte aligned block.3346addi(base_ptr, base_ptr, 8);3347bdnz(startloop);33483349bind(fast); // Clear 128byte blocks.3350srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).3351andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.3352mtctr(tmp); // Load counter.33533354bind(fastloop);3355dcbz(base_ptr); // Clear 128byte aligned block.3356addi(base_ptr, base_ptr, cl_size);3357bdnz(fastloop);33583359bind(small_rest);3360cmpdi(CCR0, cnt_dwords, 0); // size 0?3361beq(CCR0, done); // rest == 03362li(tmp, 0);3363mtctr(cnt_dwords); // Load counter.33643365bind(restloop); // Clear rest.3366std(tmp, 0, base_ptr); // Clear 8byte aligned block.3367addi(base_ptr, base_ptr, 8);3368bdnz(restloop);33693370bind(done);3371}33723373/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////33743375// Helpers for Intrinsic Emitters3376//3377// Revert the byte order of a 32bit value in a register3378// src: 0x445566773379// dst: 0x776655443380// Three steps to obtain the result:3381// 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word3382// into the rightmost byte position. 
Afterwards, everything left of the rightmost byte is cleared.3383// This value initializes dst.3384// 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost3385// byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.3386// This value is mask inserted into dst with a [0..23] mask of 1s.3387// 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.3388// This value is mask inserted into dst with a [8..15] mask of 1s.3389void MacroAssembler::load_reverse_32(Register dst, Register src) {3390assert_different_registers(dst, src);33913392rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.3393rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.3394rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.3395}33963397// Calculate the column addresses of the crc32 lookup table into distinct registers.3398// This loop-invariant calculation is moved out of the loop body, reducing the loop3399// body size from 20 to 16 instructions.3400// Returns the offset that was used to calculate the address of column tc3.3401// Due to register shortage, setting tc3 may overwrite table. With the return offset3402// at hand, the original table address can be easily reconstructed.3403int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {3404assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");34053406// Point to 4 byte folding tables (byte-reversed version for Big Endian)3407// Layout: See StubRoutines::ppc::generate_crc_constants.3408#ifdef VM_LITTLE_ENDIAN3409const int ix0 = 3 * CRC32_TABLE_SIZE;3410const int ix1 = 2 * CRC32_TABLE_SIZE;3411const int ix2 = 1 * CRC32_TABLE_SIZE;3412const int ix3 = 0 * CRC32_TABLE_SIZE;3413#else3414const int ix0 = 1 * CRC32_TABLE_SIZE;3415const int ix1 = 2 * CRC32_TABLE_SIZE;3416const int ix2 = 3 * CRC32_TABLE_SIZE;3417const int ix3 = 4 * CRC32_TABLE_SIZE;3418#endif3419assert_different_registers(table, tc0, tc1, tc2);3420assert(table == tc3, "must be!");34213422addi(tc0, table, ix0);3423addi(tc1, table, ix1);3424addi(tc2, table, ix2);3425if (ix3 != 0) addi(tc3, table, ix3);34263427return ix3;3428}34293430/**3431* uint32_t crc;3432* table[crc & 0xFF] ^ (crc >> 8);3433*/3434void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {3435assert_different_registers(crc, table, tmp);3436assert_different_registers(val, table);34373438if (crc == val) { // Must rotate first to use the unmodified value.3439rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.3440// As we use a word (4-byte) instruction, we have to adapt the mask bit positions.3441srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.3442} else {3443srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.3444rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.3445}3446lwzx(tmp, table, tmp);3447xorr(crc, crc, tmp);3448}34493450/**3451* Emits code to update CRC-32 with a byte value according to constants in table.3452*3453* @param [in,out]crc Register containing the crc.3454* @param [in]val Register containing the byte to fold into the CRC.3455* @param [in]table Register containing the table of crc constants.3456*3457* uint32_t crc;3458* 
val = crc_table[(val ^ crc) & 0xFF];3459* crc = val ^ (crc >> 8);3460*/3461void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {3462BLOCK_COMMENT("update_byte_crc32:");3463xorr(val, val, crc);3464fold_byte_crc32(crc, val, table, val);3465}34663467/**3468* @param crc register containing existing CRC (32-bit)3469* @param buf register pointing to input byte buffer (byte*)3470* @param len register containing number of bytes3471* @param table register pointing to CRC table3472*/3473void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,3474Register data, bool loopAlignment) {3475assert_different_registers(crc, buf, len, table, data);34763477Label L_mainLoop, L_done;3478const int mainLoop_stepping = 1;3479const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;34803481// Process all bytes in a single-byte loop.3482clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?3483beq(CCR0, L_done);34843485mtctr(len);3486align(mainLoop_alignment);3487BIND(L_mainLoop);3488lbz(data, 0, buf); // Byte from buffer, zero-extended.3489addi(buf, buf, mainLoop_stepping); // Advance buffer position.3490update_byte_crc32(crc, data, table);3491bdnz(L_mainLoop); // Iterate.34923493bind(L_done);3494}34953496/**3497* Emits code to update CRC-32 with a 4-byte value according to constants in table3498* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c3499*/3500// A note on the lookup table address(es):3501// The implementation uses 4 table columns (byte-reversed versions for Big Endian).3502// To save the effort of adding the column offset to the table address each time3503// a table element is looked up, it is possible to pass the pre-calculated3504// column addresses.3505// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.3506void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,3507Register t0, Register t1, Register t2, Register t3,3508Register tc0, Register tc1, Register tc2, Register tc3) {3509assert_different_registers(crc, t3);35103511// XOR crc with next four bytes of buffer.3512lwz(t3, bufDisp, buf);3513if (bufInc != 0) {3514addi(buf, buf, bufInc);3515}3516xorr(t3, t3, crc);35173518// Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.3519rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 23520rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 23521rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 23522rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 235233524// Use the pre-calculated column addresses.3525// Load pre-calculated table values.3526lwzx(t0, tc0, t0);3527lwzx(t1, tc1, t1);3528lwzx(t2, tc2, t2);3529lwzx(t3, tc3, t3);35303531// Calculate new crc from table values.3532xorr(t0, t0, t1);3533xorr(t2, t2, t3);3534xorr(crc, t0, t2); // Now crc contains the final checksum value.3535}35363537/**3538* @param crc register containing existing CRC (32-bit)3539* @param buf register pointing to input byte buffer (byte*)3540* @param len register containing number of bytes3541* @param table register pointing to CRC table3542*3543* uses R9..R12 as work register. 
Must be saved/restored by caller!3544*/3545void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,3546Register t0, Register t1, Register t2, Register t3,3547Register tc0, Register tc1, Register tc2, Register tc3,3548bool invertCRC) {3549assert_different_registers(crc, buf, len, table);35503551Label L_mainLoop, L_tail;3552Register tmp = t0;3553Register data = t0;3554Register tmp2 = t1;3555const int mainLoop_stepping = 4;3556const int tailLoop_stepping = 1;3557const int log_stepping = exact_log2(mainLoop_stepping);3558const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;3559const int complexThreshold = 2*mainLoop_stepping;35603561// Don't test for len <= 0 here. This pathological case should not occur anyway.3562// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles3563// for all well-behaved cases. The situation itself is detected and handled correctly3564// within update_byteLoop_crc32.3565assert(tailLoop_stepping == 1, "check tailLoop_stepping!");35663567BLOCK_COMMENT("kernel_crc32_1word {");35683569if (invertCRC) {3570nand(crc, crc, crc); // 1s complement of crc3571}35723573// Check for short (<mainLoop_stepping) buffer.3574cmpdi(CCR0, len, complexThreshold);3575blt(CCR0, L_tail);35763577// Pre-mainLoop alignment did show a slight (1%) positive effect on performance.3578// We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.3579{3580// Align buf addr to mainLoop_stepping boundary.3581neg(tmp2, buf); // Calculate # preLoop iterations for alignment.3582rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.35833584if (complexThreshold > mainLoop_stepping) {3585sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3586} else {3587sub(tmp, len, tmp2); // Remaining bytes for main loop.3588cmpdi(CCR0, tmp, mainLoop_stepping);3589blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing3590mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3591}3592update_byteLoop_crc32(crc, buf, tmp2, table, data, false);3593}35943595srdi(tmp2, len, log_stepping); // #iterations for mainLoop3596andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop3597mtctr(tmp2);35983599#ifdef VM_LITTLE_ENDIAN3600Register crc_rv = crc;3601#else3602Register crc_rv = tmp; // Load_reverse needs separate registers to work on.3603// Occupies tmp, but frees up crc.3604load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.3605tmp = crc;3606#endif36073608int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);36093610align(mainLoop_alignment); // Octoword-aligned loop address. 
Shows 2% improvement.3611BIND(L_mainLoop);3612update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);3613bdnz(L_mainLoop);36143615#ifndef VM_LITTLE_ENDIAN3616load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.3617tmp = crc_rv; // Tmp uses it's original register again.3618#endif36193620// Restore original table address for tailLoop.3621if (reconstructTableOffset != 0) {3622addi(table, table, -reconstructTableOffset);3623}36243625// Process last few (<complexThreshold) bytes of buffer.3626BIND(L_tail);3627update_byteLoop_crc32(crc, buf, len, table, data, false);36283629if (invertCRC) {3630nand(crc, crc, crc); // 1s complement of crc3631}3632BLOCK_COMMENT("} kernel_crc32_1word");3633}36343635/**3636* @param crc register containing existing CRC (32-bit)3637* @param buf register pointing to input byte buffer (byte*)3638* @param len register containing number of bytes3639* @param constants register pointing to precomputed constants3640* @param t0-t6 temp registers3641*/3642void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,3643Register t0, Register t1, Register t2, Register t3,3644Register t4, Register t5, Register t6, bool invertCRC) {3645assert_different_registers(crc, buf, len, constants);36463647Label L_tail;36483649BLOCK_COMMENT("kernel_crc32_vpmsum {");36503651if (invertCRC) {3652nand(crc, crc, crc); // 1s complement of crc3653}36543655// Enforce 32 bit.3656clrldi(len, len, 32);36573658// Align if we have enough bytes for the fast version.3659const int alignment = 16,3660threshold = 32;3661Register prealign = t0;36623663neg(prealign, buf);3664addi(t1, len, -threshold);3665andi(prealign, prealign, alignment - 1);3666cmpw(CCR0, t1, prealign);3667blt(CCR0, L_tail); // len - prealign < threshold?36683669subf(len, prealign, len);3670update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);36713672// Calculate from first aligned address as far as possible.3673addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.3674kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);3675addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.36763677// Remaining bytes.3678BIND(L_tail);3679update_byteLoop_crc32(crc, buf, len, constants, t2, false);36803681if (invertCRC) {3682nand(crc, crc, crc); // 1s complement of crc3683}36843685BLOCK_COMMENT("} kernel_crc32_vpmsum");3686}36873688/**3689* @param crc register containing existing CRC (32-bit)3690* @param buf register pointing to input byte buffer (byte*)3691* @param len register containing number of bytes (will get updated to remaining bytes)3692* @param constants register pointing to CRC table for 128-bit aligned memory3693* @param t0-t6 temp registers3694*/3695void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,3696Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {36973698// Save non-volatile vector registers (frameless).3699Register offset = t1;3700int offsetInt = 0;3701offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);3702offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);3703offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);3704offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);3705offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);3706offsetInt -= 16; li(offset, 
offsetInt); stvx(VR25, offset, R1_SP);3707#ifndef VM_LITTLE_ENDIAN3708offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);3709#endif3710offsetInt -= 8; std(R14, offsetInt, R1_SP);3711offsetInt -= 8; std(R15, offsetInt, R1_SP);37123713// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor3714// bytes per iteration. The basic scheme is:3715// lvx: load vector (Big Endian needs reversal)3716// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift3717// vxor: xor partial results together to get unroll_factor2 vectors37183719// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.37203721// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.3722const int unroll_factor = CRC32_UNROLL_FACTOR,3723unroll_factor2 = CRC32_UNROLL_FACTOR2;37243725const int outer_consts_size = (unroll_factor2 - 1) * 16,3726inner_consts_size = (unroll_factor / unroll_factor2) * 16;37273728// Support registers.3729Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };3730Register num_bytes = R14,3731loop_count = R15,3732cur_const = crc; // will live in VCRC3733// Constant array for outer loop: unroll_factor2 - 1 registers,3734// Constant array for inner loop: unroll_factor / unroll_factor2 registers.3735VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },3736consts1[] = { VR23, VR24 };3737// Data register arrays: 2 arrays with unroll_factor2 registers.3738VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },3739data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };37403741VectorRegister VCRC = data0[0];3742VectorRegister Vc = VR25;3743VectorRegister swap_bytes = VR26; // Only for Big Endian.37443745// We have at least 1 iteration (ensured by caller).3746Label L_outer_loop, L_inner_loop, L_last;37473748// If supported set DSCR pre-fetch to deepest.3749if (VM_Version::has_mfdscr()) {3750load_const_optimized(t0, VM_Version::_dscr_val | 7);3751mtdscr(t0);3752}37533754mtvrwz(VCRC, crc); // crc lives in VCRC, now37553756for (int i = 1; i < unroll_factor2; ++i) {3757li(offs[i], 16 * i);3758}37593760// Load consts for outer loop3761lvx(consts0[0], constants);3762for (int i = 1; i < unroll_factor2 - 1; ++i) {3763lvx(consts0[i], offs[i], constants);3764}37653766load_const_optimized(num_bytes, 16 * unroll_factor);37673768// Reuse data registers outside of the loop.3769VectorRegister Vtmp = data1[0];3770VectorRegister Vtmp2 = data1[1];3771VectorRegister zeroes = data1[2];37723773vspltisb(Vtmp, 0);3774vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.37753776// Load vector for vpermxor (to xor both 64 bit parts together)3777lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f3778vspltisb(Vc, 4);3779vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f03780xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);3781vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f37823783#ifdef VM_LITTLE_ENDIAN3784#define BE_swap_bytes(x)3785#else3786vspltisb(Vtmp2, 0xf);3787vxor(swap_bytes, Vtmp, Vtmp2);3788#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)3789#endif37903791cmpd(CCR0, len, num_bytes);3792blt(CCR0, L_last);37933794addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop3795load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.37963797// ********** Main loop start **********3798align(32);3799bind(L_outer_loop);38003801// Begin of unrolled first iteration (no xor).3802lvx(data1[0], buf);3803for (int i = 1; 
i < unroll_factor2 / 2; ++i) {3804lvx(data1[i], offs[i], buf);3805}3806vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.3807lvx(consts1[0], cur_const);3808mtctr(loop_count);3809for (int i = 0; i < unroll_factor2 / 2; ++i) {3810BE_swap_bytes(data1[i]);3811if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.3812lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);3813vpmsumw(data0[i], data1[i], consts1[0]);3814}3815addi(buf, buf, 16 * unroll_factor2);3816subf(len, num_bytes, len);3817lvx(consts1[1], offs[1], cur_const);3818addi(cur_const, cur_const, 32);3819// Begin of unrolled second iteration (head).3820for (int i = 0; i < unroll_factor2 / 2; ++i) {3821BE_swap_bytes(data1[i + unroll_factor2 / 2]);3822if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }3823vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);3824}3825for (int i = 0; i < unroll_factor2 / 2; ++i) {3826BE_swap_bytes(data1[i]);3827lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);3828vpmsumw(data1[i], data1[i], consts1[1]);3829}3830addi(buf, buf, 16 * unroll_factor2);38313832// Generate most performance relevant code. Loads + half of the vpmsumw have been generated.3833// Double-iteration allows using the 2 constant registers alternatingly.3834align(32);3835bind(L_inner_loop);3836for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.3837if (j & 1) {3838lvx(consts1[0], cur_const);3839} else {3840lvx(consts1[1], offs[1], cur_const);3841addi(cur_const, cur_const, 32);3842}3843for (int i = 0; i < unroll_factor2; ++i) {3844int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.3845if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }3846BE_swap_bytes(data1[idx]);3847vxor(data0[i], data0[i], data1[i]);3848if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);3849vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);3850}3851addi(buf, buf, 16 * unroll_factor2);3852}3853bdnz(L_inner_loop);38543855addi(cur_const, constants, outer_consts_size); // Reset38563857// Tail of last iteration (no loads).3858for (int i = 0; i < unroll_factor2 / 2; ++i) {3859BE_swap_bytes(data1[i + unroll_factor2 / 2]);3860vxor(data0[i], data0[i], data1[i]);3861vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);3862}3863for (int i = 0; i < unroll_factor2 / 2; ++i) {3864vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.3865vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);3866}38673868// Last data register is ok, other ones need fixup shift.3869for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {3870vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);3871}38723873// Combine to 128 bit result vector VCRC = data0[0].3874for (int i = 1; i < unroll_factor2; i<<=1) {3875for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {3876vxor(data0[j], data0[j], data0[j+i]);3877}3878}3879cmpd(CCR0, len, num_bytes);3880bge(CCR0, L_outer_loop);38813882// Last chance with lower num_bytes.3883bind(L_last);3884srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.3885// Point behind last const for inner loop.3886add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);3887sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.3888clrrdi(num_bytes, len, exact_log2(16 * 2 * 
unroll_factor2));3889subf(cur_const, R0, cur_const); // Point to constant to be used first.38903891addic_(loop_count, loop_count, -1); // One double-iteration peeled off.3892bgt(CCR0, L_outer_loop);3893// ********** Main loop end **********38943895// Restore DSCR pre-fetch value.3896if (VM_Version::has_mfdscr()) {3897load_const_optimized(t0, VM_Version::_dscr_val);3898mtdscr(t0);3899}39003901// ********** Simple loop for remaining 16 byte blocks **********3902{3903Label L_loop, L_done;39043905srdi_(t0, len, 4); // 16 bytes per iteration3906clrldi(len, len, 64-4);3907beq(CCR0, L_done);39083909// Point to const (same as last const for inner loop).3910add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);3911mtctr(t0);3912lvx(Vtmp2, cur_const);39133914align(32);3915bind(L_loop);39163917lvx(Vtmp, buf);3918addi(buf, buf, 16);3919vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.3920BE_swap_bytes(Vtmp);3921vxor(VCRC, VCRC, Vtmp);3922vpmsumw(VCRC, VCRC, Vtmp2);3923bdnz(L_loop);39243925bind(L_done);3926}3927// ********** Simple loop end **********3928#undef BE_swap_bytes39293930// Point to Barrett constants3931add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);39323933vspltisb(zeroes, 0);39343935// Combine to 64 bit result.3936vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.39373938// Reduce to 32 bit CRC: Remainder by multiply-high.3939lvx(Vtmp, cur_const);3940vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.3941vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.3942vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.3943vsldoi(Vtmp, zeroes, Vtmp, 8);3944vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.3945vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.39463947// Move result. len is already updated.3948vsldoi(VCRC, VCRC, zeroes, 8);3949mfvrd(crc, VCRC);39503951// Restore non-volatile Vector registers (frameless).3952offsetInt = 0;3953offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);3954offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);3955offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);3956offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);3957offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);3958offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);3959#ifndef VM_LITTLE_ENDIAN3960offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);3961#endif3962offsetInt -= 8; ld(R14, offsetInt, R1_SP);3963offsetInt -= 8; ld(R15, offsetInt, R1_SP);3964}39653966void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,3967Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {3968load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr()3969: StubRoutines::crc_table_addr() , R0);39703971if (VM_Version::has_vpmsumb()) {3972kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);3973} else {3974kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);3975}3976}39773978void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {3979assert_different_registers(crc, val, table);39803981BLOCK_COMMENT("kernel_crc32_singleByteReg:");3982if (invertCRC) {3983nand(crc, crc, crc); // 1s complement of crc3984}39853986update_byte_crc32(crc, val, table);39873988if (invertCRC) {3989nand(crc, crc, crc); // 1s complement of crc3990}3991}39923993// dest_lo += src1 + src23994// dest_hi += carry1 + carry23995void MacroAssembler::add2_with_carry(Register dest_hi,3996Register dest_lo,3997Register src1, Register src2) {3998li(R0, 0);3999addc(dest_lo, dest_lo, src1);4000adde(dest_hi, dest_hi, R0);4001addc(dest_lo, dest_lo, src2);4002adde(dest_hi, dest_hi, R0);4003}40044005// Multiply 64 bit by 64 bit first loop.4006void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,4007Register x_xstart,4008Register y, Register y_idx,4009Register z,4010Register carry,4011Register product_high, Register product,4012Register idx, Register kdx,4013Register tmp) {4014// jlong carry, x[], y[], z[];4015// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {4016// huge_128 product = y[idx] * x[xstart] + carry;4017// z[kdx] = (jlong)product;4018// carry = (jlong)(product >>> 64);4019// }4020// z[xstart] = carry;40214022Label L_first_loop, L_first_loop_exit;4023Label L_one_x, L_one_y, L_multiply;40244025addic_(xstart, xstart, -1);4026blt(CCR0, L_one_x); // Special case: length of x is 1.40274028// Load next two integers of x.4029sldi(tmp, xstart, LogBytesPerInt);4030ldx(x_xstart, x, tmp);4031#ifdef VM_LITTLE_ENDIAN4032rldicl(x_xstart, x_xstart, 32, 0);4033#endif40344035align(32, 16);4036bind(L_first_loop);40374038cmpdi(CCR0, idx, 1);4039blt(CCR0, L_first_loop_exit);4040addi(idx, idx, -2);4041beq(CCR0, L_one_y);40424043// Load next two integers of y.4044sldi(tmp, idx, LogBytesPerInt);4045ldx(y_idx, y, tmp);4046#ifdef VM_LITTLE_ENDIAN4047rldicl(y_idx, y_idx, 32, 0);4048#endif404940504051bind(L_multiply);4052multiply64(product_high, product, x_xstart, y_idx);40534054li(tmp, 0);4055addc(product, product, carry); // Add carry to result.4056adde(product_high, product_high, tmp); // Add carry of the last addition.4057addi(kdx, kdx, -2);40584059// Store result.4060#ifdef VM_LITTLE_ENDIAN4061rldicl(product, product, 32, 0);4062#endif4063sldi(tmp, kdx, LogBytesPerInt);4064stdx(product, z, tmp);4065mr_if_needed(carry, product_high);4066b(L_first_loop);406740684069bind(L_one_y); // Load one 32 bit portion of y as (0,value).40704071lwz(y_idx, 0, y);4072b(L_multiply);407340744075bind(L_one_x); // Load one 32 bit portion of x as (0,value).40764077lwz(x_xstart, 0, x);4078b(L_first_loop);40794080bind(L_first_loop_exit);4081}40824083// Multiply 64 bit by 64 bit and add 128 bit.4084void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,4085Register z, Register yz_idx,4086Register idx, Register carry,4087Register product_high, Register product,4088Register tmp, int offset) {40894090// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;4091// z[kdx] = (jlong)product;40924093sldi(tmp, idx, LogBytesPerInt);4094if (offset) {4095addi(tmp, tmp, offset);4096}4097ldx(yz_idx, y, tmp);4098#ifdef 
VM_LITTLE_ENDIAN4099rldicl(yz_idx, yz_idx, 32, 0);4100#endif41014102multiply64(product_high, product, x_xstart, yz_idx);4103ldx(yz_idx, z, tmp);4104#ifdef VM_LITTLE_ENDIAN4105rldicl(yz_idx, yz_idx, 32, 0);4106#endif41074108add2_with_carry(product_high, product, carry, yz_idx);41094110sldi(tmp, idx, LogBytesPerInt);4111if (offset) {4112addi(tmp, tmp, offset);4113}4114#ifdef VM_LITTLE_ENDIAN4115rldicl(product, product, 32, 0);4116#endif4117stdx(product, z, tmp);4118}41194120// Multiply 128 bit by 128 bit. Unrolled inner loop.4121void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,4122Register y, Register z,4123Register yz_idx, Register idx, Register carry,4124Register product_high, Register product,4125Register carry2, Register tmp) {41264127// jlong carry, x[], y[], z[];4128// int kdx = ystart+1;4129// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop4130// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;4131// z[kdx+idx+1] = (jlong)product;4132// jlong carry2 = (jlong)(product >>> 64);4133// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;4134// z[kdx+idx] = (jlong)product;4135// carry = (jlong)(product >>> 64);4136// }4137// idx += 2;4138// if (idx > 0) {4139// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;4140// z[kdx+idx] = (jlong)product;4141// carry = (jlong)(product >>> 64);4142// }41434144Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;4145const Register jdx = R0;41464147// Scale the index.4148srdi_(jdx, idx, 2);4149beq(CCR0, L_third_loop_exit);4150mtctr(jdx);41514152align(32, 16);4153bind(L_third_loop);41544155addi(idx, idx, -4);41564157multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);4158mr_if_needed(carry2, product_high);41594160multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);4161mr_if_needed(carry, product_high);4162bdnz(L_third_loop);41634164bind(L_third_loop_exit); // Handle any left-over operand parts.41654166andi_(idx, idx, 0x3);4167beq(CCR0, L_post_third_loop_done);41684169Label L_check_1;41704171addic_(idx, idx, -2);4172blt(CCR0, L_check_1);41734174multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);4175mr_if_needed(carry, product_high);41764177bind(L_check_1);41784179addi(idx, idx, 0x2);4180andi_(idx, idx, 0x1);4181addic_(idx, idx, -1);4182blt(CCR0, L_post_third_loop_done);41834184sldi(tmp, idx, LogBytesPerInt);4185lwzx(yz_idx, y, tmp);4186multiply64(product_high, product, x_xstart, yz_idx);4187lwzx(yz_idx, z, tmp);41884189add2_with_carry(product_high, product, yz_idx, carry);41904191sldi(tmp, idx, LogBytesPerInt);4192stwx(product, z, tmp);4193srdi(product, product, 32);41944195sldi(product_high, product_high, 32);4196orr(product, product, product_high);4197mr_if_needed(carry, product);41984199bind(L_post_third_loop_done);4200} // multiply_128_x_128_loop42014202void MacroAssembler::muladd(Register out, Register in,4203Register offset, Register len, Register k,4204Register tmp1, Register tmp2, Register carry) {42054206// Labels4207Label LOOP, SKIP;42084209// Make sure length is positive.4210cmpdi (CCR0, len, 0);42114212// Prepare variables4213subi (offset, offset, 4);4214li (carry, 0);4215ble (CCR0, SKIP);42164217mtctr (len);4218subi (len, len, 1 );4219sldi (len, len, 2 );42204221// Main loop4222bind(LOOP);4223lwzx (tmp1, len, in );4224lwzx (tmp2, offset, out );4225mulld (tmp1, tmp1, k );4226add (tmp2, carry, tmp2 );4227add (tmp2, tmp1, tmp2 );4228stwx (tmp2, offset, out );4229srdi (carry, tmp2, 
32 );4230subi (offset, offset, 4 );4231subi (len, len, 4 );4232bdnz (LOOP);4233bind(SKIP);4234}42354236void MacroAssembler::multiply_to_len(Register x, Register xlen,4237Register y, Register ylen,4238Register z, Register zlen,4239Register tmp1, Register tmp2,4240Register tmp3, Register tmp4,4241Register tmp5, Register tmp6,4242Register tmp7, Register tmp8,4243Register tmp9, Register tmp10,4244Register tmp11, Register tmp12,4245Register tmp13) {42464247ShortBranchVerifier sbv(this);42484249assert_different_registers(x, xlen, y, ylen, z, zlen,4250tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);4251assert_different_registers(x, xlen, y, ylen, z, zlen,4252tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);4253assert_different_registers(x, xlen, y, ylen, z, zlen,4254tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);42554256const Register idx = tmp1;4257const Register kdx = tmp2;4258const Register xstart = tmp3;42594260const Register y_idx = tmp4;4261const Register carry = tmp5;4262const Register product = tmp6;4263const Register product_high = tmp7;4264const Register x_xstart = tmp8;4265const Register tmp = tmp9;42664267// First Loop.4268//4269// final static long LONG_MASK = 0xffffffffL;4270// int xstart = xlen - 1;4271// int ystart = ylen - 1;4272// long carry = 0;4273// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {4274// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;4275// z[kdx] = (int)product;4276// carry = product >>> 32;4277// }4278// z[xstart] = (int)carry;42794280mr_if_needed(idx, ylen); // idx = ylen4281mr_if_needed(kdx, zlen); // kdx = xlen + ylen4282li(carry, 0); // carry = 042834284Label L_done;42854286addic_(xstart, xlen, -1);4287blt(CCR0, L_done);42884289multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,4290carry, product_high, product, idx, kdx, tmp);42914292Label L_second_loop;42934294cmpdi(CCR0, kdx, 0);4295beq(CCR0, L_second_loop);42964297Label L_carry;42984299addic_(kdx, kdx, -1);4300beq(CCR0, L_carry);43014302// Store lower 32 bits of carry.4303sldi(tmp, kdx, LogBytesPerInt);4304stwx(carry, z, tmp);4305srdi(carry, carry, 32);4306addi(kdx, kdx, -1);430743084309bind(L_carry);43104311// Store upper 32 bits of carry.4312sldi(tmp, kdx, LogBytesPerInt);4313stwx(carry, z, tmp);43144315// Second and third (nested) loops.4316//4317// for (int i = xstart-1; i >= 0; i--) { // Second loop4318// carry = 0;4319// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop4320// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +4321// (z[k] & LONG_MASK) + carry;4322// z[k] = (int)product;4323// carry = product >>> 32;4324// }4325// z[i] = (int)carry;4326// }4327//4328// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx43294330bind(L_second_loop);43314332li(carry, 0); // carry = 0;43334334addic_(xstart, xstart, -1); // i = xstart-1;4335blt(CCR0, L_done);43364337Register zsave = tmp10;43384339mr(zsave, z);434043414342Label L_last_x;43434344sldi(tmp, xstart, LogBytesPerInt);4345add(z, z, tmp); // z = z + k - j4346addi(z, z, 4);4347addic_(xstart, xstart, -1); // i = xstart-1;4348blt(CCR0, L_last_x);43494350sldi(tmp, xstart, LogBytesPerInt);4351ldx(x_xstart, x, tmp);4352#ifdef VM_LITTLE_ENDIAN4353rldicl(x_xstart, x_xstart, 32, 0);4354#endif435543564357Label L_third_loop_prologue;43584359bind(L_third_loop_prologue);43604361Register xsave = tmp11;4362Register xlensave = tmp12;4363Register ylensave = tmp13;43644365mr(xsave, x);4366mr(xlensave, xstart);4367mr(ylensave, ylen);436843694370multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,4371carry, product_high, 
product, x, tmp);43724373mr(z, zsave);4374mr(x, xsave);4375mr(xlen, xlensave); // This is the decrement of the loop counter!4376mr(ylen, ylensave);43774378addi(tmp3, xlen, 1);4379sldi(tmp, tmp3, LogBytesPerInt);4380stwx(carry, z, tmp);4381addic_(tmp3, tmp3, -1);4382blt(CCR0, L_done);43834384srdi(carry, carry, 32);4385sldi(tmp, tmp3, LogBytesPerInt);4386stwx(carry, z, tmp);4387b(L_second_loop);43884389// Next infrequent code is moved outside loops.4390bind(L_last_x);43914392lwz(x_xstart, 0, x);4393b(L_third_loop_prologue);43944395bind(L_done);4396} // multiply_to_len43974398void MacroAssembler::asm_assert(bool check_equal, const char *msg) {4399#ifdef ASSERT4400Label ok;4401if (check_equal) {4402beq(CCR0, ok);4403} else {4404bne(CCR0, ok);4405}4406stop(msg);4407bind(ok);4408#endif4409}44104411void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,4412Register mem_base, const char* msg) {4413#ifdef ASSERT4414switch (size) {4415case 4:4416lwz(R0, mem_offset, mem_base);4417cmpwi(CCR0, R0, 0);4418break;4419case 8:4420ld(R0, mem_offset, mem_base);4421cmpdi(CCR0, R0, 0);4422break;4423default:4424ShouldNotReachHere();4425}4426asm_assert(check_equal, msg);4427#endif // ASSERT4428}44294430void MacroAssembler::verify_thread() {4431if (VerifyThread) {4432unimplemented("'VerifyThread' currently not implemented on PPC");4433}4434}44354436void MacroAssembler::verify_coop(Register coop, const char* msg) {4437if (!VerifyOops) { return; }4438if (UseCompressedOops) { decode_heap_oop(coop); }4439verify_oop(coop, msg);4440if (UseCompressedOops) { encode_heap_oop(coop, coop); }4441}44424443// READ: oop. KILL: R0. Volatile floats perhaps.4444void MacroAssembler::verify_oop(Register oop, const char* msg) {4445if (!VerifyOops) {4446return;4447}44484449address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();4450const Register tmp = R11; // Will be preserved.4451const int nbytes_save = MacroAssembler::num_volatile_regs * 8;44524453BLOCK_COMMENT("verify_oop {");44544455save_volatile_gprs(R1_SP, -nbytes_save); // except R044564457mr_if_needed(R4_ARG2, oop);4458save_LR_CR(tmp); // save in old frame4459push_frame_reg_args(nbytes_save, tmp);4460// load FunctionDescriptor** / entry_address *4461load_const_optimized(tmp, fd, R0);4462// load FunctionDescriptor* / entry_address4463ld(tmp, 0, tmp);4464load_const_optimized(R3_ARG1, (address)msg, R0);4465// Call destination for its side effect.4466call_c(tmp);44674468pop_frame();4469restore_LR_CR(tmp);4470restore_volatile_gprs(R1_SP, -nbytes_save); // except R044714472BLOCK_COMMENT("} verify_oop");4473}44744475void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {4476if (!VerifyOops) {4477return;4478}44794480address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();4481const Register tmp = R11; // Will be preserved.4482const int nbytes_save = MacroAssembler::num_volatile_regs * 8;4483save_volatile_gprs(R1_SP, -nbytes_save); // except R044844485ld(R4_ARG2, offs, base);4486save_LR_CR(tmp); // save in old frame4487push_frame_reg_args(nbytes_save, tmp);4488// load FunctionDescriptor** / entry_address *4489load_const_optimized(tmp, fd, R0);4490// load FunctionDescriptor* / entry_address4491ld(tmp, 0, tmp);4492load_const_optimized(R3_ARG1, (address)msg, R0);4493// Call destination for its side effect.4494call_c(tmp);44954496pop_frame();4497restore_LR_CR(tmp);4498restore_volatile_gprs(R1_SP, -nbytes_save); // except R04499}45004501// Call a 
C-function that prints output.4502void MacroAssembler::stop(int type, const char* msg) {4503bool msg_present = (msg != NULL);45044505#ifndef PRODUCT4506block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));4507#else4508block_comment("stop {");4509#endif45104511if (msg_present) {4512type |= stop_msg_present;4513}4514tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);4515if (msg_present) {4516emit_int64((uintptr_t)msg);4517}45184519block_comment("} stop;");4520}45214522#ifndef PRODUCT4523// Write pattern 0x0101010101010101 in memory region [low-before, high+after].4524// Val, addr are temp registers.4525// If low == addr, addr is killed.4526// High is preserved.4527void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {4528if (!ZapMemory) return;45294530assert_different_registers(low, val);45314532BLOCK_COMMENT("zap memory region {");4533load_const_optimized(val, 0x0101010101010101);4534int size = before + after;4535if (low == high && size < 5 && size > 0) {4536int offset = -before*BytesPerWord;4537for (int i = 0; i < size; ++i) {4538std(val, offset, low);4539offset += (1*BytesPerWord);4540}4541} else {4542addi(addr, low, -before*BytesPerWord);4543assert_different_registers(high, val);4544if (after) addi(high, high, after * BytesPerWord);4545Label loop;4546bind(loop);4547std(val, 0, addr);4548addi(addr, addr, 8);4549cmpd(CCR6, addr, high);4550ble(CCR6, loop);4551if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.4552}4553BLOCK_COMMENT("} zap memory region");4554}45554556#endif // !PRODUCT45574558void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,4559const bool* flag_addr, Label& label) {4560int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);4561assert(sizeof(bool) == 1, "PowerPC ABI");4562masm->lbz(temp, simm16_offset, temp);4563masm->cmpwi(CCR0, temp, 0);4564masm->beq(CCR0, label);4565}45664567SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {4568skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);4569}45704571SkipIfEqualZero::~SkipIfEqualZero() {4572_masm->bind(_label);4573}45744575void MacroAssembler::cache_wb(Address line) {4576assert(line.index() == noreg, "index should be noreg");4577assert(line.disp() == 0, "displacement should be 0");4578assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");4579// Data Cache Store, not really a flush, so it works like a sync of cache4580// line and persistent mem, i.e. copying the cache line to persistent whilst4581// not invalidating the cache line.4582dcbst(line.base());4583}45844585void MacroAssembler::cache_wbsync(bool is_presync) {4586assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");4587// We only need a post sync barrier. Post means _after_ a cache line flush or4588// store instruction, pre means a barrier emitted before such a instructions.4589if (!is_presync) {4590fence();4591}4592}459345944595
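
// Illustrative usage of the two writeback helpers above (a sketch under assumptions,
// not code from this file): a caller flushing one cache line to persistent memory
// would emit a pre-sync (a no-op on PPC), the dcbst-based writeback, and a post-sync
// barrier. `line_base` is a hypothetical register holding the cache-line-aligned address.
//
//   cache_wbsync(true);             // pre-sync: nothing to emit on PPC
//   cache_wb(Address(line_base));   // dcbst: copy the line to persistent memory
//   cache_wbsync(false);            // post-sync: fence() orders the writeback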