src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
/*
 * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2021 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
//  1) compressed oops:
//     lis  rx = const.hi
//     ori  rx = rx | const.lo
//  2) compressed klass:
//     lis  rx = const.hi
//     clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//     ori  rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(), "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                                instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13);
  // although they're technically volatile
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst); offset += 8;
  std(R15, offset, dst); offset += 8;
  std(R16, offset, dst); offset += 8;
  std(R17, offset, dst); offset += 8;
  std(R18, offset, dst); offset += 8;
  std(R19, offset, dst); offset += 8;
  std(R20, offset, dst); offset += 8;
  std(R21, offset, dst); offset += 8;
  std(R22, offset, dst); offset += 8;
  std(R23, offset, dst); offset += 8;
  std(R24, offset, dst); offset += 8;
  std(R25, offset, dst); offset += 8;
  std(R26, offset, dst); offset += 8;
  std(R27, offset, dst); offset += 8;
  std(R28, offset, dst); offset += 8;
  std(R29, offset, dst); offset += 8;
  std(R30, offset, dst); offset += 8;
  std(R31, offset, dst); offset += 8;

  stfd(F14, offset, dst); offset += 8;
  stfd(F15, offset, dst); offset += 8;
  stfd(F16, offset, dst); offset += 8;
  stfd(F17, offset, dst); offset += 8;
  stfd(F18, offset, dst); offset += 8;
  stfd(F19, offset, dst); offset += 8;
  stfd(F20, offset, dst); offset += 8;
  stfd(F21, offset, dst); offset += 8;
  stfd(F22, offset, dst); offset += 8;
  stfd(F23, offset, dst); offset += 8;
  stfd(F24, offset, dst); offset += 8;
  stfd(F25, offset, dst); offset += 8;
  stfd(F26, offset, dst); offset += 8;
  stfd(F27, offset, dst); offset += 8;
  stfd(F28, offset, dst); offset += 8;
  stfd(F29, offset, dst); offset += 8;
  stfd(F30, offset, dst); offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src); offset += 8;
  ld(R15, offset, src); offset += 8;
  ld(R16, offset, src); offset += 8;
  ld(R17, offset, src); offset += 8;
  ld(R18, offset, src); offset += 8;
  ld(R19, offset, src); offset += 8;
  ld(R20, offset, src); offset += 8;
  ld(R21, offset, src); offset += 8;
  ld(R22, offset, src); offset += 8;
  ld(R23, offset, src); offset += 8;
  ld(R24, offset, src); offset += 8;
  ld(R25, offset, src); offset += 8;
  ld(R26, offset, src); offset += 8;
  ld(R27, offset, src); offset += 8;
  ld(R28, offset, src); offset += 8;
  ld(R29, offset, src); offset += 8;
  ld(R30, offset, src); offset += 8;
  ld(R31, offset, src); offset += 8;

  // FP registers
  lfd(F14, offset, src); offset += 8;
  lfd(F15, offset, src); offset += 8;
  lfd(F16, offset, src); offset += 8;
  lfd(F17, offset, src); offset += 8;
  lfd(F18, offset, src); offset += 8;
  lfd(F19, offset, src); offset += 8;
  lfd(F20, offset, src); offset += 8;
  lfd(F21, offset, src); offset += 8;
  lfd(F22, offset, src); offset += 8;
  lfd(F23, offset, src); offset += 8;
  lfd(F24, offset, src); offset += 8;
  lfd(F25, offset, src); offset += 8;
  lfd(F26, offset, src); offset += 8;
  lfd(F27, offset, src); offset += 8;
  lfd(F28, offset, src); offset += 8;
  lfd(F29, offset, src); offset += 8;
  lfd(F30, offset, src); offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2, offset, dst); offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst); offset += 8;
  }
  std(R4, offset, dst); offset += 8;
  std(R5, offset, dst); offset += 8;
  std(R6, offset, dst); offset += 8;
  std(R7, offset, dst); offset += 8;
  std(R8, offset, dst); offset += 8;
  std(R9, offset, dst); offset += 8;
  std(R10, offset, dst); offset += 8;
  std(R11, offset, dst); offset += 8;
  std(R12, offset, dst); offset += 8;

  if (include_fp_regs) {
    stfd(F0, offset, dst); offset += 8;
    stfd(F1, offset, dst); offset += 8;
    stfd(F2, offset, dst); offset += 8;
    stfd(F3, offset, dst); offset += 8;
    stfd(F4, offset, dst); offset += 8;
    stfd(F5, offset, dst); offset += 8;
    stfd(F6, offset, dst); offset += 8;
    stfd(F7, offset, dst); offset += 8;
    stfd(F8, offset, dst); offset += 8;
    stfd(F9, offset, dst); offset += 8;
    stfd(F10, offset, dst); offset += 8;
    stfd(F11, offset, dst); offset += 8;
    stfd(F12, offset, dst); offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2, offset, src); offset += 8;
  if (include_R3_RET_reg) {
    ld(R3, offset, src); offset += 8;
  }
  ld(R4, offset, src); offset += 8;
  ld(R5, offset, src); offset += 8;
  ld(R6, offset, src); offset += 8;
  ld(R7, offset, src); offset += 8;
  ld(R8, offset, src); offset += 8;
  ld(R9, offset, src); offset += 8;
  ld(R10, offset, src); offset += 8;
  ld(R11, offset, src); offset += 8;
  ld(R12, offset, src); offset += 8;

  if (include_fp_regs) {
    lfd(F0, offset, src); offset += 8;
    lfd(F1, offset, src); offset += 8;
    lfd(F2, offset, src); offset += 8;
    lfd(F3, offset, src); offset += 8;
    lfd(F4, offset, src); offset += 8;
    lfd(F5, offset, src); offset += 8;
    lfd(F6, offset, src); offset += 8;
    lfd(F7, offset, src); offset += 8;
    lfd(F8, offset, src); offset += 8;
    lfd(F9, offset, src); offset += 8;
    lfd(F10, offset, src); offset += 8;
    lfd(F11, offset, src); offset += 8;
    lfd(F12, offset, src); offset += 8;
    lfd(F13, offset, src);
  }
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
argument");1276mr_if_needed(R5_ARG3, arg_3);1277call_VM_leaf(entry_point);1278}12791280// Check whether instruction is a read access to the polling page1281// which was emitted by load_from_polling_page(..).1282bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,1283address* polling_address_ptr) {1284if (!is_ld(instruction))1285return false; // It's not a ld. Fail.12861287int rt = inv_rt_field(instruction);1288int ra = inv_ra_field(instruction);1289int ds = inv_ds_field(instruction);1290if (!(ds == 0 && ra != 0 && rt == 0)) {1291return false; // It's not a ld(r0, X, ra). Fail.1292}12931294if (!ucontext) {1295// Set polling address.1296if (polling_address_ptr != NULL) {1297*polling_address_ptr = NULL;1298}1299return true; // No ucontext given. Can't check value of ra. Assume true.1300}13011302#ifdef LINUX1303// Ucontext given. Check that register ra contains the address of1304// the safepoing polling page.1305ucontext_t* uc = (ucontext_t*) ucontext;1306// Set polling address.1307address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;1308if (polling_address_ptr != NULL) {1309*polling_address_ptr = addr;1310}1311return SafepointMechanism::is_poll_address(addr);1312#else1313// Not on Linux, ucontext must be NULL.1314ShouldNotReachHere();1315return false;1316#endif1317}13181319void MacroAssembler::bang_stack_with_offset(int offset) {1320// When increasing the stack, the old stack pointer will be written1321// to the new top of stack according to the PPC64 abi.1322// Therefore, stack banging is not necessary when increasing1323// the stack by <= os::vm_page_size() bytes.1324// When increasing the stack by a larger amount, this method is1325// called repeatedly to bang the intermediate pages.13261327// Stack grows down, caller passes positive offset.1328assert(offset > 0, "must bang with positive offset");13291330long stdoffset = -offset;13311332if (is_simm(stdoffset, 16)) {1333// Signed 16 bit offset, a simple std is ok.1334if (UseLoadInstructionsForStackBangingPPC64) {1335ld(R0, (int)(signed short)stdoffset, R1_SP);1336} else {1337std(R0,(int)(signed short)stdoffset, R1_SP);1338}1339} else if (is_simm(stdoffset, 31)) {1340const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);1341const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);13421343Register tmp = R11;1344addis(tmp, R1_SP, hi);1345if (UseLoadInstructionsForStackBangingPPC64) {1346ld(R0, lo, tmp);1347} else {1348std(R0, lo, tmp);1349}1350} else {1351ShouldNotReachHere();1352}1353}13541355// If instruction is a stack bang of the form1356// std R0, x(Ry), (see bang_stack_with_offset())1357// stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())1358// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())1359// return the banged address. 
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;

  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
{
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag        = cmp(compare_value, *addr_base)
// Register dest_current_value   = *addr_base
// Register compare_value        Used to compare with value in memory
// Register exchange_value       Written to memory if compare_value == *addr_base
// Register addr_base            The memory location to compareXChange
// Register int_flag_success     Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid generating unnecessary information.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne( flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ?
failed : retry); // stXcx_ sets CCR01732} else {1733bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR01734}1735}17361737// result in register (must do this at the end because int_flag_success can be the same register as one above)1738if (use_result_reg) {1739li(int_flag_success, 1);1740}17411742if (semantics & MemBarFenceAfter) {1743fence();1744} else if (semantics & MemBarAcq) {1745isync();1746}17471748if (use_result_reg && !preset_result_reg) {1749b(done);1750}17511752bind(failed_int);1753if (use_result_reg && !preset_result_reg) {1754li(int_flag_success, 0);1755}17561757bind(done);1758// (flag == ne) => (dest_current_value != compare_value), (!swapped)1759// (flag == eq) => (dest_current_value == compare_value), ( swapped)1760}17611762// Look up the method for a megamorphic invokeinterface call.1763// The target method is determined by <intf_klass, itable_index>.1764// The receiver klass is in recv_klass.1765// On success, the result will be in method_result, and execution falls through.1766// On failure, execution transfers to the given label.1767void MacroAssembler::lookup_interface_method(Register recv_klass,1768Register intf_klass,1769RegisterOrConstant itable_index,1770Register method_result,1771Register scan_temp,1772Register temp2,1773Label& L_no_such_interface,1774bool return_method) {1775assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);17761777// Compute start of first itableOffsetEntry (which is at the end of the vtable).1778int vtable_base = in_bytes(Klass::vtable_start_offset());1779int itentry_off = itableMethodEntry::method_offset_in_bytes();1780int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);1781int scan_step = itableOffsetEntry::size() * wordSize;1782int log_vte_size= exact_log2(vtableEntry::size_in_bytes());17831784lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);1785// %%% We should store the aligned, prescaled offset in the klassoop.1786// Then the next several instructions would fold away.17871788sldi(scan_temp, scan_temp, log_vte_size);1789addi(scan_temp, scan_temp, vtable_base);1790add(scan_temp, recv_klass, scan_temp);17911792// Adjust recv_klass by scaled itable_index, so we can free itable_index.1793if (return_method) {1794if (itable_index.is_register()) {1795Register itable_offset = itable_index.as_register();1796sldi(method_result, itable_offset, logMEsize);1797if (itentry_off) { addi(method_result, method_result, itentry_off); }1798add(method_result, method_result, recv_klass);1799} else {1800long itable_offset = (long)itable_index.as_constant();1801// static address, no relocation1802add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);1803}1804}18051806// for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {1807// if (scan->interface() == intf) {1808// result = (klass + scan->offset() + itable_index);1809// }1810// }1811Label search, found_method;18121813for (int peel = 1; peel >= 0; peel--) {1814// %%%% Could load both offset and interface in one ldx, if they were1815// in the opposite order. This would save a load.1816ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);18171818// Check that this entry is non-null. 
A null entry means that1819// the receiver class doesn't implement the interface, and wasn't the1820// same as when the caller was compiled.1821cmpd(CCR0, temp2, intf_klass);18221823if (peel) {1824beq(CCR0, found_method);1825} else {1826bne(CCR0, search);1827// (invert the test to fall through to found_method...)1828}18291830if (!peel) break;18311832bind(search);18331834cmpdi(CCR0, temp2, 0);1835beq(CCR0, L_no_such_interface);1836addi(scan_temp, scan_temp, scan_step);1837}18381839bind(found_method);18401841// Got a hit.1842if (return_method) {1843int ito_offset = itableOffsetEntry::offset_offset_in_bytes();1844lwz(scan_temp, ito_offset, scan_temp);1845ldx(method_result, scan_temp, method_result);1846}1847}18481849// virtual method calling1850void MacroAssembler::lookup_virtual_method(Register recv_klass,1851RegisterOrConstant vtable_index,1852Register method_result) {18531854assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());18551856const int base = in_bytes(Klass::vtable_start_offset());1857assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");18581859if (vtable_index.is_register()) {1860sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);1861add(recv_klass, vtable_index.as_register(), recv_klass);1862} else {1863addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);1864}1865ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);1866}18671868/////////////////////////////////////////// subtype checking ////////////////////////////////////////////1869void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,1870Register super_klass,1871Register temp1_reg,1872Register temp2_reg,1873Label* L_success,1874Label* L_failure,1875Label* L_slow_path,1876RegisterOrConstant super_check_offset) {18771878const Register check_cache_offset = temp1_reg;1879const Register cached_super = temp2_reg;18801881assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);18821883int sco_offset = in_bytes(Klass::super_check_offset_offset());1884int sc_offset = in_bytes(Klass::secondary_super_cache_offset());18851886bool must_load_sco = (super_check_offset.constant_or_zero() == -1);1887bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);18881889Label L_fallthrough;1890int label_nulls = 0;1891if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }1892if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }1893if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }1894assert(label_nulls <= 1 ||1895(L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),1896"at most one NULL in the batch, usually");18971898// If the pointers are equal, we are done (e.g., String[] elements).1899// This self-check enables sharing of secondary supertype arrays among1900// non-primary types such as array-of-interface. 
Otherwise, each such1901// type would need its own customized SSA.1902// We move this check to the front of the fast path because many1903// type checks are in fact trivially successful in this manner,1904// so we get a nicely predicted branch right at the start of the check.1905cmpd(CCR0, sub_klass, super_klass);1906beq(CCR0, *L_success);19071908// Check the supertype display:1909if (must_load_sco) {1910// The super check offset is always positive...1911lwz(check_cache_offset, sco_offset, super_klass);1912super_check_offset = RegisterOrConstant(check_cache_offset);1913// super_check_offset is register.1914assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());1915}1916// The loaded value is the offset from KlassOopDesc.19171918ld(cached_super, super_check_offset, sub_klass);1919cmpd(CCR0, cached_super, super_klass);19201921// This check has worked decisively for primary supers.1922// Secondary supers are sought in the super_cache ('super_cache_addr').1923// (Secondary supers are interfaces and very deeply nested subtypes.)1924// This works in the same check above because of a tricky aliasing1925// between the super_cache and the primary super display elements.1926// (The 'super_check_addr' can address either, as the case requires.)1927// Note that the cache is updated below if it does not help us find1928// what we need immediately.1929// So if it was a primary super, we can just fail immediately.1930// Otherwise, it's the slow path for us (no success at this point).19311932#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }19331934if (super_check_offset.is_register()) {1935beq(CCR0, *L_success);1936cmpwi(CCR0, super_check_offset.as_register(), sc_offset);1937if (L_failure == &L_fallthrough) {1938beq(CCR0, *L_slow_path);1939} else {1940bne(CCR0, *L_failure);1941FINAL_JUMP(*L_slow_path);1942}1943} else {1944if (super_check_offset.as_constant() == sc_offset) {1945// Need a slow path; fast failure is impossible.1946if (L_slow_path == &L_fallthrough) {1947beq(CCR0, *L_success);1948} else {1949bne(CCR0, *L_slow_path);1950FINAL_JUMP(*L_success);1951}1952} else {1953// No slow path; it's a fast decision.1954if (L_failure == &L_fallthrough) {1955beq(CCR0, *L_success);1956} else {1957bne(CCR0, *L_failure);1958FINAL_JUMP(*L_success);1959}1960}1961}19621963bind(L_fallthrough);1964#undef FINAL_JUMP1965}19661967void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,1968Register super_klass,1969Register temp1_reg,1970Register temp2_reg,1971Label* L_success,1972Register result_reg) {1973const Register array_ptr = temp1_reg; // current value from cache array1974const Register temp = temp2_reg;19751976assert_different_registers(sub_klass, super_klass, array_ptr, temp);19771978int source_offset = in_bytes(Klass::secondary_supers_offset());1979int target_offset = in_bytes(Klass::secondary_super_cache_offset());19801981int length_offset = Array<Klass*>::length_offset_in_bytes();1982int base_offset = Array<Klass*>::base_offset_in_bytes();19831984Label hit, loop, failure, fallthru;19851986ld(array_ptr, source_offset, sub_klass);19871988// TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");1989lwz(temp, length_offset, array_ptr);1990cmpwi(CCR0, temp, 0);1991beq(CCR0, result_reg!=noreg ? 
failure : fallthru); // length 019921993mtctr(temp); // load ctr19941995bind(loop);1996// Oops in table are NO MORE compressed.1997ld(temp, base_offset, array_ptr);1998cmpd(CCR0, temp, super_klass);1999beq(CCR0, hit);2000addi(array_ptr, array_ptr, BytesPerWord);2001bdnz(loop);20022003bind(failure);2004if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)2005b(fallthru);20062007bind(hit);2008std(super_klass, target_offset, sub_klass); // save result to cache2009if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)2010if (L_success != NULL) { b(*L_success); }2011else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided20122013bind(fallthru);2014}20152016// Try fast path, then go to slow one if not successful2017void MacroAssembler::check_klass_subtype(Register sub_klass,2018Register super_klass,2019Register temp1_reg,2020Register temp2_reg,2021Label& L_success) {2022Label L_failure;2023check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);2024check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);2025bind(L_failure); // Fallthru if not successful.2026}20272028void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {2029assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");20302031Label L_fallthrough;2032if (L_fast_path == NULL) {2033L_fast_path = &L_fallthrough;2034} else if (L_slow_path == NULL) {2035L_slow_path = &L_fallthrough;2036}20372038// Fast path check: class is fully initialized2039lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);2040cmpwi(CCR0, R0, InstanceKlass::fully_initialized);2041beq(CCR0, *L_fast_path);20422043// Fast path check: current thread is initializer thread2044ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);2045cmpd(CCR0, thread, R0);2046if (L_slow_path == &L_fallthrough) {2047beq(CCR0, *L_fast_path);2048} else if (L_fast_path == &L_fallthrough) {2049bne(CCR0, *L_slow_path);2050} else {2051Unimplemented();2052}20532054bind(L_fallthrough);2055}20562057RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,2058Register temp_reg,2059int extra_slot_offset) {2060// cf. TemplateTable::prepare_invoke(), if (load_receiver).2061int stackElementSize = Interpreter::stackElementSize;2062int offset = extra_slot_offset * stackElementSize;2063if (arg_slot.is_constant()) {2064offset += arg_slot.as_constant() * stackElementSize;2065return offset;2066} else {2067assert(temp_reg != noreg, "must specify");2068sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));2069if (offset != 0)2070addi(temp_reg, temp_reg, offset);2071return temp_reg;2072}2073}20742075// Supports temp2_reg = R0.2076void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,2077Register mark_reg, Register temp_reg,2078Register temp2_reg, Label& done, Label* slow_case) {2079assert(UseBiasedLocking, "why call this otherwise?");20802081#ifdef ASSERT2082assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);2083#endif20842085Label cas_label;20862087// Branch to done if fast path fails and no slow_case provided.2088Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done;20892090// Biased locking2091// See whether the lock is currently biased toward our thread and2092// whether the epoch is still valid2093// Note that the runtime guarantees sufficient alignment of JavaThread2094// pointers to allow age to be placed into low bits2095assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,2096"biased locking makes assumptions about bit layout");20972098if (PrintBiasedLockingStatistics) {2099load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);2100lwzx(temp_reg, temp2_reg);2101addi(temp_reg, temp_reg, 1);2102stwx(temp_reg, temp2_reg);2103}21042105andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);2106cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);2107bne(cr_reg, cas_label);21082109load_klass(temp_reg, obj_reg);21102111load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));2112ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2113orr(temp_reg, R16_thread, temp_reg);2114xorr(temp_reg, mark_reg, temp_reg);2115andr(temp_reg, temp_reg, temp2_reg);2116cmpdi(cr_reg, temp_reg, 0);2117if (PrintBiasedLockingStatistics) {2118Label l;2119bne(cr_reg, l);2120load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());2121lwzx(mark_reg, temp2_reg);2122addi(mark_reg, mark_reg, 1);2123stwx(mark_reg, temp2_reg);2124// restore mark_reg2125ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);2126bind(l);2127}2128beq(cr_reg, done);21292130Label try_revoke_bias;2131Label try_rebias;21322133// At this point we know that the header has the bias pattern and2134// that we are not the bias owner in the current epoch. We need to2135// figure out more details about the state of the header in order to2136// know what operations can be legally performed on the object's2137// header.21382139// If the low three bits in the xor result aren't clear, that means2140// the prototype header is no longer biased and we have to revoke2141// the bias on this object.2142andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);2143cmpwi(cr_reg, temp2_reg, 0);2144bne(cr_reg, try_revoke_bias);21452146// Biasing is still enabled for this data type. See whether the2147// epoch of the current bias is still valid, meaning that the epoch2148// bits of the mark word are equal to the epoch bits of the2149// prototype header. (Note that the prototype header's epoch bits2150// only change at a safepoint.) If not, attempt to rebias the object2151// toward the current thread. Note that we must be absolutely sure2152// that the current epoch is invalid in order to do this because2153// otherwise the manipulations it performs on the mark word are2154// illegal.21552156int shift_amount = 64 - markWord::epoch_shift;2157// rotate epoch bits to right (little) end and set other bits to 02158// [ big part | epoch | little part ] -> [ 0..0 | epoch ]2159rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);2160// branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented2161bne(CCR0, try_rebias);21622163// The epoch of the current bias is still valid but we know nothing2164// about the owner; it might be set or it might be clear. Try to2165// acquire the bias of the object using an atomic operation. 
If this2166// fails we will go in to the runtime to revoke the object's bias.2167// Note that we first construct the presumed unbiased header so we2168// don't accidentally blow away another thread's valid bias.2169andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |2170markWord::age_mask_in_place |2171markWord::epoch_mask_in_place));2172orr(temp_reg, R16_thread, mark_reg);21732174assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");21752176// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).2177cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2178/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2179/*where=*/obj_reg,2180MacroAssembler::MemBarAcq,2181MacroAssembler::cmpxchgx_hint_acquire_lock(),2182noreg, slow_case_int); // bail out if failed21832184// If the biasing toward our thread failed, this means that2185// another thread succeeded in biasing it toward itself and we2186// need to revoke that bias. The revocation will occur in the2187// interpreter runtime in the slow case.2188if (PrintBiasedLockingStatistics) {2189load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);2190lwzx(temp_reg, temp2_reg);2191addi(temp_reg, temp_reg, 1);2192stwx(temp_reg, temp2_reg);2193}2194b(done);21952196bind(try_rebias);2197// At this point we know the epoch has expired, meaning that the2198// current "bias owner", if any, is actually invalid. Under these2199// circumstances _only_, we are allowed to use the current header's2200// value as the comparison value when doing the cas to acquire the2201// bias in the current epoch. In other words, we allow transfer of2202// the bias from one thread to another directly in this situation.2203load_klass(temp_reg, obj_reg);2204andi(temp2_reg, mark_reg, markWord::age_mask_in_place);2205orr(temp2_reg, R16_thread, temp2_reg);2206ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2207orr(temp_reg, temp2_reg, temp_reg);22082209assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");22102211cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2212/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2213/*where=*/obj_reg,2214MacroAssembler::MemBarAcq,2215MacroAssembler::cmpxchgx_hint_acquire_lock(),2216noreg, slow_case_int); // bail out if failed22172218// If the biasing toward our thread failed, this means that2219// another thread succeeded in biasing it toward itself and we2220// need to revoke that bias. The revocation will occur in the2221// interpreter runtime in the slow case.2222if (PrintBiasedLockingStatistics) {2223load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);2224lwzx(temp_reg, temp2_reg);2225addi(temp_reg, temp_reg, 1);2226stwx(temp_reg, temp2_reg);2227}2228b(done);22292230bind(try_revoke_bias);2231// The prototype mark in the klass doesn't have the bias bit set any2232// more, indicating that objects of this data type are not supposed2233// to be biased any more. We are going to try to reset the mark of2234// this object to the prototype value and fall through to the2235// CAS-based locking scheme. 
Note that if our CAS fails, it means2236// that another thread raced us for the privilege of revoking the2237// bias of this particular object, so it's okay to continue in the2238// normal locking code.2239load_klass(temp_reg, obj_reg);2240ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);2241andi(temp2_reg, mark_reg, markWord::age_mask_in_place);2242orr(temp_reg, temp_reg, temp2_reg);22432244assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");22452246// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).2247cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,2248/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,2249/*where=*/obj_reg,2250MacroAssembler::MemBarAcq,2251MacroAssembler::cmpxchgx_hint_acquire_lock());22522253// reload markWord in mark_reg before continuing with lightweight locking2254ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);22552256// Fall through to the normal CAS-based lock, because no matter what2257// the result of the above CAS, some thread must have succeeded in2258// removing the bias bit from the object's header.2259if (PrintBiasedLockingStatistics) {2260Label l;2261bne(cr_reg, l);2262load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);2263lwzx(temp_reg, temp2_reg);2264addi(temp_reg, temp_reg, 1);2265stwx(temp_reg, temp2_reg);2266bind(l);2267}22682269bind(cas_label);2270}22712272void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {2273// Check for biased locking unlock case, which is a no-op2274// Note: we do not have to check the thread ID for two reasons.2275// First, the interpreter checks for IllegalMonitorStateException at2276// a higher level. Second, if the bias was revoked while we held the2277// lock, the object could not be rebiased toward another thread, so2278// the bias bit would be clear.22792280ld(temp_reg, 0, mark_addr);2281andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);22822283cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);2284beq(cr_reg, done);2285}22862287// allocation (for C1)2288void MacroAssembler::eden_allocate(2289Register obj, // result: pointer to object after successful allocation2290Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise2291int con_size_in_bytes, // object size in bytes if known at compile time2292Register t1, // temp register2293Register t2, // temp register2294Label& slow_case // continuation point if fast allocation fails2295) {2296b(slow_case);2297}22982299void MacroAssembler::tlab_allocate(2300Register obj, // result: pointer to object after successful allocation2301Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise2302int con_size_in_bytes, // object size in bytes if known at compile time2303Register t1, // temp register2304Label& slow_case // continuation point if fast allocation fails2305) {2306// make sure arguments make sense2307assert_different_registers(obj, var_size_in_bytes, t1);2308assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");2309assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");23102311const Register new_top = t1;2312//verify_tlab(); not implemented23132314ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);2315ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);2316if (var_size_in_bytes == noreg) {2317addi(new_top, obj, 
con_size_in_bytes);2318} else {2319add(new_top, obj, var_size_in_bytes);2320}2321cmpld(CCR0, new_top, R0);2322bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);23232324#ifdef ASSERT2325// make sure new free pointer is properly aligned2326{2327Label L;2328andi_(R0, new_top, MinObjAlignmentInBytesMask);2329beq(CCR0, L);2330stop("updated TLAB free is not properly aligned");2331bind(L);2332}2333#endif // ASSERT23342335// update the tlab top pointer2336std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);2337//verify_tlab(); not implemented2338}2339void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {2340unimplemented("incr_allocated_bytes");2341}23422343address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,2344int insts_call_instruction_offset, Register Rtoc) {2345// Start the stub.2346address stub = start_a_stub(64);2347if (stub == NULL) { return NULL; } // CodeCache full: bail out23482349// Create a trampoline stub relocation which relates this trampoline stub2350// with the call instruction at insts_call_instruction_offset in the2351// instructions code-section.2352relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));2353const int stub_start_offset = offset();23542355// For java_to_interp stubs we use R11_scratch1 as scratch register2356// and in call trampoline stubs we use R12_scratch2. This way we2357// can distinguish them (see is_NativeCallTrampolineStub_at()).2358Register reg_scratch = R12_scratch2;23592360// Now, create the trampoline stub's code:2361// - load the TOC2362// - load the call target from the constant pool2363// - call2364if (Rtoc == noreg) {2365calculate_address_from_global_toc(reg_scratch, method_toc());2366Rtoc = reg_scratch;2367}23682369ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);2370mtctr(reg_scratch);2371bctr();23722373const address stub_start_addr = addr_at(stub_start_offset);23742375// Assert that the encoded destination_toc_offset can be identified and that it is correct.2376assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),2377"encoded offset into the constant pool must match");2378// Trampoline_stub_size should be good.2379assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");2380assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");23812382// End the stub.2383end_a_stub();2384return stub;2385}23862387// TM on PPC64.2388void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {2389Label retry;2390bind(retry);2391ldarx(result, addr, /*hint*/ false);2392addi(result, result, simm16);2393stdcx_(result, addr);2394if (UseStaticBranchPredictionInCompareAndSwapPPC64) {2395bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR02396} else {2397bne( CCR0, retry); // stXcx_ sets CCR02398}2399}24002401void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {2402Label retry;2403bind(retry);2404lwarx(result, addr, /*hint*/ false);2405ori(result, result, uimm16);2406stwcx_(result, addr);2407if (UseStaticBranchPredictionInCompareAndSwapPPC64) {2408bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR02409} else {2410bne( CCR0, retry); // stXcx_ sets CCR02411}2412}24132414#if INCLUDE_RTM_OPT24152416// Update rtm_counters based on abort status2417// input: abort_status2418// rtm_counters_Reg 
(RTMLockingCounters*)
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86    ppc    (! means inverted, ? means not the same)
  //  0     31     Set if abort caused by XABORT instruction.
  //  1    ! 7     If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  //  2     13     Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  //  3     10     Set if an internal buffer overflowed.
  //  4   ? 12     Set if a debug breakpoint was hit.
  //  5   ? 32     Set if an abort occurred during execution of a nested transaction.
  const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
                             tm_failure_persistent,
                             tm_non_trans_cf,
                             tm_trans_cf,
                             tm_footprint_of,
                             tm_failure_code,
                             tm_transaction_level};

  const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
  const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;

  const int bit2counter_map[][num_counters] =
  // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
  // Inverted logic means that if a bit is set don't count it, or vice-versa.
  // Care must be taken when mapping bits to counters as bits for a given
  // counter must be mutually exclusive. Otherwise, the counter will be
  // incremented more than once.
  // counters:
  //   0        1        2         3         4         5
  //   abort  , persist, conflict, overflow, debug   , nested   bits:
  {{ 1      ,  0      ,  0      ,  0      ,  0      ,  0 },   // abort
   { 0      , -1      ,  0      ,  0      ,  0      ,  0 },   // failure_persistent
   { 0      ,  0      ,  1      ,  0      ,  0      ,  0 },   // non_trans_cf
   { 0      ,  0      ,  1      ,  0      ,  0      ,  0 },   // trans_cf
   { 0      ,  0      ,  0      ,  1      ,  0      ,  0 },   // footprint_of
   { 0      ,  0      ,  0      ,  0      , -1      ,  0 },   // failure_code = 0xD4
   { 0      ,  0      ,  0      ,  0      ,  0      ,  1 }};  // transaction_level > 1
  // ...

  // Move abort_status value to R0 and use abort_status register as a
  // temporary register because R0 as third operand in ld/std is treated
  // as base address zero (value). Likewise, R0 as second operand in addi
  // is problematic because it amounts to li.
  const Register temp_Reg = abort_status;
  const Register abort_status_R0 = R0;
  mr(abort_status_R0, abort_status);

  // Increment total abort counter.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  ld(temp_Reg, counters_offs, rtm_counters_Reg);
  addi(temp_Reg, temp_Reg, 1);
  std(temp_Reg, counters_offs, rtm_counters_Reg);

  // Increment specific abort counters.
  if (PrintPreciseRTMLockingStatistics) {

    // #0 counter offset.
    int abortX_offs = RTMLockingCounters::abortX_count_offset();

    for (int nbit = 0; nbit < num_failure_bits; nbit++) {
      for (int ncounter = 0; ncounter < num_counters; ncounter++) {
        if (bit2counter_map[nbit][ncounter] != 0) {
          Label check_abort;
          int abort_counter_offs = abortX_offs + (ncounter << 3);

          if (failure_bit[nbit] == tm_transaction_level) {
            // Don't check outer transaction, TL = 1 (bit 63). Hence only
            // 11 bits in the TL field are checked to find out if failure
            // occurred in a nested transaction.
This check also matches2487// the case when nesting_of = 1 (nesting overflow).2488rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);2489} else if (failure_bit[nbit] == tm_failure_code) {2490// Check failure code for trap or illegal caught in TM.2491// Bits 0:7 are tested as bit 7 (persistent) is copied from2492// tabort or treclaim source operand.2493// On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).2494rldicl(temp_Reg, abort_status_R0, 8, 56);2495cmpdi(CCR0, temp_Reg, 0xD4);2496} else {2497rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);2498}24992500if (bit2counter_map[nbit][ncounter] == 1) {2501beq(CCR0, check_abort);2502} else {2503bne(CCR0, check_abort);2504}25052506// We don't increment atomically.2507ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);2508addi(temp_Reg, temp_Reg, 1);2509std(temp_Reg, abort_counter_offs, rtm_counters_Reg);25102511bind(check_abort);2512}2513}2514}2515}2516// Restore abort_status.2517mr(abort_status, abort_status_R0);2518}25192520// Branch if (random & (count-1) != 0), count is 2^n2521// tmp and CR0 are killed2522void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {2523mftb(tmp);2524andi_(tmp, tmp, count-1);2525bne(CCR0, brLabel);2526}25272528// Perform abort ratio calculation, set no_rtm bit if high ratio.2529// input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED2530void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,2531RTMLockingCounters* rtm_counters,2532Metadata* method_data) {2533Label L_done, L_check_always_rtm1, L_check_always_rtm2;25342535if (RTMLockingCalculationDelay > 0) {2536// Delay calculation.2537ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());2538cmpdi(CCR0, rtm_counters_Reg, 0);2539beq(CCR0, L_done);2540load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload2541}2542// Abort ratio calculation only if abort_count > RTMAbortThreshold.2543// Aborted transactions = abort_count * 1002544// All transactions = total_count * RTMTotalCountIncrRate2545// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)2546ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);2547if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only.2548cmpdi(CCR0, R0, RTMAbortThreshold);2549blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary2550} else {2551load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);2552cmpd(CCR0, R0, rtm_counters_Reg);2553blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required2554}2555mulli(R0, R0, 100);25562557const Register tmpReg = rtm_counters_Reg;2558ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);2559mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int162560mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int162561cmpd(CCR0, R0, tmpReg);2562blt(CCR0, L_check_always_rtm1); // jump to reload2563if (method_data != NULL) {2564// Set rtm_state to "no rtm" in MDO.2565// Not using a metadata relocation. 
Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
  if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, tmpReg, thresholdValue);
  } else {
    load_const_optimized(R0, thresholdValue);
    cmpd(CCR0, tmpReg, R0);
  }
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation.
// input: abort_status_Reg
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}

// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;

  // Don't retry if failure is persistent.
  // The persistent bit is set when a (A) Disallowed operation is performed in
  // transactional state, like for instance trying to write the TFHAR after a
  // transaction is started; or when there is (B) a Nesting Overflow (too many
  // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
  // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
  // store is performed to a given address in TM state, then once in suspended
  // state the same address is accessed. Failure (A) is very unlikely to occur
  // in the JVM. Failure (D) will never occur because Suspended state is never
  // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
  // Overflow will set the persistent bit.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);

  // Don't retry if transaction was deliberately aborted, i.e.
caused by a2636// tabort instruction.2637rldicr_(R0, abort_status_Reg, tm_tabort, 0);2638bne(CCR0, doneRetry);26392640// Retry if transaction aborted due to a conflict with another thread.2641if (checkRetry) { bind(*checkRetry); }2642addic_(retry_count_Reg, retry_count_Reg, -1);2643blt(CCR0, doneRetry);2644b(retryLabel);2645bind(doneRetry);2646}26472648// Spin and retry if lock is busy.2649// inputs: owner_addr_Reg (monitor address)2650// : retry_count_Reg2651// output: retry_count_Reg decremented by 12652// CTR is killed2653void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {2654Label SpinLoop, doneRetry, doRetry;2655addic_(retry_count_Reg, retry_count_Reg, -1);2656blt(CCR0, doneRetry);26572658if (RTMSpinLoopCount > 1) {2659li(R0, RTMSpinLoopCount);2660mtctr(R0);2661}26622663// low thread priority2664smt_prio_low();2665bind(SpinLoop);26662667if (RTMSpinLoopCount > 1) {2668bdz(doRetry);2669ld(R0, 0, owner_addr_Reg);2670cmpdi(CCR0, R0, 0);2671bne(CCR0, SpinLoop);2672}26732674bind(doRetry);26752676// restore thread priority to default in userspace2677#ifdef LINUX2678smt_prio_medium_low();2679#else2680smt_prio_medium();2681#endif26822683b(retryLabel);26842685bind(doneRetry);2686}26872688// Use RTM for normal stack locks.2689// Input: objReg (object to lock)2690void MacroAssembler::rtm_stack_locking(ConditionRegister flag,2691Register obj, Register mark_word, Register tmp,2692Register retry_on_abort_count_Reg,2693RTMLockingCounters* stack_rtm_counters,2694Metadata* method_data, bool profile_rtm,2695Label& DONE_LABEL, Label& IsInflated) {2696assert(UseRTMForStackLocks, "why call this otherwise?");2697assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");2698Label L_rtm_retry, L_decrement_retry, L_on_abort;26992700if (RTMRetryCount > 0) {2701load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort2702bind(L_rtm_retry);2703}2704andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased2705bne(CCR0, IsInflated);27062707if (PrintPreciseRTMLockingStatistics || profile_rtm) {2708Label L_noincrement;2709if (RTMTotalCountIncrRate > 1) {2710branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);2711}2712assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");2713load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);2714//atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically2715ldx(mark_word, tmp);2716addi(mark_word, mark_word, 1);2717stdx(mark_word, tmp);2718bind(L_noincrement);2719}2720tbegin_();2721beq(CCR0, L_on_abort);2722ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.2723andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits2724cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked2725beq(flag, DONE_LABEL); // all done if unlocked27262727if (UseRTMXendForLockBusy) {2728tend_();2729b(L_decrement_retry);2730} else {2731tabort_();2732}2733bind(L_on_abort);2734const Register abort_status_Reg = tmp;2735mftexasr(abort_status_Reg);2736if (PrintPreciseRTMLockingStatistics || profile_rtm) {2737rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);2738}2739ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload2740if (RTMRetryCount > 0) {2741// Retry on lock abort if abort status is not 
permanent.2742rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);2743} else {2744bind(L_decrement_retry);2745}2746}27472748// Use RTM for inflating locks2749// inputs: obj (object to lock)2750// mark_word (current header - KILLED)2751// boxReg (on-stack box address (displaced header location) - KILLED)2752void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,2753Register obj, Register mark_word, Register boxReg,2754Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,2755RTMLockingCounters* rtm_counters,2756Metadata* method_data, bool profile_rtm,2757Label& DONE_LABEL) {2758assert(UseRTMLocking, "why call this otherwise?");2759Label L_rtm_retry, L_decrement_retry, L_on_abort;2760// Clean monitor_value bit to get valid pointer.2761int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;27622763// Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().2764std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);2765const Register tmpReg = boxReg;2766const Register owner_addr_Reg = mark_word;2767addi(owner_addr_Reg, mark_word, owner_offset);27682769if (RTMRetryCount > 0) {2770load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.2771load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.2772bind(L_rtm_retry);2773}2774if (PrintPreciseRTMLockingStatistics || profile_rtm) {2775Label L_noincrement;2776if (RTMTotalCountIncrRate > 1) {2777branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);2778}2779assert(rtm_counters != NULL, "should not be NULL when profiling RTM");2780load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);2781//atomic_inc_ptr(R0, tmpReg); We don't increment atomically2782ldx(tmpReg, R0);2783addi(tmpReg, tmpReg, 1);2784stdx(tmpReg, R0);2785bind(L_noincrement);2786}2787tbegin_();2788beq(CCR0, L_on_abort);2789// We don't reload mark word. 
Will only be reset at safepoint.2790ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.2791cmpdi(flag, R0, 0);2792beq(flag, DONE_LABEL);27932794if (UseRTMXendForLockBusy) {2795tend_();2796b(L_decrement_retry);2797} else {2798tabort_();2799}2800bind(L_on_abort);2801const Register abort_status_Reg = tmpReg;2802mftexasr(abort_status_Reg);2803if (PrintPreciseRTMLockingStatistics || profile_rtm) {2804rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);2805// Restore owner_addr_Reg2806ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);2807#ifdef ASSERT2808andi_(R0, mark_word, markWord::monitor_value);2809asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.2810#endif2811addi(owner_addr_Reg, mark_word, owner_offset);2812}2813if (RTMRetryCount > 0) {2814// Retry on lock abort if abort status is not permanent.2815rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);2816}28172818// Appears unlocked - try to swing _owner from null to non-null.2819cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,2820MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2821MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);28222823if (RTMRetryCount > 0) {2824// success done else retry2825b(DONE_LABEL);2826bind(L_decrement_retry);2827// Spin and retry if lock is busy.2828rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);2829} else {2830bind(L_decrement_retry);2831}2832}28332834#endif // INCLUDE_RTM_OPT28352836// "The box" is the space on the stack where we copy the object mark.2837void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,2838Register temp, Register displaced_header, Register current_header,2839bool try_bias,2840RTMLockingCounters* rtm_counters,2841RTMLockingCounters* stack_rtm_counters,2842Metadata* method_data,2843bool use_rtm, bool profile_rtm) {2844assert_different_registers(oop, box, temp, displaced_header, current_header);2845assert(flag != CCR0, "bad condition register");2846Label cont;2847Label object_has_monitor;2848Label cas_failed;28492850// Load markWord from object into displaced_header.2851ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);28522853if (DiagnoseSyncOnValueBasedClasses != 0) {2854load_klass(temp, oop);2855lwz(temp, in_bytes(Klass::access_flags_offset()), temp);2856testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));2857bne(flag, cont);2858}28592860if (try_bias) {2861biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);2862}28632864#if INCLUDE_RTM_OPT2865if (UseRTMForStackLocks && use_rtm) {2866rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,2867stack_rtm_counters, method_data, profile_rtm,2868cont, object_has_monitor);2869}2870#endif // INCLUDE_RTM_OPT28712872// Handle existing monitor.2873// The object has an existing monitor iff (mark & monitor_value) != 0.2874andi_(temp, displaced_header, markWord::monitor_value);2875bne(CCR0, object_has_monitor);28762877// Set displaced_header to be (markWord of object | UNLOCK_VALUE).2878ori(displaced_header, displaced_header, markWord::unlocked_value);28792880// Load Compare Value application register.28812882// Initialize the box. 
(Must happen before we update the object mark!)2883std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);28842885// Must fence, otherwise, preceding store(s) may float below cmpxchg.2886// Compare object markWord with mark and if equal exchange scratch1 with object markWord.2887cmpxchgd(/*flag=*/flag,2888/*current_value=*/current_header,2889/*compare_value=*/displaced_header,2890/*exchange_value=*/box,2891/*where=*/oop,2892MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2893MacroAssembler::cmpxchgx_hint_acquire_lock(),2894noreg,2895&cas_failed,2896/*check without membar and ldarx first*/true);2897assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");28982899// If the compare-and-exchange succeeded, then we found an unlocked2900// object and we have now locked it.2901b(cont);29022903bind(cas_failed);2904// We did not see an unlocked object so try the fast recursive case.29052906// Check if the owner is self by comparing the value in the markWord of object2907// (current_header) with the stack pointer.2908sub(current_header, current_header, R1_SP);2909load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);29102911and_(R0/*==0?*/, current_header, temp);2912// If condition is true we are cont and hence we can store 0 as the2913// displaced header in the box, which indicates that it is a recursive lock.2914mcrf(flag,CCR0);2915std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);29162917// Handle existing monitor.2918b(cont);29192920bind(object_has_monitor);2921// The object's monitor m is unlocked iff m->owner == NULL,2922// otherwise m->owner may contain a thread or a stack address.29232924#if INCLUDE_RTM_OPT2925// Use the same RTM locking code in 32- and 64-bit VM.2926if (use_rtm) {2927rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,2928rtm_counters, method_data, profile_rtm, cont);2929} else {2930#endif // INCLUDE_RTM_OPT29312932// Try to CAS m->owner from NULL to current thread.2933addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);2934cmpxchgd(/*flag=*/flag,2935/*current_value=*/current_header,2936/*compare_value=*/(intptr_t)0,2937/*exchange_value=*/R16_thread,2938/*where=*/temp,2939MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2940MacroAssembler::cmpxchgx_hint_acquire_lock());29412942// Store a non-null value into the box.2943std(box, BasicLock::displaced_header_offset_in_bytes(), box);29442945# ifdef ASSERT2946bne(flag, cont);2947// We have acquired the monitor, check some invariants.2948addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());2949// Invariant 1: _recursions should be 0.2950//assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");2951asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,2952"monitor->_recursions should be 0");2953# endif29542955#if INCLUDE_RTM_OPT2956} // use_rtm()2957#endif29582959bind(cont);2960// flag == EQ indicates success2961// flag == NE indicates failure2962}29632964void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,2965Register temp, Register displaced_header, Register current_header,2966bool try_bias, bool use_rtm) {2967assert_different_registers(oop, box, temp, displaced_header, current_header);2968assert(flag != CCR0, "bad condition register");2969Label cont;2970Label object_has_monitor;29712972if (try_bias) {2973biased_locking_exit(flag, oop, current_header, 
cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
    andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                   // else RegularLock
    tend_();                                                       // otherwise end...
    b(cont);                                                       // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  // The object has an existing monitor iff (mark & monitor_value) != 0.
  RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
  ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
  andi_(R0, current_header, markWord::monitor_value);
  bne(CCR0, object_has_monitor);

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markWord of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  b(cont);

  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
  ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    // Clean monitor_value bit to get valid pointer
    cmpdi(flag, temp, 0);
    bne(flag, L_regular_inflated_unlock);
    tend_();
    b(cont);
    bind(L_regular_inflated_unlock);
  }
#endif

  ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
  xorr(temp, R16_thread, temp);       // Will be 0 if we are the owner.
  orr(temp, temp, displaced_header);  // Will be 0 if there are 0 recursions.
  cmpdi(flag, temp, 0);
  bne(flag, cont);

  ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
  ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
  orr(temp, temp, displaced_header);  // Will be 0 if both are 0.
  cmpdi(flag, temp, 0);
  bne(flag, cont);
  release();
  std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
  ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);

  if (at_return) {
    if (in_nmethod) {
      if (UseSIGTRAP) {
        // Use Signal Handler.
        relocate(relocInfo::poll_return_type);
        td(traptoGreaterThanUnsigned, R1_SP, temp);
      } else {
        cmpld(CCR0, R1_SP, temp);
        // Stub may be out of range for short conditional
branch.3071bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);3072}3073} else { // Not in nmethod.3074// Frame still on stack, need to get fp.3075Register fp = R0;3076ld(fp, _abi0(callers_sp), R1_SP);3077cmpld(CCR0, fp, temp);3078bgt(CCR0, slow_path);3079}3080} else { // Normal safepoint poll. Not at return.3081assert(!in_nmethod, "should use load_from_polling_page");3082andi_(temp, temp, SafepointMechanism::poll_bit());3083bne(CCR0, slow_path);3084}3085}30863087void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,3088MacroAssembler::PreservationLevel preservation_level) {3089BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();3090bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);3091}30923093// Values for last_Java_pc, and last_Java_sp must comply to the rules3094// in frame_ppc.hpp.3095void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {3096// Always set last_Java_pc and flags first because once last_Java_sp3097// is visible has_last_Java_frame is true and users will look at the3098// rest of the fields. (Note: flags should always be zero before we3099// get here so doesn't need to be set.)31003101// Verify that last_Java_pc was zeroed on return to Java3102asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,3103"last_Java_pc not zeroed before leaving Java");31043105// When returning from calling out from Java mode the frame anchor's3106// last_Java_pc will always be set to NULL. It is set here so that3107// if we are doing a call to native (not VM) that we capture the3108// known pc and don't have to rely on the native call having a3109// standard frame linkage where we can find the pc.3110if (last_Java_pc != noreg)3111std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);31123113// Set last_Java_sp last.3114std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);3115}31163117void MacroAssembler::reset_last_Java_frame(void) {3118asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),3119R16_thread, "SP was not set, still zero");31203121BLOCK_COMMENT("reset_last_Java_frame {");3122li(R0, 0);31233124// _last_Java_sp = 03125std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);31263127// _last_Java_pc = 03128std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);3129BLOCK_COMMENT("} reset_last_Java_frame");3130}31313132void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {3133assert_different_registers(sp, tmp1);31343135// sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via3136// TOP_IJAVA_FRAME_ABI.3137// FIXME: assert that we really have a TOP_IJAVA_FRAME here!3138address entry = pc();3139load_const_optimized(tmp1, entry);31403141set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);3142}31433144void MacroAssembler::get_vm_result(Register oop_result) {3145// Read:3146// R16_thread3147// R16_thread->in_bytes(JavaThread::vm_result_offset())3148//3149// Updated:3150// oop_result3151// R16_thread->in_bytes(JavaThread::vm_result_offset())31523153verify_thread();31543155ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);3156li(R0, 0);3157std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);31583159verify_oop(oop_result, FILE_AND_LINE);3160}31613162void MacroAssembler::get_vm_result_2(Register metadata_result) {3163// Read:3164// R16_thread3165// 
R16_thread->in_bytes(JavaThread::vm_result_2_offset())3166//3167// Updated:3168// metadata_result3169// R16_thread->in_bytes(JavaThread::vm_result_2_offset())31703171ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);3172li(R0, 0);3173std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);3174}31753176Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {3177Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.3178if (CompressedKlassPointers::base() != 0) {3179// Use dst as temp if it is free.3180sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);3181current = dst;3182}3183if (CompressedKlassPointers::shift() != 0) {3184srdi(dst, current, CompressedKlassPointers::shift());3185current = dst;3186}3187return current;3188}31893190void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {3191if (UseCompressedClassPointers) {3192Register compressedKlass = encode_klass_not_null(ck, klass);3193stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);3194} else {3195std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);3196}3197}31983199void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {3200if (UseCompressedClassPointers) {3201if (val == noreg) {3202val = R0;3203li(val, 0);3204}3205stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed3206}3207}32083209int MacroAssembler::instr_size_for_decode_klass_not_null() {3210static int computed_size = -1;32113212// Not yet computed?3213if (computed_size == -1) {32143215if (!UseCompressedClassPointers) {3216computed_size = 0;3217} else {3218// Determine by scratch emit.3219ResourceMark rm;3220int code_size = 8 * BytesPerInstWord;3221CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);3222MacroAssembler* a = new MacroAssembler(&cb);3223a->decode_klass_not_null(R11_scratch1);3224computed_size = a->offset();3225}3226}32273228return computed_size;3229}32303231void MacroAssembler::decode_klass_not_null(Register dst, Register src) {3232assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");3233if (src == noreg) src = dst;3234Register shifted_src = src;3235if (CompressedKlassPointers::shift() != 0 ||3236CompressedKlassPointers::base() == 0 && src != dst) { // Move required.3237shifted_src = dst;3238sldi(shifted_src, src, CompressedKlassPointers::shift());3239}3240if (CompressedKlassPointers::base() != 0) {3241add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);3242}3243}32443245void MacroAssembler::load_klass(Register dst, Register src) {3246if (UseCompressedClassPointers) {3247lwz(dst, oopDesc::klass_offset_in_bytes(), src);3248// Attention: no null check here!3249decode_klass_not_null(dst, dst);3250} else {3251ld(dst, oopDesc::klass_offset_in_bytes(), src);3252}3253}32543255// ((OopHandle)result).resolve();3256void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,3257MacroAssembler::PreservationLevel preservation_level) {3258access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);3259}32603261void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,3262MacroAssembler::PreservationLevel preservation_level) {3263Label resolved;32643265// A null weak handle resolves to null.3266cmpdi(CCR0, result, 0);3267beq(CCR0, resolved);32683269access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, 
tmp2,
                 preservation_level);
  bind(resolved);
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  ld(holder, in_bytes(Method::const_offset()), method);
  ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
  ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
}

// Clear Array
// For very short arrays. tmp == R0 is allowed.
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
  if (cnt_dwords > 0) { li(tmp, 0); }
  for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
}

// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
  if (cnt_dwords < 8) {
    clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
    return;
  }

  Label loop;
  const long loopcnt   = cnt_dwords >> 1,
             remainder = cnt_dwords & 1;

  li(tmp, loopcnt);
  mtctr(tmp);
  li(tmp, 0);
  bind(loop);
  std(tmp, 0, base_ptr);
  std(tmp, 8, base_ptr);
  addi(base_ptr, base_ptr, 16);
  bdnz(loop);
  if (remainder) { std(tmp, 0, base_ptr); }
}

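// For reference, a rough C sketch of the effect of the two short-array helpers above
// (illustrative only; 'base' and 'cnt_dwords' stand in for the register arguments):
//
//   void clear_dwords(uint64_t* base, long cnt_dwords) {
//     for (long i = 0; i < cnt_dwords; ++i) {
//       base[i] = 0;   // emitted as one 8-byte std per dword, unrolled or via a CTR loop
//     }
//   }
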
// Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size >> 3,
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
            min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CCR1, small_rest);
  }
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
  beq(CCR0, fast);                                  // Already 128byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);

  bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);

  bind(fast);                                // Clear 128byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
  andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
  mtctr(tmp);                                // Load counter.

  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);

  bind(small_rest);
  cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  beq(CCR0, done);                   // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(restloop);

  bind(done);
}

/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////

// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);     // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src,     3*8,  0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src,     1*8,  8, 15); // Insert byte 6 into position 5, leave the rest alone.
}

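// For reference, the net effect of load_reverse_32 expressed as plain C
// (illustrative sketch only, not emitted code):
//
//   uint32_t load_reverse_32_c(uint32_t src) {
//     return ((src & 0x000000ff) << 24) |
//            ((src & 0x0000ff00) <<  8) |
//            ((src & 0x00ff0000) >>  8) |
//            ((src & 0xff000000) >> 24);   // same result as a 32-bit byte swap
//   }
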
// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
  assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");

  // Point to 4 byte folding tables (byte-reversed version for Big Endian)
  // Layout: See StubRoutines::ppc::generate_crc_constants.
#ifdef VM_LITTLE_ENDIAN
  const int ix0 = 3 * CRC32_TABLE_SIZE;
  const int ix1 = 2 * CRC32_TABLE_SIZE;
  const int ix2 = 1 * CRC32_TABLE_SIZE;
  const int ix3 = 0 * CRC32_TABLE_SIZE;
#else
  const int ix0 = 1 * CRC32_TABLE_SIZE;
  const int ix1 = 2 * CRC32_TABLE_SIZE;
  const int ix2 = 3 * CRC32_TABLE_SIZE;
  const int ix3 = 4 * CRC32_TABLE_SIZE;
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  addi(tc0, table, ix0);
  addi(tc1, table, ix1);
  addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3);

  return ix3;
}

/**
 * uint32_t crc;
 * table[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                      // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                         // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);
  xorr(crc, crc, tmp);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);
  fold_byte_crc32(crc, val, table, val);
}

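// For reference, the byte-wise CRC-32 update emitted by update_byte_crc32 /
// fold_byte_crc32 corresponds to the classic table-driven C step below
// (illustrative sketch; 'table' here names a 256-entry lookup column, an
// illustration-only name rather than one taken from this file):
//
//   uint32_t update_byte_crc32_c(uint32_t crc, uint8_t val, const uint32_t* table) {
//     return table[(crc ^ val) & 0xff] ^ (crc >> 8);
//   }
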
/**
 * @param crc     register containing existing CRC (32-bit)
 * @param buf     register pointing to input byte buffer (byte*)
 * @param len     register containing number of bytes
 * @param table   register pointing to CRC table
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  clrldi_(len, len, 32);                 // Enforce 32 bit. Anything to do?
  beq(CCR0, L_done);

  mtctr(len);
  align(mainLoop_alignment);
  BIND(L_mainLoop);
  lbz(data, 0, buf);                     // Byte from buffer, zero-extended.
  addi(buf, buf, mainLoop_stepping);     // Advance buffer position.
  update_byte_crc32(crc, data, table);
  bdnz(L_mainLoop);                      // Iterate.

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 */
// A note on the lookup table address(es):
// The implementation uses 4 table columns (byte-reversed versions for Big Endian).
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3,         2, 24-2, 31-2); // ((t1 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2); // Now crc contains the final checksum value.
}

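// For reference, one iteration of the 4-byte ("slicing by 4") update above, written
// as plain C (illustrative sketch; t0..t3 stand for the four table columns, whose
// assignment to byte positions depends on endianness as set up in crc32_table_columns):
//
//   uint32_t update_1word_crc32_c(uint32_t crc, uint32_t word,
//                                 const uint32_t* t0, const uint32_t* t1,
//                                 const uint32_t* t2, const uint32_t* t3) {
//     uint32_t v = crc ^ word;
//     return t0[(v      ) & 0xff] ^ t1[(v >>  8) & 0xff] ^
//            t2[(v >> 16) & 0xff] ^ t3[(v >> 24) & 0xff];
//   }
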
/**
 * @param crc     register containing existing CRC (32-bit)
 * @param buf     register pointing to input byte buffer (byte*)
 * @param len     register containing number of bytes
 * @param table   register pointing to CRC table
 *
 * uses R9..R12 as work registers. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  if (invertCRC) {
    nand(crc, crc, crc);                     // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                          // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);  // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                   // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                   // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                     // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                          // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);             // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);       // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                     // Load_reverse needs separate registers to work on.
                                             // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);              // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                 // Octoword-aligned loop address.
Shows 2% improvement.3606BIND(L_mainLoop);3607update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);3608bdnz(L_mainLoop);36093610#ifndef VM_LITTLE_ENDIAN3611load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.3612tmp = crc_rv; // Tmp uses it's original register again.3613#endif36143615// Restore original table address for tailLoop.3616if (reconstructTableOffset != 0) {3617addi(table, table, -reconstructTableOffset);3618}36193620// Process last few (<complexThreshold) bytes of buffer.3621BIND(L_tail);3622update_byteLoop_crc32(crc, buf, len, table, data, false);36233624if (invertCRC) {3625nand(crc, crc, crc); // 1s complement of crc3626}3627BLOCK_COMMENT("} kernel_crc32_1word");3628}36293630/**3631* @param crc register containing existing CRC (32-bit)3632* @param buf register pointing to input byte buffer (byte*)3633* @param len register containing number of bytes3634* @param constants register pointing to precomputed constants3635* @param t0-t6 temp registers3636*/3637void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,3638Register t0, Register t1, Register t2, Register t3,3639Register t4, Register t5, Register t6, bool invertCRC) {3640assert_different_registers(crc, buf, len, constants);36413642Label L_tail;36433644BLOCK_COMMENT("kernel_crc32_vpmsum {");36453646if (invertCRC) {3647nand(crc, crc, crc); // 1s complement of crc3648}36493650// Enforce 32 bit.3651clrldi(len, len, 32);36523653// Align if we have enough bytes for the fast version.3654const int alignment = 16,3655threshold = 32;3656Register prealign = t0;36573658neg(prealign, buf);3659addi(t1, len, -threshold);3660andi(prealign, prealign, alignment - 1);3661cmpw(CCR0, t1, prealign);3662blt(CCR0, L_tail); // len - prealign < threshold?36633664subf(len, prealign, len);3665update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);36663667// Calculate from first aligned address as far as possible.3668addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.3669kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);3670addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.36713672// Remaining bytes.3673BIND(L_tail);3674update_byteLoop_crc32(crc, buf, len, constants, t2, false);36753676if (invertCRC) {3677nand(crc, crc, crc); // 1s complement of crc3678}36793680BLOCK_COMMENT("} kernel_crc32_vpmsum");3681}36823683/**3684* @param crc register containing existing CRC (32-bit)3685* @param buf register pointing to input byte buffer (byte*)3686* @param len register containing number of bytes (will get updated to remaining bytes)3687* @param constants register pointing to CRC table for 128-bit aligned memory3688* @param t0-t6 temp registers3689*/3690void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,3691Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {36923693// Save non-volatile vector registers (frameless).3694Register offset = t1;3695int offsetInt = 0;3696offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);3697offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);3698offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);3699offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);3700offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);3701offsetInt -= 16; li(offset, 
offsetInt); stvx(VR25, offset, R1_SP);3702#ifndef VM_LITTLE_ENDIAN3703offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);3704#endif3705offsetInt -= 8; std(R14, offsetInt, R1_SP);3706offsetInt -= 8; std(R15, offsetInt, R1_SP);37073708// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor3709// bytes per iteration. The basic scheme is:3710// lvx: load vector (Big Endian needs reversal)3711// vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift3712// vxor: xor partial results together to get unroll_factor2 vectors37133714// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.37153716// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.3717const int unroll_factor = CRC32_UNROLL_FACTOR,3718unroll_factor2 = CRC32_UNROLL_FACTOR2;37193720const int outer_consts_size = (unroll_factor2 - 1) * 16,3721inner_consts_size = (unroll_factor / unroll_factor2) * 16;37223723// Support registers.3724Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };3725Register num_bytes = R14,3726loop_count = R15,3727cur_const = crc; // will live in VCRC3728// Constant array for outer loop: unroll_factor2 - 1 registers,3729// Constant array for inner loop: unroll_factor / unroll_factor2 registers.3730VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },3731consts1[] = { VR23, VR24 };3732// Data register arrays: 2 arrays with unroll_factor2 registers.3733VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },3734data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };37353736VectorRegister VCRC = data0[0];3737VectorRegister Vc = VR25;3738VectorRegister swap_bytes = VR26; // Only for Big Endian.37393740// We have at least 1 iteration (ensured by caller).3741Label L_outer_loop, L_inner_loop, L_last;37423743// If supported set DSCR pre-fetch to deepest.3744if (VM_Version::has_mfdscr()) {3745load_const_optimized(t0, VM_Version::_dscr_val | 7);3746mtdscr(t0);3747}37483749mtvrwz(VCRC, crc); // crc lives in VCRC, now37503751for (int i = 1; i < unroll_factor2; ++i) {3752li(offs[i], 16 * i);3753}37543755// Load consts for outer loop3756lvx(consts0[0], constants);3757for (int i = 1; i < unroll_factor2 - 1; ++i) {3758lvx(consts0[i], offs[i], constants);3759}37603761load_const_optimized(num_bytes, 16 * unroll_factor);37623763// Reuse data registers outside of the loop.3764VectorRegister Vtmp = data1[0];3765VectorRegister Vtmp2 = data1[1];3766VectorRegister zeroes = data1[2];37673768vspltisb(Vtmp, 0);3769vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.37703771// Load vector for vpermxor (to xor both 64 bit parts together)3772lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f3773vspltisb(Vc, 4);3774vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f03775xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);3776vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f37773778#ifdef VM_LITTLE_ENDIAN3779#define BE_swap_bytes(x)3780#else3781vspltisb(Vtmp2, 0xf);3782vxor(swap_bytes, Vtmp, Vtmp2);3783#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)3784#endif37853786cmpd(CCR0, len, num_bytes);3787blt(CCR0, L_last);37883789addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop3790load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.37913792// ********** Main loop start **********3793align(32);3794bind(L_outer_loop);37953796// Begin of unrolled first iteration (no xor).3797lvx(data1[0], buf);3798for (int i = 1; 
i < unroll_factor2 / 2; ++i) {3799lvx(data1[i], offs[i], buf);3800}3801vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.3802lvx(consts1[0], cur_const);3803mtctr(loop_count);3804for (int i = 0; i < unroll_factor2 / 2; ++i) {3805BE_swap_bytes(data1[i]);3806if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.3807lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);3808vpmsumw(data0[i], data1[i], consts1[0]);3809}3810addi(buf, buf, 16 * unroll_factor2);3811subf(len, num_bytes, len);3812lvx(consts1[1], offs[1], cur_const);3813addi(cur_const, cur_const, 32);3814// Begin of unrolled second iteration (head).3815for (int i = 0; i < unroll_factor2 / 2; ++i) {3816BE_swap_bytes(data1[i + unroll_factor2 / 2]);3817if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }3818vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);3819}3820for (int i = 0; i < unroll_factor2 / 2; ++i) {3821BE_swap_bytes(data1[i]);3822lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);3823vpmsumw(data1[i], data1[i], consts1[1]);3824}3825addi(buf, buf, 16 * unroll_factor2);38263827// Generate most performance relevant code. Loads + half of the vpmsumw have been generated.3828// Double-iteration allows using the 2 constant registers alternatingly.3829align(32);3830bind(L_inner_loop);3831for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.3832if (j & 1) {3833lvx(consts1[0], cur_const);3834} else {3835lvx(consts1[1], offs[1], cur_const);3836addi(cur_const, cur_const, 32);3837}3838for (int i = 0; i < unroll_factor2; ++i) {3839int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.3840if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }3841BE_swap_bytes(data1[idx]);3842vxor(data0[i], data0[i], data1[i]);3843if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);3844vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);3845}3846addi(buf, buf, 16 * unroll_factor2);3847}3848bdnz(L_inner_loop);38493850addi(cur_const, constants, outer_consts_size); // Reset38513852// Tail of last iteration (no loads).3853for (int i = 0; i < unroll_factor2 / 2; ++i) {3854BE_swap_bytes(data1[i + unroll_factor2 / 2]);3855vxor(data0[i], data0[i], data1[i]);3856vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);3857}3858for (int i = 0; i < unroll_factor2 / 2; ++i) {3859vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.3860vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);3861}38623863// Last data register is ok, other ones need fixup shift.3864for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {3865vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);3866}38673868// Combine to 128 bit result vector VCRC = data0[0].3869for (int i = 1; i < unroll_factor2; i<<=1) {3870for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {3871vxor(data0[j], data0[j], data0[j+i]);3872}3873}3874cmpd(CCR0, len, num_bytes);3875bge(CCR0, L_outer_loop);38763877// Last chance with lower num_bytes.3878bind(L_last);3879srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.3880// Point behind last const for inner loop.3881add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);3882sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.3883clrrdi(num_bytes, len, exact_log2(16 * 2 * 
unroll_factor2));3884subf(cur_const, R0, cur_const); // Point to constant to be used first.38853886addic_(loop_count, loop_count, -1); // One double-iteration peeled off.3887bgt(CCR0, L_outer_loop);3888// ********** Main loop end **********38893890// Restore DSCR pre-fetch value.3891if (VM_Version::has_mfdscr()) {3892load_const_optimized(t0, VM_Version::_dscr_val);3893mtdscr(t0);3894}38953896// ********** Simple loop for remaining 16 byte blocks **********3897{3898Label L_loop, L_done;38993900srdi_(t0, len, 4); // 16 bytes per iteration3901clrldi(len, len, 64-4);3902beq(CCR0, L_done);39033904// Point to const (same as last const for inner loop).3905add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);3906mtctr(t0);3907lvx(Vtmp2, cur_const);39083909align(32);3910bind(L_loop);39113912lvx(Vtmp, buf);3913addi(buf, buf, 16);3914vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.3915BE_swap_bytes(Vtmp);3916vxor(VCRC, VCRC, Vtmp);3917vpmsumw(VCRC, VCRC, Vtmp2);3918bdnz(L_loop);39193920bind(L_done);3921}3922// ********** Simple loop end **********3923#undef BE_swap_bytes39243925// Point to Barrett constants3926add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);39273928vspltisb(zeroes, 0);39293930// Combine to 64 bit result.3931vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.39323933// Reduce to 32 bit CRC: Remainder by multiply-high.3934lvx(Vtmp, cur_const);3935vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.3936vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.3937vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.3938vsldoi(Vtmp, zeroes, Vtmp, 8);3939vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.3940vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.39413942// Move result. len is already updated.3943vsldoi(VCRC, VCRC, zeroes, 8);3944mfvrd(crc, VCRC);39453946// Restore non-volatile Vector registers (frameless).3947offsetInt = 0;3948offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);3949offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);3950offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);3951offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);3952offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);3953offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);3954#ifndef VM_LITTLE_ENDIAN3955offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);3956#endif3957offsetInt -= 8; ld(R14, offsetInt, R1_SP);3958offsetInt -= 8; ld(R15, offsetInt, R1_SP);3959}39603961void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,3962Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {3963load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr()3964: StubRoutines::crc_table_addr() , R0);39653966if (VM_Version::has_vpmsumb()) {3967kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);3968} else {3969kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);3970}3971}39723973void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {3974assert_different_registers(crc, val, table);39753976BLOCK_COMMENT("kernel_crc32_singleByteReg:");3977if (invertCRC) {3978nand(crc, crc, crc); // 1s complement of crc3979}39803981update_byte_crc32(crc, val, table);39823983if (invertCRC) {3984nand(crc, crc, crc); // 1s complement of crc3985}3986}39873988// dest_lo += src1 + src23989// dest_hi += carry1 + carry23990void MacroAssembler::add2_with_carry(Register dest_hi,3991Register dest_lo,3992Register src1, Register src2) {3993li(R0, 0);3994addc(dest_lo, dest_lo, src1);3995adde(dest_hi, dest_hi, R0);3996addc(dest_lo, dest_lo, src2);3997adde(dest_hi, dest_hi, R0);3998}39994000// Multiply 64 bit by 64 bit first loop.4001void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,4002Register x_xstart,4003Register y, Register y_idx,4004Register z,4005Register carry,4006Register product_high, Register product,4007Register idx, Register kdx,4008Register tmp) {4009// jlong carry, x[], y[], z[];4010// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {4011// huge_128 product = y[idx] * x[xstart] + carry;4012// z[kdx] = (jlong)product;4013// carry = (jlong)(product >>> 64);4014// }4015// z[xstart] = carry;40164017Label L_first_loop, L_first_loop_exit;4018Label L_one_x, L_one_y, L_multiply;40194020addic_(xstart, xstart, -1);4021blt(CCR0, L_one_x); // Special case: length of x is 1.40224023// Load next two integers of x.4024sldi(tmp, xstart, LogBytesPerInt);4025ldx(x_xstart, x, tmp);4026#ifdef VM_LITTLE_ENDIAN4027rldicl(x_xstart, x_xstart, 32, 0);4028#endif40294030align(32, 16);4031bind(L_first_loop);40324033cmpdi(CCR0, idx, 1);4034blt(CCR0, L_first_loop_exit);4035addi(idx, idx, -2);4036beq(CCR0, L_one_y);40374038// Load next two integers of y.4039sldi(tmp, idx, LogBytesPerInt);4040ldx(y_idx, y, tmp);4041#ifdef VM_LITTLE_ENDIAN4042rldicl(y_idx, y_idx, 32, 0);4043#endif404440454046bind(L_multiply);4047multiply64(product_high, product, x_xstart, y_idx);40484049li(tmp, 0);4050addc(product, product, carry); // Add carry to result.4051adde(product_high, product_high, tmp); // Add carry of the last addition.4052addi(kdx, kdx, -2);40534054// Store result.4055#ifdef VM_LITTLE_ENDIAN4056rldicl(product, product, 32, 0);4057#endif4058sldi(tmp, kdx, LogBytesPerInt);4059stdx(product, z, tmp);4060mr_if_needed(carry, product_high);4061b(L_first_loop);406240634064bind(L_one_y); // Load one 32 bit portion of y as (0,value).40654066lwz(y_idx, 0, y);4067b(L_multiply);406840694070bind(L_one_x); // Load one 32 bit portion of x as (0,value).40714072lwz(x_xstart, 0, x);4073b(L_first_loop);40744075bind(L_first_loop_exit);4076}40774078// Multiply 64 bit by 64 bit and add 128 bit.4079void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,4080Register z, Register yz_idx,4081Register idx, Register carry,4082Register product_high, Register product,4083Register tmp, int offset) {40844085// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;4086// z[kdx] = (jlong)product;40874088sldi(tmp, idx, LogBytesPerInt);4089if (offset) {4090addi(tmp, tmp, offset);4091}4092ldx(yz_idx, y, tmp);4093#ifdef 
VM_LITTLE_ENDIAN4094rldicl(yz_idx, yz_idx, 32, 0);4095#endif40964097multiply64(product_high, product, x_xstart, yz_idx);4098ldx(yz_idx, z, tmp);4099#ifdef VM_LITTLE_ENDIAN4100rldicl(yz_idx, yz_idx, 32, 0);4101#endif41024103add2_with_carry(product_high, product, carry, yz_idx);41044105sldi(tmp, idx, LogBytesPerInt);4106if (offset) {4107addi(tmp, tmp, offset);4108}4109#ifdef VM_LITTLE_ENDIAN4110rldicl(product, product, 32, 0);4111#endif4112stdx(product, z, tmp);4113}41144115// Multiply 128 bit by 128 bit. Unrolled inner loop.4116void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,4117Register y, Register z,4118Register yz_idx, Register idx, Register carry,4119Register product_high, Register product,4120Register carry2, Register tmp) {41214122// jlong carry, x[], y[], z[];4123// int kdx = ystart+1;4124// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop4125// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;4126// z[kdx+idx+1] = (jlong)product;4127// jlong carry2 = (jlong)(product >>> 64);4128// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;4129// z[kdx+idx] = (jlong)product;4130// carry = (jlong)(product >>> 64);4131// }4132// idx += 2;4133// if (idx > 0) {4134// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;4135// z[kdx+idx] = (jlong)product;4136// carry = (jlong)(product >>> 64);4137// }41384139Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;4140const Register jdx = R0;41414142// Scale the index.4143srdi_(jdx, idx, 2);4144beq(CCR0, L_third_loop_exit);4145mtctr(jdx);41464147align(32, 16);4148bind(L_third_loop);41494150addi(idx, idx, -4);41514152multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);4153mr_if_needed(carry2, product_high);41544155multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);4156mr_if_needed(carry, product_high);4157bdnz(L_third_loop);41584159bind(L_third_loop_exit); // Handle any left-over operand parts.41604161andi_(idx, idx, 0x3);4162beq(CCR0, L_post_third_loop_done);41634164Label L_check_1;41654166addic_(idx, idx, -2);4167blt(CCR0, L_check_1);41684169multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);4170mr_if_needed(carry, product_high);41714172bind(L_check_1);41734174addi(idx, idx, 0x2);4175andi_(idx, idx, 0x1);4176addic_(idx, idx, -1);4177blt(CCR0, L_post_third_loop_done);41784179sldi(tmp, idx, LogBytesPerInt);4180lwzx(yz_idx, y, tmp);4181multiply64(product_high, product, x_xstart, yz_idx);4182lwzx(yz_idx, z, tmp);41834184add2_with_carry(product_high, product, yz_idx, carry);41854186sldi(tmp, idx, LogBytesPerInt);4187stwx(product, z, tmp);4188srdi(product, product, 32);41894190sldi(product_high, product_high, 32);4191orr(product, product, product_high);4192mr_if_needed(carry, product);41934194bind(L_post_third_loop_done);4195} // multiply_128_x_128_loop41964197void MacroAssembler::muladd(Register out, Register in,4198Register offset, Register len, Register k,4199Register tmp1, Register tmp2, Register carry) {42004201// Labels4202Label LOOP, SKIP;42034204// Make sure length is positive.4205cmpdi (CCR0, len, 0);42064207// Prepare variables4208subi (offset, offset, 4);4209li (carry, 0);4210ble (CCR0, SKIP);42114212mtctr (len);4213subi (len, len, 1 );4214sldi (len, len, 2 );42154216// Main loop4217bind(LOOP);4218lwzx (tmp1, len, in );4219lwzx (tmp2, offset, out );4220mulld (tmp1, tmp1, k );4221add (tmp2, carry, tmp2 );4222add (tmp2, tmp1, tmp2 );4223stwx (tmp2, offset, out );4224srdi (carry, tmp2, 
32 );4225subi (offset, offset, 4 );4226subi (len, len, 4 );4227bdnz (LOOP);4228bind(SKIP);4229}42304231void MacroAssembler::multiply_to_len(Register x, Register xlen,4232Register y, Register ylen,4233Register z, Register zlen,4234Register tmp1, Register tmp2,4235Register tmp3, Register tmp4,4236Register tmp5, Register tmp6,4237Register tmp7, Register tmp8,4238Register tmp9, Register tmp10,4239Register tmp11, Register tmp12,4240Register tmp13) {42414242ShortBranchVerifier sbv(this);42434244assert_different_registers(x, xlen, y, ylen, z, zlen,4245tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);4246assert_different_registers(x, xlen, y, ylen, z, zlen,4247tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);4248assert_different_registers(x, xlen, y, ylen, z, zlen,4249tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);42504251const Register idx = tmp1;4252const Register kdx = tmp2;4253const Register xstart = tmp3;42544255const Register y_idx = tmp4;4256const Register carry = tmp5;4257const Register product = tmp6;4258const Register product_high = tmp7;4259const Register x_xstart = tmp8;4260const Register tmp = tmp9;42614262// First Loop.4263//4264// final static long LONG_MASK = 0xffffffffL;4265// int xstart = xlen - 1;4266// int ystart = ylen - 1;4267// long carry = 0;4268// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {4269// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;4270// z[kdx] = (int)product;4271// carry = product >>> 32;4272// }4273// z[xstart] = (int)carry;42744275mr_if_needed(idx, ylen); // idx = ylen4276mr_if_needed(kdx, zlen); // kdx = xlen + ylen4277li(carry, 0); // carry = 042784279Label L_done;42804281addic_(xstart, xlen, -1);4282blt(CCR0, L_done);42834284multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,4285carry, product_high, product, idx, kdx, tmp);42864287Label L_second_loop;42884289cmpdi(CCR0, kdx, 0);4290beq(CCR0, L_second_loop);42914292Label L_carry;42934294addic_(kdx, kdx, -1);4295beq(CCR0, L_carry);42964297// Store lower 32 bits of carry.4298sldi(tmp, kdx, LogBytesPerInt);4299stwx(carry, z, tmp);4300srdi(carry, carry, 32);4301addi(kdx, kdx, -1);430243034304bind(L_carry);43054306// Store upper 32 bits of carry.4307sldi(tmp, kdx, LogBytesPerInt);4308stwx(carry, z, tmp);43094310// Second and third (nested) loops.4311//4312// for (int i = xstart-1; i >= 0; i--) { // Second loop4313// carry = 0;4314// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop4315// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +4316// (z[k] & LONG_MASK) + carry;4317// z[k] = (int)product;4318// carry = product >>> 32;4319// }4320// z[i] = (int)carry;4321// }4322//4323// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx43244325bind(L_second_loop);43264327li(carry, 0); // carry = 0;43284329addic_(xstart, xstart, -1); // i = xstart-1;4330blt(CCR0, L_done);43314332Register zsave = tmp10;43334334mr(zsave, z);433543364337Label L_last_x;43384339sldi(tmp, xstart, LogBytesPerInt);4340add(z, z, tmp); // z = z + k - j4341addi(z, z, 4);4342addic_(xstart, xstart, -1); // i = xstart-1;4343blt(CCR0, L_last_x);43444345sldi(tmp, xstart, LogBytesPerInt);4346ldx(x_xstart, x, tmp);4347#ifdef VM_LITTLE_ENDIAN4348rldicl(x_xstart, x_xstart, 32, 0);4349#endif435043514352Label L_third_loop_prologue;43534354bind(L_third_loop_prologue);43554356Register xsave = tmp11;4357Register xlensave = tmp12;4358Register ylensave = tmp13;43594360mr(xsave, x);4361mr(xlensave, xstart);4362mr(ylensave, ylen);436343644365multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,4366carry, product_high, 
product, x, tmp);43674368mr(z, zsave);4369mr(x, xsave);4370mr(xlen, xlensave); // This is the decrement of the loop counter!4371mr(ylen, ylensave);43724373addi(tmp3, xlen, 1);4374sldi(tmp, tmp3, LogBytesPerInt);4375stwx(carry, z, tmp);4376addic_(tmp3, tmp3, -1);4377blt(CCR0, L_done);43784379srdi(carry, carry, 32);4380sldi(tmp, tmp3, LogBytesPerInt);4381stwx(carry, z, tmp);4382b(L_second_loop);43834384// Next infrequent code is moved outside loops.4385bind(L_last_x);43864387lwz(x_xstart, 0, x);4388b(L_third_loop_prologue);43894390bind(L_done);4391} // multiply_to_len43924393void MacroAssembler::asm_assert(bool check_equal, const char *msg) {4394#ifdef ASSERT4395Label ok;4396if (check_equal) {4397beq(CCR0, ok);4398} else {4399bne(CCR0, ok);4400}4401stop(msg);4402bind(ok);4403#endif4404}44054406void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,4407Register mem_base, const char* msg) {4408#ifdef ASSERT4409switch (size) {4410case 4:4411lwz(R0, mem_offset, mem_base);4412cmpwi(CCR0, R0, 0);4413break;4414case 8:4415ld(R0, mem_offset, mem_base);4416cmpdi(CCR0, R0, 0);4417break;4418default:4419ShouldNotReachHere();4420}4421asm_assert(check_equal, msg);4422#endif // ASSERT4423}44244425void MacroAssembler::verify_thread() {4426if (VerifyThread) {4427unimplemented("'VerifyThread' currently not implemented on PPC");4428}4429}44304431void MacroAssembler::verify_coop(Register coop, const char* msg) {4432if (!VerifyOops) { return; }4433if (UseCompressedOops) { decode_heap_oop(coop); }4434verify_oop(coop, msg);4435if (UseCompressedOops) { encode_heap_oop(coop, coop); }4436}44374438// READ: oop. KILL: R0. Volatile floats perhaps.4439void MacroAssembler::verify_oop(Register oop, const char* msg) {4440if (!VerifyOops) {4441return;4442}44434444address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();4445const Register tmp = R11; // Will be preserved.4446const int nbytes_save = MacroAssembler::num_volatile_regs * 8;44474448BLOCK_COMMENT("verify_oop {");44494450save_volatile_gprs(R1_SP, -nbytes_save); // except R044514452mr_if_needed(R4_ARG2, oop);4453save_LR_CR(tmp); // save in old frame4454push_frame_reg_args(nbytes_save, tmp);4455// load FunctionDescriptor** / entry_address *4456load_const_optimized(tmp, fd, R0);4457// load FunctionDescriptor* / entry_address4458ld(tmp, 0, tmp);4459load_const_optimized(R3_ARG1, (address)msg, R0);4460// Call destination for its side effect.4461call_c(tmp);44624463pop_frame();4464restore_LR_CR(tmp);4465restore_volatile_gprs(R1_SP, -nbytes_save); // except R044664467BLOCK_COMMENT("} verify_oop");4468}44694470void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {4471if (!VerifyOops) {4472return;4473}44744475address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();4476const Register tmp = R11; // Will be preserved.4477const int nbytes_save = MacroAssembler::num_volatile_regs * 8;4478save_volatile_gprs(R1_SP, -nbytes_save); // except R044794480ld(R4_ARG2, offs, base);4481save_LR_CR(tmp); // save in old frame4482push_frame_reg_args(nbytes_save, tmp);4483// load FunctionDescriptor** / entry_address *4484load_const_optimized(tmp, fd, R0);4485// load FunctionDescriptor* / entry_address4486ld(tmp, 0, tmp);4487load_const_optimized(R3_ARG1, (address)msg, R0);4488// Call destination for its side effect.4489call_c(tmp);44904491pop_frame();4492restore_LR_CR(tmp);4493restore_volatile_gprs(R1_SP, -nbytes_save); // except R04494}44954496// Call a 
C-function that prints output.
void MacroAssembler::stop(int type, const char* msg) {
  bool msg_present = (msg != NULL);

#ifndef PRODUCT
  block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
#else
  block_comment("stop {");
#endif

  if (msg_present) {
    type |= stop_msg_present;
  }
  tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
  if (msg_present) {
    emit_int64((uintptr_t)msg);
  }

  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.index() == noreg, "index should be noreg");
  assert(line.disp() == 0, "displacement should be 0");
  assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
  // Data Cache Store, not really a flush, so it works like a sync of cache
  // line and persistent mem, i.e. copying the cache line to persistent whilst
  // not invalidating the cache line.
  dcbst(line.base());
}

void MacroAssembler::cache_wbsync(bool is_presync) {
  assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
  // We only need a post sync barrier. Post means _after_ a cache line flush or
  // store instruction, pre means a barrier emitted before such an instruction.
  if (!is_presync) {
    fence();
  }
}
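// Usage note (illustrative sketch, not part of the original file): SkipIfEqualZero is
// an RAII helper; code emitted between its construction and destruction is skipped at
// runtime whenever the watched bool flag reads zero. A hypothetical emitter might use it as:
//
//   {
//     SkipIfEqualZero skip_if(masm, temp_reg, &SomeDiagnosticFlag);  // flag name is an example only
//     // ... emit code that should only execute when SomeDiagnosticFlag != 0 ...
//   }  // destructor binds the skip-target label here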