Path: blob/aarch64-shenandoah-jdk8u272-b10/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp
32285 views
/*1* Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.2* Copyright (c) 2012, 2017, SAP SE. All rights reserved.3* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.4*5* This code is free software; you can redistribute it and/or modify it6* under the terms of the GNU General Public License version 2 only, as7* published by the Free Software Foundation.8*9* This code is distributed in the hope that it will be useful, but WITHOUT10* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or11* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License12* version 2 for more details (a copy is included in the LICENSE file that13* accompanied this code).14*15* You should have received a copy of the GNU General Public License version16* 2 along with this work; if not, write to the Free Software Foundation,17* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.18*19* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA20* or visit www.oracle.com if you need additional information or have any21* questions.22*23*/2425#include "precompiled.hpp"26#include "asm/macroAssembler.inline.hpp"27#include "compiler/disassembler.hpp"28#include "gc_interface/collectedHeap.inline.hpp"29#include "interpreter/interpreter.hpp"30#include "memory/cardTableModRefBS.hpp"31#include "memory/resourceArea.hpp"32#include "prims/methodHandles.hpp"33#include "runtime/biasedLocking.hpp"34#include "runtime/interfaceSupport.hpp"35#include "runtime/objectMonitor.hpp"36#include "runtime/os.hpp"37#include "runtime/sharedRuntime.hpp"38#include "runtime/stubRoutines.hpp"39#include "utilities/macros.hpp"40#if INCLUDE_ALL_GCS41#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"42#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"43#include "gc_implementation/g1/heapRegion.hpp"44#endif // INCLUDE_ALL_GCS4546#ifdef PRODUCT47#define BLOCK_COMMENT(str) // nothing48#else49#define BLOCK_COMMENT(str) block_comment(str)50#endif51#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")5253#ifdef ASSERT54// On RISC, there's no benefit to verifying instruction boundaries.55bool AbstractAssembler::pd_check_instruction_mark() { return false; }56#endif5758void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {59assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");60if (Assembler::is_simm(si31, 16)) {61ld(d, si31, a);62if (emit_filler_nop) nop();63} else {64const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);65const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);66addis(d, a, hi);67ld(d, lo, d);68}69}7071void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {72assert_different_registers(d, a);73ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);74}7576void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,77size_t size_in_bytes, bool is_signed) {78switch (size_in_bytes) {79case 8: ld(dst, offs, base); break;80case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;81case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;82case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(83default: ShouldNotReachHere();84}85}8687void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,88size_t size_in_bytes) {89switch (size_in_bytes) {90case 8: std(dst, offs, base); break;91case 4: stw(dst, offs, base); break;92case 2: sth(dst, offs, base); break;93case 1: stb(dst, offs, base); break;94default: ShouldNotReachHere();95}96}9798void MacroAssembler::align(int modulus, int max, int rem) {99int padding = (rem + modulus - (offset() % modulus)) % modulus;100if (padding > max) return;101for (int c = (padding >> 2); c > 0; --c) { nop(); }102}103104// Issue instructions that calculate given TOC from global TOC.105void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,106bool add_relocation, bool emit_dummy_addr) {107int offset = -1;108if (emit_dummy_addr) {109offset = -128; // dummy address110} else if (addr != (address)(intptr_t)-1) {111offset = MacroAssembler::offset_to_global_toc(addr);112}113114if (hi16) {115addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));116}117if (lo16) {118if (add_relocation) {119// Relocate at the addi to avoid confusion with a load from the method's TOC.120relocate(internal_word_Relocation::spec(addr));121}122addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));123}124}125126int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {127const int offset = MacroAssembler::offset_to_global_toc(addr);128129const address inst2_addr = a;130const int inst2 = *(int *)inst2_addr;131132// The relocation points to the second instruction, the addi,133// and the addi reads and writes the same register dst.134const int dst = inv_rt_field(inst2);135assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");136137// Now, find the preceding addis which writes to dst.138int inst1 = 0;139address inst1_addr = inst2_addr - BytesPerInstWord;140while (inst1_addr >= bound) {141inst1 = *(int *) inst1_addr;142if (is_addis(inst1) && inv_rt_field(inst1) == dst) {143// Stop, found the addis which writes dst.144break;145}146inst1_addr -= BytesPerInstWord;147}148149assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");150set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));151set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));152return (int)((intptr_t)addr - (intptr_t)inst1_addr);153}154155address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {156const address inst2_addr = a;157const int inst2 = *(int *)inst2_addr;158159// The relocation points to the second instruction, the addi,160// and the addi reads and writes the same register dst.161const int dst = inv_rt_field(inst2);162assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");163164// Now, find the preceding addis which writes to dst.165int inst1 = 0;166address inst1_addr = inst2_addr - BytesPerInstWord;167while (inst1_addr >= bound) {168inst1 = *(int *) inst1_addr;169if (is_addis(inst1) && inv_rt_field(inst1) == dst) {170// stop, found the addis which writes dst171break;172}173inst1_addr -= BytesPerInstWord;174}175176assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");177178int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);179// -1 is a special case180if (offset == -1) {181return (address)(intptr_t)-1;182} else {183return global_toc() + offset;184}185}186187#ifdef _LP64188// Patch compressed oops or klass constants.189// Assembler sequence is190// 1) compressed oops:191// lis rx = const.hi192// ori rx = rx | const.lo193// 2) compressed klass:194// lis rx = const.hi195// clrldi rx = rx & 0xFFFFffff // clearMS32b, optional196// ori rx = rx | const.lo197// Clrldi will be passed by.198int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {199assert(UseCompressedOops, "Should only patch compressed oops");200201const address inst2_addr = a;202const int inst2 = *(int *)inst2_addr;203204// The relocation points to the second instruction, the ori,205// and the ori reads and writes the same register dst.206const int dst = inv_rta_field(inst2);207assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");208// Now, find the preceding addis which writes to dst.209int inst1 = 0;210address inst1_addr = inst2_addr - BytesPerInstWord;211bool inst1_found = false;212while (inst1_addr >= bound) {213inst1 = *(int *)inst1_addr;214if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }215inst1_addr -= BytesPerInstWord;216}217assert(inst1_found, "inst is not lis");218219int xc = (data >> 16) & 0xffff;220int xd = (data >> 0) & 0xffff;221222set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo223set_imm((int *)inst2_addr, (xd)); // unsigned int224return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);225}226227// Get compressed oop or klass constant.228narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {229assert(UseCompressedOops, "Should only patch compressed oops");230231const address inst2_addr = a;232const int inst2 = *(int *)inst2_addr;233234// The relocation points to the second instruction, the ori,235// and the ori reads and writes the same register dst.236const int dst = inv_rta_field(inst2);237assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");238// Now, find the preceding lis which writes to dst.239int inst1 = 0;240address inst1_addr = inst2_addr - BytesPerInstWord;241bool inst1_found = false;242243while (inst1_addr >= bound) {244inst1 = *(int *) inst1_addr;245if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}246inst1_addr -= BytesPerInstWord;247}248assert(inst1_found, "inst is not lis");249250uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));251uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);252253return (int) (xl | xh);254}255#endif // _LP64256257void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {258int toc_offset = 0;259// Use RelocationHolder::none for the constant pool entry, otherwise260// we will end up with a failing NativeCall::verify(x) where x is261// the address of the constant pool entry.262// FIXME: We should insert relocation information for oops at the constant263// pool entries instead of inserting it at the loads; patching of a constant264// pool entry should be less expensive.265address oop_address = address_constant((address)a.value(), RelocationHolder::none);266// Relocate at the pc of the load.267relocate(a.rspec());268toc_offset = (int)(oop_address - code()->consts()->start());269ld_largeoffset_unchecked(dst, toc_offset, toc, true);270}271272bool MacroAssembler::is_load_const_from_method_toc_at(address a) {273const address inst1_addr = a;274const int inst1 = *(int *)inst1_addr;275276// The relocation points to the ld or the addis.277return (is_ld(inst1)) ||278(is_addis(inst1) && inv_ra_field(inst1) != 0);279}280281int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {282assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");283284const address inst1_addr = a;285const int inst1 = *(int *)inst1_addr;286287if (is_ld(inst1)) {288return inv_d1_field(inst1);289} else if (is_addis(inst1)) {290const int dst = inv_rt_field(inst1);291292// Now, find the succeeding ld which reads and writes to dst.293address inst2_addr = inst1_addr + BytesPerInstWord;294int inst2 = 0;295while (true) {296inst2 = *(int *) inst2_addr;297if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {298// Stop, found the ld which reads and writes dst.299break;300}301inst2_addr += BytesPerInstWord;302}303return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);304}305ShouldNotReachHere();306return 0;307}308309// Get the constant from a `load_const' sequence.310long MacroAssembler::get_const(address a) {311assert(is_load_const_at(a), "not a load of a constant");312const int *p = (const int*) a;313unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);314if (is_ori(*(p+1))) {315x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);316x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);317x |= (((unsigned long) (get_imm(a,4) & 0xffff)));318} else if (is_lis(*(p+1))) {319x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);320x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);321x |= (((unsigned long) (get_imm(a,3) & 0xffff)));322} else {323ShouldNotReachHere();324return (long) 0;325}326return (long) x;327}328329// Patch the 64 bit constant of a `load_const' sequence. This is a low330// level procedure. It neither flushes the instruction cache nor is it331// mt safe.332void MacroAssembler::patch_const(address a, long x) {333assert(is_load_const_at(a), "not a load of a constant");334int *p = (int*) a;335if (is_ori(*(p+1))) {336set_imm(0 + p, (x >> 48) & 0xffff);337set_imm(1 + p, (x >> 32) & 0xffff);338set_imm(3 + p, (x >> 16) & 0xffff);339set_imm(4 + p, x & 0xffff);340} else if (is_lis(*(p+1))) {341set_imm(0 + p, (x >> 48) & 0xffff);342set_imm(2 + p, (x >> 32) & 0xffff);343set_imm(1 + p, (x >> 16) & 0xffff);344set_imm(3 + p, x & 0xffff);345} else {346ShouldNotReachHere();347}348}349350AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {351assert(oop_recorder() != NULL, "this assembler needs a Recorder");352int index = oop_recorder()->allocate_metadata_index(obj);353RelocationHolder rspec = metadata_Relocation::spec(index);354return AddressLiteral((address)obj, rspec);355}356357AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {358assert(oop_recorder() != NULL, "this assembler needs a Recorder");359int index = oop_recorder()->find_index(obj);360RelocationHolder rspec = metadata_Relocation::spec(index);361return AddressLiteral((address)obj, rspec);362}363364AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {365assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");366int oop_index = oop_recorder()->allocate_oop_index(obj);367return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));368}369370AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {371assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");372int oop_index = oop_recorder()->find_index(obj);373return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));374}375376RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,377Register tmp, int offset) {378intptr_t value = *delayed_value_addr;379if (value != 0) {380return RegisterOrConstant(value + offset);381}382383// Load indirectly to solve generation ordering problem.384// static address, no relocation385int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);386ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)387388if (offset != 0) {389addi(tmp, tmp, offset);390}391392return RegisterOrConstant(tmp);393}394395#ifndef PRODUCT396void MacroAssembler::pd_print_patched_instruction(address branch) {397Unimplemented(); // TODO: PPC port398}399#endif // ndef PRODUCT400401// Conditional far branch for destinations encodable in 24+2 bits.402void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {403404// If requested by flag optimize, relocate the bc_far as a405// runtime_call and prepare for optimizing it when the code gets406// relocated.407if (optimize == bc_far_optimize_on_relocate) {408relocate(relocInfo::runtime_call_type);409}410411// variant 2:412//413// b!cxx SKIP414// bxx DEST415// SKIP:416//417418const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),419opposite_bcond(inv_boint_bcond(boint)));420421// We emit two branches.422// First, a conditional branch which jumps around the far branch.423const address not_taken_pc = pc() + 2 * BytesPerInstWord;424const address bc_pc = pc();425bc(opposite_boint, biint, not_taken_pc);426427const int bc_instr = *(int*)bc_pc;428assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");429assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");430assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),431opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),432"postcondition");433assert(biint == inv_bi_field(bc_instr), "postcondition");434435// Second, an unconditional far branch which jumps to dest.436// Note: target(dest) remembers the current pc (see CodeSection::target)437// and returns the current pc if the label is not bound yet; when438// the label gets bound, the unconditional far branch will be patched.439const address target_pc = target(dest);440const address b_pc = pc();441b(target_pc);442443assert(not_taken_pc == pc(), "postcondition");444assert(dest.is_bound() || target_pc == b_pc, "postcondition");445}446447bool MacroAssembler::is_bc_far_at(address instruction_addr) {448return is_bc_far_variant1_at(instruction_addr) ||449is_bc_far_variant2_at(instruction_addr) ||450is_bc_far_variant3_at(instruction_addr);451}452453address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {454if (is_bc_far_variant1_at(instruction_addr)) {455const address instruction_1_addr = instruction_addr;456const int instruction_1 = *(int*)instruction_1_addr;457return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);458} else if (is_bc_far_variant2_at(instruction_addr)) {459const address instruction_2_addr = instruction_addr + 4;460return bxx_destination(instruction_2_addr);461} else if (is_bc_far_variant3_at(instruction_addr)) {462return instruction_addr + 8;463}464// variant 4 ???465ShouldNotReachHere();466return NULL;467}468void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {469470if (is_bc_far_variant3_at(instruction_addr)) {471// variant 3, far cond branch to the next instruction, already patched to nops:472//473// nop474// endgroup475// SKIP/DEST:476//477return;478}479480// first, extract boint and biint from the current branch481int boint = 0;482int biint = 0;483484ResourceMark rm;485const int code_size = 2 * BytesPerInstWord;486CodeBuffer buf(instruction_addr, code_size);487MacroAssembler masm(&buf);488if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {489// Far branch to next instruction: Optimize it by patching nops (produce variant 3).490masm.nop();491masm.endgroup();492} else {493if (is_bc_far_variant1_at(instruction_addr)) {494// variant 1, the 1st instruction contains the destination address:495//496// bcxx DEST497// endgroup498//499const int instruction_1 = *(int*)(instruction_addr);500boint = inv_bo_field(instruction_1);501biint = inv_bi_field(instruction_1);502} else if (is_bc_far_variant2_at(instruction_addr)) {503// variant 2, the 2nd instruction contains the destination address:504//505// b!cxx SKIP506// bxx DEST507// SKIP:508//509const int instruction_1 = *(int*)(instruction_addr);510boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),511opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));512biint = inv_bi_field(instruction_1);513} else {514// variant 4???515ShouldNotReachHere();516}517518// second, set the new branch destination and optimize the code519if (dest != instruction_addr + 4 && // the bc_far is still unbound!520masm.is_within_range_of_bcxx(dest, instruction_addr)) {521// variant 1:522//523// bcxx DEST524// endgroup525//526masm.bc(boint, biint, dest);527masm.endgroup();528} else {529// variant 2:530//531// b!cxx SKIP532// bxx DEST533// SKIP:534//535const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),536opposite_bcond(inv_boint_bcond(boint)));537const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;538masm.bc(opposite_boint, biint, not_taken_pc);539masm.b(dest);540}541}542ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);543}544545// Emit a NOT mt-safe patchable 64 bit absolute call/jump.546void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {547// get current pc548uint64_t start_pc = (uint64_t) pc();549550const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last551const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first552553// relocate here554if (rt != relocInfo::none) {555relocate(rt);556}557558if ( ReoptimizeCallSequences &&559(( link && is_within_range_of_b(dest, pc_of_bl)) ||560(!link && is_within_range_of_b(dest, pc_of_b)))) {561// variant 2:562// Emit an optimized, pc-relative call/jump.563564if (link) {565// some padding566nop();567nop();568nop();569nop();570nop();571nop();572573// do the call574assert(pc() == pc_of_bl, "just checking");575bl(dest, relocInfo::none);576} else {577// do the jump578assert(pc() == pc_of_b, "just checking");579b(dest, relocInfo::none);580581// some padding582nop();583nop();584nop();585nop();586nop();587nop();588}589590// Assert that we can identify the emitted call/jump.591assert(is_bxx64_patchable_variant2_at((address)start_pc, link),592"can't identify emitted call");593} else {594// variant 1:595mr(R0, R11); // spill R11 -> R0.596597// Load the destination address into CTR,598// calculate destination relative to global toc.599calculate_address_from_global_toc(R11, dest, true, true, false);600601mtctr(R11);602mr(R11, R0); // spill R11 <- R0.603nop();604605// do the call/jump606if (link) {607bctrl();608} else{609bctr();610}611// Assert that we can identify the emitted call/jump.612assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),613"can't identify emitted call");614}615616// Assert that we can identify the emitted call/jump.617assert(is_bxx64_patchable_at((address)start_pc, link),618"can't identify emitted call");619assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,620"wrong encoding of dest address");621}622623// Identify a bxx64_patchable instruction.624bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {625return is_bxx64_patchable_variant1b_at(instruction_addr, link)626//|| is_bxx64_patchable_variant1_at(instruction_addr, link)627|| is_bxx64_patchable_variant2_at(instruction_addr, link);628}629630// Does the call64_patchable instruction use a pc-relative encoding of631// the call destination?632bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {633// variant 2 is pc-relative634return is_bxx64_patchable_variant2_at(instruction_addr, link);635}636637// Identify variant 1.638bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {639unsigned int* instr = (unsigned int*) instruction_addr;640return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]641&& is_mtctr(instr[5]) // mtctr642&& is_load_const_at(instruction_addr);643}644645// Identify variant 1b: load destination relative to global toc.646bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {647unsigned int* instr = (unsigned int*) instruction_addr;648return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]649&& is_mtctr(instr[3]) // mtctr650&& is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);651}652653// Identify variant 2.654bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {655unsigned int* instr = (unsigned int*) instruction_addr;656if (link) {657return is_bl (instr[6]) // bl dest is last658&& is_nop(instr[0]) // nop659&& is_nop(instr[1]) // nop660&& is_nop(instr[2]) // nop661&& is_nop(instr[3]) // nop662&& is_nop(instr[4]) // nop663&& is_nop(instr[5]); // nop664} else {665return is_b (instr[0]) // b dest is first666&& is_nop(instr[1]) // nop667&& is_nop(instr[2]) // nop668&& is_nop(instr[3]) // nop669&& is_nop(instr[4]) // nop670&& is_nop(instr[5]) // nop671&& is_nop(instr[6]); // nop672}673}674675// Set dest address of a bxx64_patchable instruction.676void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {677ResourceMark rm;678int code_size = MacroAssembler::bxx64_patchable_size;679CodeBuffer buf(instruction_addr, code_size);680MacroAssembler masm(&buf);681masm.bxx64_patchable(dest, relocInfo::none, link);682ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);683}684685// Get dest address of a bxx64_patchable instruction.686address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {687if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {688return (address) (unsigned long) get_const(instruction_addr);689} else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {690unsigned int* instr = (unsigned int*) instruction_addr;691if (link) {692const int instr_idx = 6; // bl is last693int branchoffset = branch_destination(instr[instr_idx], 0);694return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;695} else {696const int instr_idx = 0; // b is first697int branchoffset = branch_destination(instr[instr_idx], 0);698return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;699}700// Load dest relative to global toc.701} else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {702return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,703instruction_addr);704} else {705ShouldNotReachHere();706return NULL;707}708}709710// Uses ordering which corresponds to ABI:711// _savegpr0_14: std r14,-144(r1)712// _savegpr0_15: std r15,-136(r1)713// _savegpr0_16: std r16,-128(r1)714void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {715std(R14, offset, dst); offset += 8;716std(R15, offset, dst); offset += 8;717std(R16, offset, dst); offset += 8;718std(R17, offset, dst); offset += 8;719std(R18, offset, dst); offset += 8;720std(R19, offset, dst); offset += 8;721std(R20, offset, dst); offset += 8;722std(R21, offset, dst); offset += 8;723std(R22, offset, dst); offset += 8;724std(R23, offset, dst); offset += 8;725std(R24, offset, dst); offset += 8;726std(R25, offset, dst); offset += 8;727std(R26, offset, dst); offset += 8;728std(R27, offset, dst); offset += 8;729std(R28, offset, dst); offset += 8;730std(R29, offset, dst); offset += 8;731std(R30, offset, dst); offset += 8;732std(R31, offset, dst); offset += 8;733734stfd(F14, offset, dst); offset += 8;735stfd(F15, offset, dst); offset += 8;736stfd(F16, offset, dst); offset += 8;737stfd(F17, offset, dst); offset += 8;738stfd(F18, offset, dst); offset += 8;739stfd(F19, offset, dst); offset += 8;740stfd(F20, offset, dst); offset += 8;741stfd(F21, offset, dst); offset += 8;742stfd(F22, offset, dst); offset += 8;743stfd(F23, offset, dst); offset += 8;744stfd(F24, offset, dst); offset += 8;745stfd(F25, offset, dst); offset += 8;746stfd(F26, offset, dst); offset += 8;747stfd(F27, offset, dst); offset += 8;748stfd(F28, offset, dst); offset += 8;749stfd(F29, offset, dst); offset += 8;750stfd(F30, offset, dst); offset += 8;751stfd(F31, offset, dst);752}753754// Uses ordering which corresponds to ABI:755// _restgpr0_14: ld r14,-144(r1)756// _restgpr0_15: ld r15,-136(r1)757// _restgpr0_16: ld r16,-128(r1)758void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {759ld(R14, offset, src); offset += 8;760ld(R15, offset, src); offset += 8;761ld(R16, offset, src); offset += 8;762ld(R17, offset, src); offset += 8;763ld(R18, offset, src); offset += 8;764ld(R19, offset, src); offset += 8;765ld(R20, offset, src); offset += 8;766ld(R21, offset, src); offset += 8;767ld(R22, offset, src); offset += 8;768ld(R23, offset, src); offset += 8;769ld(R24, offset, src); offset += 8;770ld(R25, offset, src); offset += 8;771ld(R26, offset, src); offset += 8;772ld(R27, offset, src); offset += 8;773ld(R28, offset, src); offset += 8;774ld(R29, offset, src); offset += 8;775ld(R30, offset, src); offset += 8;776ld(R31, offset, src); offset += 8;777778// FP registers779lfd(F14, offset, src); offset += 8;780lfd(F15, offset, src); offset += 8;781lfd(F16, offset, src); offset += 8;782lfd(F17, offset, src); offset += 8;783lfd(F18, offset, src); offset += 8;784lfd(F19, offset, src); offset += 8;785lfd(F20, offset, src); offset += 8;786lfd(F21, offset, src); offset += 8;787lfd(F22, offset, src); offset += 8;788lfd(F23, offset, src); offset += 8;789lfd(F24, offset, src); offset += 8;790lfd(F25, offset, src); offset += 8;791lfd(F26, offset, src); offset += 8;792lfd(F27, offset, src); offset += 8;793lfd(F28, offset, src); offset += 8;794lfd(F29, offset, src); offset += 8;795lfd(F30, offset, src); offset += 8;796lfd(F31, offset, src);797}798799// For verify_oops.800void MacroAssembler::save_volatile_gprs(Register dst, int offset) {801std(R2, offset, dst); offset += 8;802std(R3, offset, dst); offset += 8;803std(R4, offset, dst); offset += 8;804std(R5, offset, dst); offset += 8;805std(R6, offset, dst); offset += 8;806std(R7, offset, dst); offset += 8;807std(R8, offset, dst); offset += 8;808std(R9, offset, dst); offset += 8;809std(R10, offset, dst); offset += 8;810std(R11, offset, dst); offset += 8;811std(R12, offset, dst);812}813814// For verify_oops.815void MacroAssembler::restore_volatile_gprs(Register src, int offset) {816ld(R2, offset, src); offset += 8;817ld(R3, offset, src); offset += 8;818ld(R4, offset, src); offset += 8;819ld(R5, offset, src); offset += 8;820ld(R6, offset, src); offset += 8;821ld(R7, offset, src); offset += 8;822ld(R8, offset, src); offset += 8;823ld(R9, offset, src); offset += 8;824ld(R10, offset, src); offset += 8;825ld(R11, offset, src); offset += 8;826ld(R12, offset, src);827}828829void MacroAssembler::save_LR_CR(Register tmp) {830mfcr(tmp);831std(tmp, _abi(cr), R1_SP);832mflr(tmp);833std(tmp, _abi(lr), R1_SP);834// Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)835}836837void MacroAssembler::restore_LR_CR(Register tmp) {838assert(tmp != R1_SP, "must be distinct");839ld(tmp, _abi(lr), R1_SP);840mtlr(tmp);841ld(tmp, _abi(cr), R1_SP);842mtcr(tmp);843}844845address MacroAssembler::get_PC_trash_LR(Register result) {846Label L;847bl(L);848bind(L);849address lr_pc = pc();850mflr(result);851return lr_pc;852}853854void MacroAssembler::resize_frame(Register offset, Register tmp) {855#ifdef ASSERT856assert_different_registers(offset, tmp, R1_SP);857andi_(tmp, offset, frame::alignment_in_bytes-1);858asm_assert_eq("resize_frame: unaligned", 0x204);859#endif860861// tmp <- *(SP)862ld(tmp, _abi(callers_sp), R1_SP);863// addr <- SP + offset;864// *(addr) <- tmp;865// SP <- addr866stdux(tmp, R1_SP, offset);867}868869void MacroAssembler::resize_frame(int offset, Register tmp) {870assert(is_simm(offset, 16), "too big an offset");871assert_different_registers(tmp, R1_SP);872assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");873// tmp <- *(SP)874ld(tmp, _abi(callers_sp), R1_SP);875// addr <- SP + offset;876// *(addr) <- tmp;877// SP <- addr878stdu(tmp, offset, R1_SP);879}880881void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {882// (addr == tmp1) || (addr == tmp2) is allowed here!883assert(tmp1 != tmp2, "must be distinct");884885// compute offset w.r.t. current stack pointer886// tmp_1 <- addr - SP (!)887subf(tmp1, R1_SP, addr);888889// atomically update SP keeping back link.890resize_frame(tmp1/* offset */, tmp2/* tmp */);891}892893void MacroAssembler::push_frame(Register bytes, Register tmp) {894#ifdef ASSERT895assert(bytes != R0, "r0 not allowed here");896andi_(R0, bytes, frame::alignment_in_bytes-1);897asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);898#endif899neg(tmp, bytes);900stdux(R1_SP, R1_SP, tmp);901}902903// Push a frame of size `bytes'.904void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {905long offset = align_addr(bytes, frame::alignment_in_bytes);906if (is_simm(-offset, 16)) {907stdu(R1_SP, -offset, R1_SP);908} else {909load_const(tmp, -offset);910stdux(R1_SP, R1_SP, tmp);911}912}913914// Push a frame of size `bytes' plus abi_reg_args on top.915void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {916push_frame(bytes + frame::abi_reg_args_size, tmp);917}918919// Setup up a new C frame with a spill area for non-volatile GPRs and920// additional space for local variables.921void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,922Register tmp) {923push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);924}925926// Pop current C frame.927void MacroAssembler::pop_frame() {928ld(R1_SP, _abi(callers_sp), R1_SP);929}930931#if defined(ABI_ELFv2)932address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {933// TODO(asmundak): make sure the caller uses R12 as function descriptor934// most of the times.935if (R12 != r_function_entry) {936mr(R12, r_function_entry);937}938mtctr(R12);939// Do a call or a branch.940if (and_link) {941bctrl();942} else {943bctr();944}945_last_calls_return_pc = pc();946947return _last_calls_return_pc;948}949950// Call a C function via a function descriptor and use full C951// calling conventions. Updates and returns _last_calls_return_pc.952address MacroAssembler::call_c(Register r_function_entry) {953return branch_to(r_function_entry, /*and_link=*/true);954}955956// For tail calls: only branch, don't link, so callee returns to caller of this function.957address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {958return branch_to(r_function_entry, /*and_link=*/false);959}960961address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {962load_const(R12, function_entry, R0);963return branch_to(R12, /*and_link=*/true);964}965966#else967// Generic version of a call to C function via a function descriptor968// with variable support for C calling conventions (TOC, ENV, etc.).969// Updates and returns _last_calls_return_pc.970address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,971bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {972// we emit standard ptrgl glue code here973assert((function_descriptor != R0), "function_descriptor cannot be R0");974975// retrieve necessary entries from the function descriptor976ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);977mtctr(R0);978979if (load_toc_of_callee) {980ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);981}982if (load_env_of_callee) {983ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);984} else if (load_toc_of_callee) {985li(R11, 0);986}987988// do a call or a branch989if (and_link) {990bctrl();991} else {992bctr();993}994_last_calls_return_pc = pc();995996return _last_calls_return_pc;997}998999// Call a C function via a function descriptor and use full C calling1000// conventions.1001// We don't use the TOC in generated code, so there is no need to save1002// and restore its value.1003address MacroAssembler::call_c(Register fd) {1004return branch_to(fd, /*and_link=*/true,1005/*save toc=*/false,1006/*restore toc=*/false,1007/*load toc=*/true,1008/*load env=*/true);1009}10101011address MacroAssembler::call_c_and_return_to_caller(Register fd) {1012return branch_to(fd, /*and_link=*/false,1013/*save toc=*/false,1014/*restore toc=*/false,1015/*load toc=*/true,1016/*load env=*/true);1017}10181019address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {1020if (rt != relocInfo::none) {1021// this call needs to be relocatable1022if (!ReoptimizeCallSequences1023|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)1024|| fd == NULL // support code-size estimation1025|| !fd->is_friend_function()1026|| fd->entry() == NULL) {1027// it's not a friend function as defined by class FunctionDescriptor,1028// so do a full call-c here.1029load_const(R11, (address)fd, R0);10301031bool has_env = (fd != NULL && fd->env() != NULL);1032return branch_to(R11, /*and_link=*/true,1033/*save toc=*/false,1034/*restore toc=*/false,1035/*load toc=*/true,1036/*load env=*/has_env);1037} else {1038// It's a friend function. Load the entry point and don't care about1039// toc and env. Use an optimizable call instruction, but ensure the1040// same code-size as in the case of a non-friend function.1041nop();1042nop();1043nop();1044bl64_patchable(fd->entry(), rt);1045_last_calls_return_pc = pc();1046return _last_calls_return_pc;1047}1048} else {1049// This call does not need to be relocatable, do more aggressive1050// optimizations.1051if (!ReoptimizeCallSequences1052|| !fd->is_friend_function()) {1053// It's not a friend function as defined by class FunctionDescriptor,1054// so do a full call-c here.1055load_const(R11, (address)fd, R0);1056return branch_to(R11, /*and_link=*/true,1057/*save toc=*/false,1058/*restore toc=*/false,1059/*load toc=*/true,1060/*load env=*/true);1061} else {1062// it's a friend function, load the entry point and don't care about1063// toc and env.1064address dest = fd->entry();1065if (is_within_range_of_b(dest, pc())) {1066bl(dest);1067} else {1068bl64_patchable(dest, rt);1069}1070_last_calls_return_pc = pc();1071return _last_calls_return_pc;1072}1073}1074}10751076// Call a C function. All constants needed reside in TOC.1077//1078// Read the address to call from the TOC.1079// Read env from TOC, if fd specifies an env.1080// Read new TOC from TOC.1081address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,1082relocInfo::relocType rt, Register toc) {1083if (!ReoptimizeCallSequences1084|| (rt != relocInfo::runtime_call_type && rt != relocInfo::none)1085|| !fd->is_friend_function()) {1086// It's not a friend function as defined by class FunctionDescriptor,1087// so do a full call-c here.1088assert(fd->entry() != NULL, "function must be linked");10891090AddressLiteral fd_entry(fd->entry());1091load_const_from_method_toc(R11, fd_entry, toc);1092mtctr(R11);1093if (fd->env() == NULL) {1094li(R11, 0);1095nop();1096} else {1097AddressLiteral fd_env(fd->env());1098load_const_from_method_toc(R11, fd_env, toc);1099}1100AddressLiteral fd_toc(fd->toc());1101load_toc_from_toc(R2_TOC, fd_toc, toc);1102// R2_TOC is killed.1103bctrl();1104_last_calls_return_pc = pc();1105} else {1106// It's a friend function, load the entry point and don't care about1107// toc and env. Use an optimizable call instruction, but ensure the1108// same code-size as in the case of a non-friend function.1109nop();1110bl64_patchable(fd->entry(), rt);1111_last_calls_return_pc = pc();1112}1113return _last_calls_return_pc;1114}1115#endif // ABI_ELFv211161117void MacroAssembler::call_VM_base(Register oop_result,1118Register last_java_sp,1119address entry_point,1120bool check_exceptions) {1121BLOCK_COMMENT("call_VM {");1122// Determine last_java_sp register.1123if (!last_java_sp->is_valid()) {1124last_java_sp = R1_SP;1125}1126set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);11271128// ARG1 must hold thread address.1129mr(R3_ARG1, R16_thread);1130#if defined(ABI_ELFv2)1131address return_pc = call_c(entry_point, relocInfo::none);1132#else1133address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);1134#endif11351136reset_last_Java_frame();11371138// Check for pending exceptions.1139if (check_exceptions) {1140// We don't check for exceptions here.1141ShouldNotReachHere();1142}11431144// Get oop result if there is one and reset the value in the thread.1145if (oop_result->is_valid()) {1146get_vm_result(oop_result);1147}11481149_last_calls_return_pc = return_pc;1150BLOCK_COMMENT("} call_VM");1151}11521153void MacroAssembler::call_VM_leaf_base(address entry_point) {1154BLOCK_COMMENT("call_VM_leaf {");1155#if defined(ABI_ELFv2)1156call_c(entry_point, relocInfo::none);1157#else1158call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);1159#endif1160BLOCK_COMMENT("} call_VM_leaf");1161}11621163void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {1164call_VM_base(oop_result, noreg, entry_point, check_exceptions);1165}11661167void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,1168bool check_exceptions) {1169// R3_ARG1 is reserved for the thread.1170mr_if_needed(R4_ARG2, arg_1);1171call_VM(oop_result, entry_point, check_exceptions);1172}11731174void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,1175bool check_exceptions) {1176// R3_ARG1 is reserved for the thread1177mr_if_needed(R4_ARG2, arg_1);1178assert(arg_2 != R4_ARG2, "smashed argument");1179mr_if_needed(R5_ARG3, arg_2);1180call_VM(oop_result, entry_point, check_exceptions);1181}11821183void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,1184bool check_exceptions) {1185// R3_ARG1 is reserved for the thread1186mr_if_needed(R4_ARG2, arg_1);1187assert(arg_2 != R4_ARG2, "smashed argument");1188mr_if_needed(R5_ARG3, arg_2);1189mr_if_needed(R6_ARG4, arg_3);1190call_VM(oop_result, entry_point, check_exceptions);1191}11921193void MacroAssembler::call_VM_leaf(address entry_point) {1194call_VM_leaf_base(entry_point);1195}11961197void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {1198mr_if_needed(R3_ARG1, arg_1);1199call_VM_leaf(entry_point);1200}12011202void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {1203mr_if_needed(R3_ARG1, arg_1);1204assert(arg_2 != R3_ARG1, "smashed argument");1205mr_if_needed(R4_ARG2, arg_2);1206call_VM_leaf(entry_point);1207}12081209void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {1210mr_if_needed(R3_ARG1, arg_1);1211assert(arg_2 != R3_ARG1, "smashed argument");1212mr_if_needed(R4_ARG2, arg_2);1213assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");1214mr_if_needed(R5_ARG3, arg_3);1215call_VM_leaf(entry_point);1216}12171218// Check whether instruction is a read access to the polling page1219// which was emitted by load_from_polling_page(..).1220bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,1221address* polling_address_ptr) {1222if (!is_ld(instruction))1223return false; // It's not a ld. Fail.12241225int rt = inv_rt_field(instruction);1226int ra = inv_ra_field(instruction);1227int ds = inv_ds_field(instruction);1228if (!(ds == 0 && ra != 0 && rt == 0)) {1229return false; // It's not a ld(r0, X, ra). Fail.1230}12311232if (!ucontext) {1233// Set polling address.1234if (polling_address_ptr != NULL) {1235*polling_address_ptr = NULL;1236}1237return true; // No ucontext given. Can't check value of ra. Assume true.1238}12391240#ifdef LINUX1241// Ucontext given. Check that register ra contains the address of1242// the safepoing polling page.1243ucontext_t* uc = (ucontext_t*) ucontext;1244// Set polling address.1245address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;1246if (polling_address_ptr != NULL) {1247*polling_address_ptr = addr;1248}1249return os::is_poll_address(addr);1250#else1251// Not on Linux, ucontext must be NULL.1252ShouldNotReachHere();1253return false;1254#endif1255}12561257bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {1258#ifdef LINUX1259ucontext_t* uc = (ucontext_t*) ucontext;12601261if (is_stwx(instruction) || is_stwux(instruction)) {1262int ra = inv_ra_field(instruction);1263int rb = inv_rb_field(instruction);12641265// look up content of ra and rb in ucontext1266address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];1267long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];1268return os::is_memory_serialize_page(thread, ra_val+rb_val);1269} else if (is_stw(instruction) || is_stwu(instruction)) {1270int ra = inv_ra_field(instruction);1271int d1 = inv_d1_field(instruction);12721273// look up content of ra in ucontext1274address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];1275return os::is_memory_serialize_page(thread, ra_val+d1);1276} else {1277return false;1278}1279#else1280// workaround not needed on !LINUX :-)1281ShouldNotCallThis();1282return false;1283#endif1284}12851286void MacroAssembler::bang_stack_with_offset(int offset) {1287// When increasing the stack, the old stack pointer will be written1288// to the new top of stack according to the PPC64 abi.1289// Therefore, stack banging is not necessary when increasing1290// the stack by <= os::vm_page_size() bytes.1291// When increasing the stack by a larger amount, this method is1292// called repeatedly to bang the intermediate pages.12931294// Stack grows down, caller passes positive offset.1295assert(offset > 0, "must bang with positive offset");12961297long stdoffset = -offset;12981299if (is_simm(stdoffset, 16)) {1300// Signed 16 bit offset, a simple std is ok.1301if (UseLoadInstructionsForStackBangingPPC64) {1302ld(R0, (int)(signed short)stdoffset, R1_SP);1303} else {1304std(R0,(int)(signed short)stdoffset, R1_SP);1305}1306} else if (is_simm(stdoffset, 31)) {1307const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);1308const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);13091310Register tmp = R11;1311addis(tmp, R1_SP, hi);1312if (UseLoadInstructionsForStackBangingPPC64) {1313ld(R0, lo, tmp);1314} else {1315std(R0, lo, tmp);1316}1317} else {1318ShouldNotReachHere();1319}1320}13211322// If instruction is a stack bang of the form1323// std R0, x(Ry), (see bang_stack_with_offset())1324// stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())1325// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())1326// return the banged address. Otherwise, return 0.1327address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {1328#ifdef LINUX1329ucontext_t* uc = (ucontext_t*) ucontext;1330int rs = inv_rs_field(instruction);1331int ra = inv_ra_field(instruction);1332if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)1333|| (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)1334|| (is_stdu(instruction) && rs == 1)) {1335int ds = inv_ds_field(instruction);1336// return banged address1337return ds+(address)uc->uc_mcontext.regs->gpr[ra];1338} else if (is_stdux(instruction) && rs == 1) {1339int rb = inv_rb_field(instruction);1340address sp = (address)uc->uc_mcontext.regs->gpr[1];1341long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];1342return ra != 1 || rb_val >= 0 ? NULL // not a stack bang1343: sp + rb_val; // banged address1344}1345return NULL; // not a stack bang1346#else1347// workaround not needed on !LINUX :-)1348ShouldNotCallThis();1349return NULL;1350#endif1351}13521353// CmpxchgX sets condition register to cmpX(current, compare).1354void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,1355Register compare_value, Register exchange_value,1356Register addr_base, int semantics, bool cmpxchgx_hint,1357Register int_flag_success, bool contention_hint) {1358Label retry;1359Label failed;1360Label done;13611362// Save one branch if result is returned via register and1363// result register is different from the other ones.1364bool use_result_reg = (int_flag_success != noreg);1365bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&1366int_flag_success != exchange_value && int_flag_success != addr_base);13671368// release/fence semantics1369if (semantics & MemBarRel) {1370release();1371}13721373if (use_result_reg && preset_result_reg) {1374li(int_flag_success, 0); // preset (assume cas failed)1375}13761377// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).1378if (contention_hint) { // Don't try to reserve if cmp fails.1379lwz(dest_current_value, 0, addr_base);1380cmpw(flag, dest_current_value, compare_value);1381bne(flag, failed);1382}13831384// atomic emulation loop1385bind(retry);13861387lwarx(dest_current_value, addr_base, cmpxchgx_hint);1388cmpw(flag, dest_current_value, compare_value);1389if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1390bne_predict_not_taken(flag, failed);1391} else {1392bne( flag, failed);1393}1394// branch to done => (flag == ne), (dest_current_value != compare_value)1395// fall through => (flag == eq), (dest_current_value == compare_value)13961397stwcx_(exchange_value, addr_base);1398if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1399bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.1400} else {1401bne( CCR0, retry); // StXcx_ sets CCR0.1402}1403// fall through => (flag == eq), (dest_current_value == compare_value), (swapped)14041405// Result in register (must do this at the end because int_flag_success can be the1406// same register as one above).1407if (use_result_reg) {1408li(int_flag_success, 1);1409}14101411if (semantics & MemBarFenceAfter) {1412fence();1413} else if (semantics & MemBarAcq) {1414isync();1415}14161417if (use_result_reg && !preset_result_reg) {1418b(done);1419}14201421bind(failed);1422if (use_result_reg && !preset_result_reg) {1423li(int_flag_success, 0);1424}14251426bind(done);1427// (flag == ne) => (dest_current_value != compare_value), (!swapped)1428// (flag == eq) => (dest_current_value == compare_value), ( swapped)1429}14301431// Preforms atomic compare exchange:1432// if (compare_value == *addr_base)1433// *addr_base = exchange_value1434// int_flag_success = 1;1435// else1436// int_flag_success = 0;1437//1438// ConditionRegister flag = cmp(compare_value, *addr_base)1439// Register dest_current_value = *addr_base1440// Register compare_value Used to compare with value in memory1441// Register exchange_value Written to memory if compare_value == *addr_base1442// Register addr_base The memory location to compareXChange1443// Register int_flag_success Set to 1 if exchange_value was written to *addr_base1444//1445// To avoid the costly compare exchange the value is tested beforehand.1446// Several special cases exist to avoid that unnecessary information is generated.1447//1448void MacroAssembler::cmpxchgd(ConditionRegister flag,1449Register dest_current_value, Register compare_value, Register exchange_value,1450Register addr_base, int semantics, bool cmpxchgx_hint,1451Register int_flag_success, Label* failed_ext, bool contention_hint) {1452Label retry;1453Label failed_int;1454Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;1455Label done;14561457// Save one branch if result is returned via register and result register is different from the other ones.1458bool use_result_reg = (int_flag_success!=noreg);1459bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value &&1460int_flag_success!=exchange_value && int_flag_success!=addr_base);1461assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");14621463// release/fence semantics1464if (semantics & MemBarRel) {1465release();1466}14671468if (use_result_reg && preset_result_reg) {1469li(int_flag_success, 0); // preset (assume cas failed)1470}14711472// Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).1473if (contention_hint) { // Don't try to reserve if cmp fails.1474ld(dest_current_value, 0, addr_base);1475cmpd(flag, dest_current_value, compare_value);1476bne(flag, failed);1477}14781479// atomic emulation loop1480bind(retry);14811482ldarx(dest_current_value, addr_base, cmpxchgx_hint);1483cmpd(flag, dest_current_value, compare_value);1484if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1485bne_predict_not_taken(flag, failed);1486} else {1487bne( flag, failed);1488}14891490stdcx_(exchange_value, addr_base);1491if (UseStaticBranchPredictionInCompareAndSwapPPC64) {1492bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR01493} else {1494bne( CCR0, retry); // stXcx_ sets CCR01495}14961497// result in register (must do this at the end because int_flag_success can be the same register as one above)1498if (use_result_reg) {1499li(int_flag_success, 1);1500}15011502// POWER6 doesn't need isync in CAS.1503// Always emit isync to be on the safe side.1504if (semantics & MemBarFenceAfter) {1505fence();1506} else if (semantics & MemBarAcq) {1507isync();1508}15091510if (use_result_reg && !preset_result_reg) {1511b(done);1512}15131514bind(failed_int);1515if (use_result_reg && !preset_result_reg) {1516li(int_flag_success, 0);1517}15181519bind(done);1520// (flag == ne) => (dest_current_value != compare_value), (!swapped)1521// (flag == eq) => (dest_current_value == compare_value), ( swapped)1522}15231524// Look up the method for a megamorphic invokeinterface call.1525// The target method is determined by <intf_klass, itable_index>.1526// The receiver klass is in recv_klass.1527// On success, the result will be in method_result, and execution falls through.1528// On failure, execution transfers to the given label.1529void MacroAssembler::lookup_interface_method(Register recv_klass,1530Register intf_klass,1531RegisterOrConstant itable_index,1532Register method_result,1533Register scan_temp,1534Register temp2,1535Label& L_no_such_interface,1536bool return_method) {1537assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);15381539// Compute start of first itableOffsetEntry (which is at the end of the vtable).1540int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;1541int itentry_off = itableMethodEntry::method_offset_in_bytes();1542int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);1543int scan_step = itableOffsetEntry::size() * wordSize;1544int log_vte_size= exact_log2(vtableEntry::size() * wordSize);15451546lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);1547// %%% We should store the aligned, prescaled offset in the klassoop.1548// Then the next several instructions would fold away.15491550sldi(scan_temp, scan_temp, log_vte_size);1551addi(scan_temp, scan_temp, vtable_base);1552add(scan_temp, recv_klass, scan_temp);15531554// Adjust recv_klass by scaled itable_index, so we can free itable_index.1555if (return_method) {1556if (itable_index.is_register()) {1557Register itable_offset = itable_index.as_register();1558sldi(method_result, itable_offset, logMEsize);1559if (itentry_off) { addi(method_result, method_result, itentry_off); }1560add(method_result, method_result, recv_klass);1561} else {1562long itable_offset = (long)itable_index.as_constant();1563// static address, no relocation1564load_const_optimized(temp2, (itable_offset << logMEsize) + itentry_off); // static address, no relocation1565add(method_result, temp2, recv_klass);1566}1567}15681569// for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {1570// if (scan->interface() == intf) {1571// result = (klass + scan->offset() + itable_index);1572// }1573// }1574Label search, found_method;15751576for (int peel = 1; peel >= 0; peel--) {1577// %%%% Could load both offset and interface in one ldx, if they were1578// in the opposite order. This would save a load.1579ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);15801581// Check that this entry is non-null. A null entry means that1582// the receiver class doesn't implement the interface, and wasn't the1583// same as when the caller was compiled.1584cmpd(CCR0, temp2, intf_klass);15851586if (peel) {1587beq(CCR0, found_method);1588} else {1589bne(CCR0, search);1590// (invert the test to fall through to found_method...)1591}15921593if (!peel) break;15941595bind(search);15961597cmpdi(CCR0, temp2, 0);1598beq(CCR0, L_no_such_interface);1599addi(scan_temp, scan_temp, scan_step);1600}16011602bind(found_method);16031604// Got a hit.1605if (return_method) {1606int ito_offset = itableOffsetEntry::offset_offset_in_bytes();1607lwz(scan_temp, ito_offset, scan_temp);1608ldx(method_result, scan_temp, method_result);1609}1610}16111612// virtual method calling1613void MacroAssembler::lookup_virtual_method(Register recv_klass,1614RegisterOrConstant vtable_index,1615Register method_result) {16161617assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());16181619const int base = InstanceKlass::vtable_start_offset() * wordSize;1620assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");16211622if (vtable_index.is_register()) {1623sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);1624add(recv_klass, vtable_index.as_register(), recv_klass);1625} else {1626addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);1627}1628ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);1629}16301631/////////////////////////////////////////// subtype checking ////////////////////////////////////////////16321633void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,1634Register super_klass,1635Register temp1_reg,1636Register temp2_reg,1637Label& L_success,1638Label& L_failure) {16391640const Register check_cache_offset = temp1_reg;1641const Register cached_super = temp2_reg;16421643assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);16441645int sco_offset = in_bytes(Klass::super_check_offset_offset());1646int sc_offset = in_bytes(Klass::secondary_super_cache_offset());16471648// If the pointers are equal, we are done (e.g., String[] elements).1649// This self-check enables sharing of secondary supertype arrays among1650// non-primary types such as array-of-interface. Otherwise, each such1651// type would need its own customized SSA.1652// We move this check to the front of the fast path because many1653// type checks are in fact trivially successful in this manner,1654// so we get a nicely predicted branch right at the start of the check.1655cmpd(CCR0, sub_klass, super_klass);1656beq(CCR0, L_success);16571658// Check the supertype display:1659lwz(check_cache_offset, sco_offset, super_klass);1660// The loaded value is the offset from KlassOopDesc.16611662ldx(cached_super, check_cache_offset, sub_klass);1663cmpd(CCR0, cached_super, super_klass);1664beq(CCR0, L_success);16651666// This check has worked decisively for primary supers.1667// Secondary supers are sought in the super_cache ('super_cache_addr').1668// (Secondary supers are interfaces and very deeply nested subtypes.)1669// This works in the same check above because of a tricky aliasing1670// between the super_cache and the primary super display elements.1671// (The 'super_check_addr' can address either, as the case requires.)1672// Note that the cache is updated below if it does not help us find1673// what we need immediately.1674// So if it was a primary super, we can just fail immediately.1675// Otherwise, it's the slow path for us (no success at this point).16761677cmpwi(CCR0, check_cache_offset, sc_offset);1678bne(CCR0, L_failure);1679// bind(slow_path); // fallthru1680}16811682void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,1683Register super_klass,1684Register temp1_reg,1685Register temp2_reg,1686Label* L_success,1687Register result_reg) {1688const Register array_ptr = temp1_reg; // current value from cache array1689const Register temp = temp2_reg;16901691assert_different_registers(sub_klass, super_klass, array_ptr, temp);16921693int source_offset = in_bytes(Klass::secondary_supers_offset());1694int target_offset = in_bytes(Klass::secondary_super_cache_offset());16951696int length_offset = Array<Klass*>::length_offset_in_bytes();1697int base_offset = Array<Klass*>::base_offset_in_bytes();16981699Label hit, loop, failure, fallthru;17001701ld(array_ptr, source_offset, sub_klass);17021703//assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");1704lwz(temp, length_offset, array_ptr);1705cmpwi(CCR0, temp, 0);1706beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 017071708mtctr(temp); // load ctr17091710bind(loop);1711// Oops in table are NO MORE compressed.1712ld(temp, base_offset, array_ptr);1713cmpd(CCR0, temp, super_klass);1714beq(CCR0, hit);1715addi(array_ptr, array_ptr, BytesPerWord);1716bdnz(loop);17171718bind(failure);1719if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)1720b(fallthru);17211722bind(hit);1723std(super_klass, target_offset, sub_klass); // save result to cache1724if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)1725if (L_success != NULL) b(*L_success);17261727bind(fallthru);1728}17291730// Try fast path, then go to slow one if not successful1731void MacroAssembler::check_klass_subtype(Register sub_klass,1732Register super_klass,1733Register temp1_reg,1734Register temp2_reg,1735Label& L_success) {1736Label L_failure;1737check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);1738check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);1739bind(L_failure); // Fallthru if not successful.1740}17411742void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,1743Register temp_reg,1744Label& wrong_method_type) {1745assert_different_registers(mtype_reg, mh_reg, temp_reg);1746// Compare method type against that of the receiver.1747load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);1748cmpd(CCR0, temp_reg, mtype_reg);1749bne(CCR0, wrong_method_type);1750}17511752RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,1753Register temp_reg,1754int extra_slot_offset) {1755// cf. TemplateTable::prepare_invoke(), if (load_receiver).1756int stackElementSize = Interpreter::stackElementSize;1757int offset = extra_slot_offset * stackElementSize;1758if (arg_slot.is_constant()) {1759offset += arg_slot.as_constant() * stackElementSize;1760return offset;1761} else {1762assert(temp_reg != noreg, "must specify");1763sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));1764if (offset != 0)1765addi(temp_reg, temp_reg, offset);1766return temp_reg;1767}1768}17691770void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,1771Register mark_reg, Register temp_reg,1772Register temp2_reg, Label& done, Label* slow_case) {1773assert(UseBiasedLocking, "why call this otherwise?");17741775#ifdef ASSERT1776assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);1777#endif17781779Label cas_label;17801781// Branch to done if fast path fails and no slow_case provided.1782Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;17831784// Biased locking1785// See whether the lock is currently biased toward our thread and1786// whether the epoch is still valid1787// Note that the runtime guarantees sufficient alignment of JavaThread1788// pointers to allow age to be placed into low bits1789assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,1790"biased locking makes assumptions about bit layout");17911792if (PrintBiasedLockingStatistics) {1793load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);1794lwz(temp2_reg, 0, temp_reg);1795addi(temp2_reg, temp2_reg, 1);1796stw(temp2_reg, 0, temp_reg);1797}17981799andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);1800cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);1801bne(cr_reg, cas_label);18021803load_klass(temp_reg, obj_reg);18041805load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));1806ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);1807orr(temp_reg, R16_thread, temp_reg);1808xorr(temp_reg, mark_reg, temp_reg);1809andr(temp_reg, temp_reg, temp2_reg);1810cmpdi(cr_reg, temp_reg, 0);1811if (PrintBiasedLockingStatistics) {1812Label l;1813bne(cr_reg, l);1814load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());1815lwz(temp2_reg, 0, mark_reg);1816addi(temp2_reg, temp2_reg, 1);1817stw(temp2_reg, 0, mark_reg);1818// restore mark_reg1819ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);1820bind(l);1821}1822beq(cr_reg, done);18231824Label try_revoke_bias;1825Label try_rebias;18261827// At this point we know that the header has the bias pattern and1828// that we are not the bias owner in the current epoch. We need to1829// figure out more details about the state of the header in order to1830// know what operations can be legally performed on the object's1831// header.18321833// If the low three bits in the xor result aren't clear, that means1834// the prototype header is no longer biased and we have to revoke1835// the bias on this object.1836andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);1837cmpwi(cr_reg, temp2_reg, 0);1838bne(cr_reg, try_revoke_bias);18391840// Biasing is still enabled for this data type. See whether the1841// epoch of the current bias is still valid, meaning that the epoch1842// bits of the mark word are equal to the epoch bits of the1843// prototype header. (Note that the prototype header's epoch bits1844// only change at a safepoint.) If not, attempt to rebias the object1845// toward the current thread. Note that we must be absolutely sure1846// that the current epoch is invalid in order to do this because1847// otherwise the manipulations it performs on the mark word are1848// illegal.18491850int shift_amount = 64 - markOopDesc::epoch_shift;1851// rotate epoch bits to right (little) end and set other bits to 01852// [ big part | epoch | little part ] -> [ 0..0 | epoch ]1853rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);1854// branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented1855bne(CCR0, try_rebias);18561857// The epoch of the current bias is still valid but we know nothing1858// about the owner; it might be set or it might be clear. Try to1859// acquire the bias of the object using an atomic operation. If this1860// fails we will go in to the runtime to revoke the object's bias.1861// Note that we first construct the presumed unbiased header so we1862// don't accidentally blow away another thread's valid bias.1863andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |1864markOopDesc::age_mask_in_place |1865markOopDesc::epoch_mask_in_place));1866orr(temp_reg, R16_thread, mark_reg);18671868assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");18691870// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).1871fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?1872cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,1873/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,1874/*where=*/obj_reg,1875MacroAssembler::MemBarAcq,1876MacroAssembler::cmpxchgx_hint_acquire_lock(),1877noreg, slow_case_int); // bail out if failed18781879// If the biasing toward our thread failed, this means that1880// another thread succeeded in biasing it toward itself and we1881// need to revoke that bias. The revocation will occur in the1882// interpreter runtime in the slow case.1883if (PrintBiasedLockingStatistics) {1884load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);1885lwz(temp2_reg, 0, temp_reg);1886addi(temp2_reg, temp2_reg, 1);1887stw(temp2_reg, 0, temp_reg);1888}1889b(done);18901891bind(try_rebias);1892// At this point we know the epoch has expired, meaning that the1893// current "bias owner", if any, is actually invalid. Under these1894// circumstances _only_, we are allowed to use the current header's1895// value as the comparison value when doing the cas to acquire the1896// bias in the current epoch. In other words, we allow transfer of1897// the bias from one thread to another directly in this situation.1898andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);1899orr(temp_reg, R16_thread, temp_reg);1900load_klass(temp2_reg, obj_reg);1901ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);1902orr(temp_reg, temp_reg, temp2_reg);19031904assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");19051906// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).1907fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?1908cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,1909/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,1910/*where=*/obj_reg,1911MacroAssembler::MemBarAcq,1912MacroAssembler::cmpxchgx_hint_acquire_lock(),1913noreg, slow_case_int); // bail out if failed19141915// If the biasing toward our thread failed, this means that1916// another thread succeeded in biasing it toward itself and we1917// need to revoke that bias. The revocation will occur in the1918// interpreter runtime in the slow case.1919if (PrintBiasedLockingStatistics) {1920load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);1921lwz(temp2_reg, 0, temp_reg);1922addi(temp2_reg, temp2_reg, 1);1923stw(temp2_reg, 0, temp_reg);1924}1925b(done);19261927bind(try_revoke_bias);1928// The prototype mark in the klass doesn't have the bias bit set any1929// more, indicating that objects of this data type are not supposed1930// to be biased any more. We are going to try to reset the mark of1931// this object to the prototype value and fall through to the1932// CAS-based locking scheme. Note that if our CAS fails, it means1933// that another thread raced us for the privilege of revoking the1934// bias of this particular object, so it's okay to continue in the1935// normal locking code.1936load_klass(temp_reg, obj_reg);1937ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);1938andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);1939orr(temp_reg, temp_reg, temp2_reg);19401941assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");19421943// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).1944fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?1945cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,1946/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,1947/*where=*/obj_reg,1948MacroAssembler::MemBarAcq,1949MacroAssembler::cmpxchgx_hint_acquire_lock());19501951// reload markOop in mark_reg before continuing with lightweight locking1952ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);19531954// Fall through to the normal CAS-based lock, because no matter what1955// the result of the above CAS, some thread must have succeeded in1956// removing the bias bit from the object's header.1957if (PrintBiasedLockingStatistics) {1958Label l;1959bne(cr_reg, l);1960load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);1961lwz(temp2_reg, 0, temp_reg);1962addi(temp2_reg, temp2_reg, 1);1963stw(temp2_reg, 0, temp_reg);1964bind(l);1965}19661967bind(cas_label);1968}19691970void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {1971// Check for biased locking unlock case, which is a no-op1972// Note: we do not have to check the thread ID for two reasons.1973// First, the interpreter checks for IllegalMonitorStateException at1974// a higher level. Second, if the bias was revoked while we held the1975// lock, the object could not be rebiased toward another thread, so1976// the bias bit would be clear.19771978ld(temp_reg, 0, mark_addr);1979andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);19801981cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);1982beq(cr_reg, done);1983}19841985// "The box" is the space on the stack where we copy the object mark.1986void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,1987Register temp, Register displaced_header, Register current_header) {1988assert_different_registers(oop, box, temp, displaced_header, current_header);1989assert(flag != CCR0, "bad condition register");1990Label cont;1991Label object_has_monitor;1992Label cas_failed;19931994// Load markOop from object into displaced_header.1995ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);199619971998// Always do locking in runtime.1999if (EmitSync & 0x01) {2000cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.2001return;2002}20032004if (UseBiasedLocking) {2005biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);2006}20072008// Handle existing monitor.2009if ((EmitSync & 0x02) == 0) {2010// The object has an existing monitor iff (mark & monitor_value) != 0.2011andi_(temp, displaced_header, markOopDesc::monitor_value);2012bne(CCR0, object_has_monitor);2013}20142015// Set displaced_header to be (markOop of object | UNLOCK_VALUE).2016ori(displaced_header, displaced_header, markOopDesc::unlocked_value);20172018// Load Compare Value application register.20192020// Initialize the box. (Must happen before we update the object mark!)2021std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);20222023// Must fence, otherwise, preceding store(s) may float below cmpxchg.2024// Compare object markOop with mark and if equal exchange scratch1 with object markOop.2025// CmpxchgX sets cr_reg to cmpX(current, displaced).2026membar(Assembler::StoreStore);2027cmpxchgd(/*flag=*/flag,2028/*current_value=*/current_header,2029/*compare_value=*/displaced_header,2030/*exchange_value=*/box,2031/*where=*/oop,2032MacroAssembler::MemBarAcq,2033MacroAssembler::cmpxchgx_hint_acquire_lock(),2034noreg,2035&cas_failed);2036assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");20372038// If the compare-and-exchange succeeded, then we found an unlocked2039// object and we have now locked it.2040b(cont);20412042bind(cas_failed);2043// We did not see an unlocked object so try the fast recursive case.20442045// Check if the owner is self by comparing the value in the markOop of object2046// (current_header) with the stack pointer.2047sub(current_header, current_header, R1_SP);2048load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);20492050and_(R0/*==0?*/, current_header, temp);2051// If condition is true we are cont and hence we can store 0 as the2052// displaced header in the box, which indicates that it is a recursive lock.2053mcrf(flag,CCR0);2054std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);20552056// Handle existing monitor.2057if ((EmitSync & 0x02) == 0) {2058b(cont);20592060bind(object_has_monitor);2061// The object's monitor m is unlocked iff m->owner == NULL,2062// otherwise m->owner may contain a thread or a stack address.2063//2064// Try to CAS m->owner from NULL to current thread.2065addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);2066li(displaced_header, 0);2067// CmpxchgX sets flag to cmpX(current, displaced).2068cmpxchgd(/*flag=*/flag,2069/*current_value=*/current_header,2070/*compare_value=*/displaced_header,2071/*exchange_value=*/R16_thread,2072/*where=*/temp,2073MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,2074MacroAssembler::cmpxchgx_hint_acquire_lock());20752076// Store a non-null value into the box.2077std(box, BasicLock::displaced_header_offset_in_bytes(), box);20782079# ifdef ASSERT2080bne(flag, cont);2081// We have acquired the monitor, check some invariants.2082addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());2083// Invariant 1: _recursions should be 0.2084//assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");2085asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,2086"monitor->_recursions should be 0", -1);2087// Invariant 2: OwnerIsThread shouldn't be 0.2088//assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");2089//asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,2090// "monitor->OwnerIsThread shouldn't be 0", -1);2091# endif2092}20932094bind(cont);2095// flag == EQ indicates success2096// flag == NE indicates failure2097}20982099void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,2100Register temp, Register displaced_header, Register current_header) {2101assert_different_registers(oop, box, temp, displaced_header, current_header);2102assert(flag != CCR0, "bad condition register");2103Label cont;2104Label object_has_monitor;21052106// Always do locking in runtime.2107if (EmitSync & 0x01) {2108cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.2109return;2110}21112112if (UseBiasedLocking) {2113biased_locking_exit(flag, oop, current_header, cont);2114}21152116// Find the lock address and load the displaced header from the stack.2117ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);21182119// If the displaced header is 0, we have a recursive unlock.2120cmpdi(flag, displaced_header, 0);2121beq(flag, cont);21222123// Handle existing monitor.2124if ((EmitSync & 0x02) == 0) {2125// The object has an existing monitor iff (mark & monitor_value) != 0.2126ld(current_header, oopDesc::mark_offset_in_bytes(), oop);2127andi(temp, current_header, markOopDesc::monitor_value);2128cmpdi(flag, temp, 0);2129bne(flag, object_has_monitor);2130}213121322133// Check if it is still a light weight lock, this is is true if we see2134// the stack address of the basicLock in the markOop of the object.2135// Cmpxchg sets flag to cmpd(current_header, box).2136cmpxchgd(/*flag=*/flag,2137/*current_value=*/current_header,2138/*compare_value=*/box,2139/*exchange_value=*/displaced_header,2140/*where=*/oop,2141MacroAssembler::MemBarRel,2142MacroAssembler::cmpxchgx_hint_release_lock(),2143noreg,2144&cont);21452146assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");21472148// Handle existing monitor.2149if ((EmitSync & 0x02) == 0) {2150b(cont);21512152bind(object_has_monitor);2153addi(current_header, current_header, -markOopDesc::monitor_value); // monitor2154ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);2155ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);2156xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.2157orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.2158cmpdi(flag, temp, 0);2159bne(flag, cont);21602161ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);2162ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);2163orr(temp, temp, displaced_header); // Will be 0 if both are 0.2164cmpdi(flag, temp, 0);2165bne(flag, cont);2166release();2167std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);2168}21692170bind(cont);2171// flag == EQ indicates success2172// flag == NE indicates failure2173}21742175// Write serialization page so VM thread can do a pseudo remote membar.2176// We use the current thread pointer to calculate a thread specific2177// offset to write to within the page. This minimizes bus traffic2178// due to cache line collision.2179void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {2180srdi(tmp2, thread, os::get_serialize_page_shift_count());21812182int mask = os::vm_page_size() - sizeof(int);2183if (Assembler::is_simm(mask, 16)) {2184andi(tmp2, tmp2, mask);2185} else {2186lis(tmp1, (int)((signed short) (mask >> 16)));2187ori(tmp1, tmp1, mask & 0x0000ffff);2188andr(tmp2, tmp2, tmp1);2189}21902191load_const(tmp1, (long) os::get_memory_serialize_page());2192release();2193stwx(R0, tmp1, tmp2);2194}219521962197// GC barrier helper macros21982199// Write the card table byte if needed.2200void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {2201CardTableModRefBS* bs = (CardTableModRefBS*) Universe::heap()->barrier_set();2202assert(bs->kind() == BarrierSet::CardTableModRef ||2203bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");2204#ifdef ASSERT2205cmpdi(CCR0, Rnew_val, 0);2206asm_assert_ne("null oop not allowed", 0x321);2207#endif2208card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);2209}22102211// Write the card table byte.2212void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {2213assert_different_registers(Robj, Rtmp, R0);2214load_const_optimized(Rtmp, (address)byte_map_base, R0);2215srdi(Robj, Robj, CardTableModRefBS::card_shift);2216li(R0, 0); // dirty2217if (UseConcMarkSweepGC) membar(Assembler::StoreStore);2218stbx(R0, Rtmp, Robj);2219}22202221// Kills R31 if value is a volatile register.2222void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {2223Label done;2224cmpdi(CCR0, value, 0);2225beq(CCR0, done); // Use NULL as-is.22262227clrrdi(tmp1, value, JNIHandles::weak_tag_size);2228#if INCLUDE_ALL_GCS2229if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }2230#endif2231ld(value, 0, tmp1); // Resolve (untagged) jobject.22322233#if INCLUDE_ALL_GCS2234if (UseG1GC) {2235Label not_weak;2236beq(CCR0, not_weak); // Test for jweak tag.2237verify_oop(value);2238g1_write_barrier_pre(noreg, // obj2239noreg, // offset2240value, // pre_val2241tmp1, tmp2, needs_frame);2242bind(not_weak);2243}2244#endif // INCLUDE_ALL_GCS2245verify_oop(value);2246bind(done);2247}22482249#if INCLUDE_ALL_GCS2250// General G1 pre-barrier generator.2251// Goal: record the previous value if it is not null.2252void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,2253Register Rtmp1, Register Rtmp2, bool needs_frame) {2254Label runtime, filtered;22552256// Is marking active?2257if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {2258lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);2259} else {2260guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");2261lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);2262}2263cmpdi(CCR0, Rtmp1, 0);2264beq(CCR0, filtered);22652266// Do we need to load the previous value?2267if (Robj != noreg) {2268// Load the previous value...2269if (UseCompressedOops) {2270lwz(Rpre_val, offset, Robj);2271} else {2272ld(Rpre_val, offset, Robj);2273}2274// Previous value has been loaded into Rpre_val.2275}2276assert(Rpre_val != noreg, "must have a real register");22772278// Is the previous value null?2279cmpdi(CCR0, Rpre_val, 0);2280beq(CCR0, filtered);22812282if (Robj != noreg && UseCompressedOops) {2283decode_heap_oop_not_null(Rpre_val);2284}22852286// OK, it's not filtered, so we'll need to call enqueue. In the normal2287// case, pre_val will be a scratch G-reg, but there are some cases in2288// which it's an O-reg. In the first case, do a normal call. In the2289// latter, do a save here and call the frameless version.22902291// Can we store original value in the thread's buffer?2292// Is index == 0?2293// (The index field is typed as size_t.)2294const Register Rbuffer = Rtmp1, Rindex = Rtmp2;22952296ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);2297cmpdi(CCR0, Rindex, 0);2298beq(CCR0, runtime); // If index == 0, goto runtime.2299ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);23002301addi(Rindex, Rindex, -wordSize); // Decrement index.2302std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);23032304// Record the previous value.2305stdx(Rpre_val, Rbuffer, Rindex);2306b(filtered);23072308bind(runtime);23092310// May need to preserve LR. Also needed if current frame is not compatible with C calling convention.2311if (needs_frame) {2312save_LR_CR(Rtmp1);2313push_frame_reg_args(0, Rtmp2);2314}23152316if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.2317call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);2318if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore23192320if (needs_frame) {2321pop_frame();2322restore_LR_CR(Rtmp1);2323}23242325bind(filtered);2326}23272328// General G1 post-barrier generator2329// Store cross-region card.2330void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {2331Label runtime, filtered_int;2332Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;2333assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);23342335G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();2336assert(bs->kind() == BarrierSet::G1SATBCT ||2337bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");23382339// Does store cross heap regions?2340if (G1RSBarrierRegionFilter) {2341xorr(Rtmp1, Rstore_addr, Rnew_val);2342srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);2343beq(CCR0, filtered);2344}23452346// Crosses regions, storing NULL?2347#ifdef ASSERT2348cmpdi(CCR0, Rnew_val, 0);2349asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:2350//beq(CCR0, filtered);2351#endif23522353// Storing region crossing non-NULL, is card already dirty?2354assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");2355const Register Rcard_addr = Rtmp1;2356Register Rbase = Rtmp2;2357load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);23582359srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);23602361// Get the address of the card.2362lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);2363cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());2364beq(CCR0, filtered);23652366membar(Assembler::StoreLoad);2367lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar.2368cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());2369beq(CCR0, filtered);23702371// Storing a region crossing, non-NULL oop, card is clean.2372// Dirty card and log.2373li(Rtmp3, CardTableModRefBS::dirty_card_val());2374//release(); // G1: oops are allowed to get visible after dirty marking.2375stbx(Rtmp3, Rbase, Rcard_addr);23762377add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.2378Rbase = noreg; // end of lifetime23792380const Register Rqueue_index = Rtmp2,2381Rqueue_buf = Rtmp3;2382ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);2383cmpdi(CCR0, Rqueue_index, 0);2384beq(CCR0, runtime); // index == 0 then jump to runtime2385ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);23862387addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index2388std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);23892390stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card2391b(filtered);23922393bind(runtime);23942395// Save the live input values.2396call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);23972398bind(filtered_int);2399}2400#endif // INCLUDE_ALL_GCS24012402// Values for last_Java_pc, and last_Java_sp must comply to the rules2403// in frame_ppc.hpp.2404void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {2405// Always set last_Java_pc and flags first because once last_Java_sp2406// is visible has_last_Java_frame is true and users will look at the2407// rest of the fields. (Note: flags should always be zero before we2408// get here so doesn't need to be set.)24092410// Verify that last_Java_pc was zeroed on return to Java2411asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,2412"last_Java_pc not zeroed before leaving Java", 0x200);24132414// When returning from calling out from Java mode the frame anchor's2415// last_Java_pc will always be set to NULL. It is set here so that2416// if we are doing a call to native (not VM) that we capture the2417// known pc and don't have to rely on the native call having a2418// standard frame linkage where we can find the pc.2419if (last_Java_pc != noreg)2420std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);24212422// Set last_Java_sp last.2423std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);2424}24252426void MacroAssembler::reset_last_Java_frame(void) {2427asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),2428R16_thread, "SP was not set, still zero", 0x202);24292430BLOCK_COMMENT("reset_last_Java_frame {");2431li(R0, 0);24322433// _last_Java_sp = 02434std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);24352436// _last_Java_pc = 02437std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);2438BLOCK_COMMENT("} reset_last_Java_frame");2439}24402441void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {2442assert_different_registers(sp, tmp1);24432444// sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via2445// TOP_IJAVA_FRAME_ABI.2446// FIXME: assert that we really have a TOP_IJAVA_FRAME here!2447#ifdef CC_INTERP2448ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);2449#else2450address entry = pc();2451load_const_optimized(tmp1, entry);2452#endif24532454set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);2455}24562457void MacroAssembler::get_vm_result(Register oop_result) {2458// Read:2459// R16_thread2460// R16_thread->in_bytes(JavaThread::vm_result_offset())2461//2462// Updated:2463// oop_result2464// R16_thread->in_bytes(JavaThread::vm_result_offset())24652466ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);2467li(R0, 0);2468std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);24692470verify_oop(oop_result);2471}24722473void MacroAssembler::get_vm_result_2(Register metadata_result) {2474// Read:2475// R16_thread2476// R16_thread->in_bytes(JavaThread::vm_result_2_offset())2477//2478// Updated:2479// metadata_result2480// R16_thread->in_bytes(JavaThread::vm_result_2_offset())24812482ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);2483li(R0, 0);2484std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);2485}248624872488void MacroAssembler::encode_klass_not_null(Register dst, Register src) {2489Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.2490if (Universe::narrow_klass_base() != 0) {2491// Use dst as temp if it is free.2492load_const(R0, Universe::narrow_klass_base(), (dst != current && dst != R0) ? dst : noreg);2493sub(dst, current, R0);2494current = dst;2495}2496if (Universe::narrow_klass_shift() != 0) {2497srdi(dst, current, Universe::narrow_klass_shift());2498current = dst;2499}2500mr_if_needed(dst, current); // Move may be required.2501}25022503void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {2504if (UseCompressedClassPointers) {2505encode_klass_not_null(ck, klass);2506stw(ck, oopDesc::klass_offset_in_bytes(), dst_oop);2507} else {2508std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);2509}2510}25112512void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {2513if (UseCompressedClassPointers) {2514if (val == noreg) {2515val = R0;2516li(val, 0);2517}2518stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed2519}2520}25212522int MacroAssembler::instr_size_for_decode_klass_not_null() {2523if (!UseCompressedClassPointers) return 0;2524int num_instrs = 1; // shift or move2525if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add2526return num_instrs * BytesPerInstWord;2527}25282529void MacroAssembler::decode_klass_not_null(Register dst, Register src) {2530assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");2531if (src == noreg) src = dst;2532Register shifted_src = src;2533if (Universe::narrow_klass_shift() != 0 ||2534Universe::narrow_klass_base() == 0 && src != dst) { // Move required.2535shifted_src = dst;2536sldi(shifted_src, src, Universe::narrow_klass_shift());2537}2538if (Universe::narrow_klass_base() != 0) {2539load_const(R0, Universe::narrow_klass_base());2540add(dst, shifted_src, R0);2541}2542}25432544void MacroAssembler::load_klass(Register dst, Register src) {2545if (UseCompressedClassPointers) {2546lwz(dst, oopDesc::klass_offset_in_bytes(), src);2547// Attention: no null check here!2548decode_klass_not_null(dst, dst);2549} else {2550ld(dst, oopDesc::klass_offset_in_bytes(), src);2551}2552}25532554void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {2555if (!os::zero_page_read_protected()) {2556if (TrapBasedNullChecks) {2557trap_null_check(src);2558}2559}2560load_klass(dst, src);2561}25622563void MacroAssembler::reinit_heapbase(Register d, Register tmp) {2564if (Universe::heap() != NULL) {2565load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);2566} else {2567// Heap not yet allocated. Load indirectly.2568int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);2569ld(R30, simm16_offset, R30);2570}2571}25722573// Clear Array2574// Kills both input registers. tmp == R0 is allowed.2575void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {2576// Procedure for large arrays (uses data cache block zero instruction).2577Label startloop, fast, fastloop, small_rest, restloop, done;2578const int cl_size = VM_Version::get_cache_line_size(),2579cl_dwords = cl_size>>3,2580cl_dw_addr_bits = exact_log2(cl_dwords),2581dcbz_min = 1; // Min count of dcbz executions, needs to be >0.25822583//2:2584cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).2585blt(CCR1, small_rest); // Too small.2586rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.2587beq(CCR0, fast); // Already 128byte aligned.25882589subfic(tmp, tmp, cl_dwords);2590mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).2591subf(cnt_dwords, tmp, cnt_dwords); // rest.2592li(tmp, 0);2593//10:2594bind(startloop); // Clear at the beginning to reach 128byte boundary.2595std(tmp, 0, base_ptr); // Clear 8byte aligned block.2596addi(base_ptr, base_ptr, 8);2597bdnz(startloop);2598//13:2599bind(fast); // Clear 128byte blocks.2600srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).2601andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.2602mtctr(tmp); // Load counter.2603//16:2604bind(fastloop);2605dcbz(base_ptr); // Clear 128byte aligned block.2606addi(base_ptr, base_ptr, cl_size);2607bdnz(fastloop);2608if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }2609//20:2610bind(small_rest);2611cmpdi(CCR0, cnt_dwords, 0); // size 0?2612beq(CCR0, done); // rest == 02613li(tmp, 0);2614mtctr(cnt_dwords); // Load counter.2615//24:2616bind(restloop); // Clear rest.2617std(tmp, 0, base_ptr); // Clear 8byte aligned block.2618addi(base_ptr, base_ptr, 8);2619bdnz(restloop);2620//27:2621bind(done);2622}26232624/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////26252626// Search for a single jchar in an jchar[].2627//2628// Assumes that result differs from all other registers.2629//2630// Haystack, needle are the addresses of jchar-arrays.2631// NeedleChar is needle[0] if it is known at compile time.2632// Haycnt is the length of the haystack. We assume haycnt >=1.2633//2634// Preserves haystack, haycnt, kills all other registers.2635//2636// If needle == R0, we search for the constant needleChar.2637void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,2638Register needle, jchar needleChar,2639Register tmp1, Register tmp2) {26402641assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);26422643Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;2644Register needle0 = needle, // Contains needle[0].2645addr = tmp1,2646ch1 = tmp2,2647ch2 = R0;26482649//2 (variable) or 3 (const):2650if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.2651dcbtct(haystack, 0x00); // Indicate R/O access to haystack.26522653srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).2654mr(addr, haystack);2655beq(CCR0, L_FinalCheck);2656mtctr(tmp2); // Move to count register.2657//8:2658bind(L_InnerLoop); // Main work horse (2x unrolled search loop).2659lhz(ch1, 0, addr); // Load characters from haystack.2660lhz(ch2, 2, addr);2661(needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);2662(needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);2663beq(CCR0, L_Found1); // Did we find the needle?2664beq(CCR1, L_Found2);2665addi(addr, addr, 4);2666bdnz(L_InnerLoop);2667//16:2668bind(L_FinalCheck);2669andi_(R0, haycnt, 1);2670beq(CCR0, L_NotFound);2671lhz(ch1, 0, addr); // One position left at which we have to compare.2672(needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);2673beq(CCR1, L_Found3);2674//21:2675bind(L_NotFound);2676li(result, -1); // Not found.2677b(L_End);26782679bind(L_Found2);2680addi(addr, addr, 2);2681//24:2682bind(L_Found1);2683bind(L_Found3); // Return index ...2684subf(addr, haystack, addr); // relative to haystack,2685srdi(result, addr, 1); // in characters.2686bind(L_End);2687}268826892690// Implementation of IndexOf for jchar arrays.2691//2692// The length of haystack and needle are not constant, i.e. passed in a register.2693//2694// Preserves registers haystack, needle.2695// Kills registers haycnt, needlecnt.2696// Assumes that result differs from all other registers.2697// Haystack, needle are the addresses of jchar-arrays.2698// Haycnt, needlecnt are the lengths of them, respectively.2699//2700// Needlecntval must be zero or 15-bit unsigned immediate and > 1.2701void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,2702Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,2703Register tmp1, Register tmp2, Register tmp3, Register tmp4) {27042705// Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!2706Label L_TooShort, L_Found, L_NotFound, L_End;2707Register last_addr = haycnt, // Kill haycnt at the beginning.2708addr = tmp1,2709n_start = tmp2,2710ch1 = tmp3,2711ch2 = R0;27122713// **************************************************************************************************2714// Prepare for main loop: optimized for needle count >=2, bail out otherwise.2715// **************************************************************************************************27162717//1 (variable) or 3 (const):2718dcbtct(needle, 0x00); // Indicate R/O access to str1.2719dcbtct(haystack, 0x00); // Indicate R/O access to str2.27202721// Compute last haystack addr to use if no match gets found.2722if (needlecntval == 0) { // variable needlecnt2723//3:2724subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt.2725addi(addr, haystack, -2); // Accesses use pre-increment.2726cmpwi(CCR6, needlecnt, 2);2727blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately.2728slwi(ch1, ch1, 1); // Scale to number of bytes.2729lwz(n_start, 0, needle); // Load first 2 characters of needle.2730add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).2731addi(needlecnt, needlecnt, -2); // Rest of needle.2732} else { // constant needlecnt2733guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");2734assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");2735//5:2736addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.2737lwz(n_start, 0, needle); // Load first 2 characters of needle.2738addi(addr, haystack, -2); // Accesses use pre-increment.2739slwi(ch1, ch1, 1); // Scale to number of bytes.2740add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).2741li(needlecnt, needlecntval-2); // Rest of needle.2742}27432744// Main Loop (now we have at least 3 characters).2745//11:2746Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;2747bind(L_OuterLoop); // Search for 1st 2 characters.2748Register addr_diff = tmp4;2749subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.2750addi(addr, addr, 2); // This is the new address we want to use for comparing.2751srdi_(ch2, addr_diff, 2);2752beq(CCR0, L_FinalCheck); // 2 characters left?2753mtctr(ch2); // addr_diff/42754//16:2755bind(L_InnerLoop); // Main work horse (2x unrolled search loop)2756lwz(ch1, 0, addr); // Load 2 characters of haystack (ignore alignment).2757lwz(ch2, 2, addr);2758cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).2759cmpw(CCR1, ch2, n_start);2760beq(CCR0, L_Comp1); // Did we find the needle start?2761beq(CCR1, L_Comp2);2762addi(addr, addr, 4);2763bdnz(L_InnerLoop);2764//24:2765bind(L_FinalCheck);2766rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.2767beq(CCR0, L_NotFound);2768lwz(ch1, 0, addr); // One position left at which we have to compare.2769cmpw(CCR1, ch1, n_start);2770beq(CCR1, L_Comp3);2771//29:2772bind(L_NotFound);2773li(result, -1); // not found2774b(L_End);277527762777// **************************************************************************************************2778// Special Case: unfortunately, the variable needle case can be called with needlecnt<22779// **************************************************************************************************2780//31:2781if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.2782int nopcnt = 5;2783if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).2784if (needlecntval == 0) { // We have to handle these cases separately.2785Label L_OneCharLoop;2786bind(L_TooShort);2787mtctr(haycnt);2788lhz(n_start, 0, needle); // First character of needle2789bind(L_OneCharLoop);2790lhzu(ch1, 2, addr);2791cmpw(CCR1, ch1, n_start);2792beq(CCR1, L_Found); // Did we find the one character needle?2793bdnz(L_OneCharLoop);2794li(result, -1); // Not found.2795b(L_End);2796} // 8 instructions, so no impact on alignment.2797for (int x = 0; x < nopcnt; ++x) nop();2798}27992800// **************************************************************************************************2801// Regular Case Part II: compare rest of needle (first 2 characters have been compared already)2802// **************************************************************************************************28032804// Compare the rest2805//36 if needlecntval==0, else 37:2806bind(L_Comp2);2807addi(addr, addr, 2); // First comparison has failed, 2nd one hit.2808bind(L_Comp1); // Addr points to possible needle start.2809bind(L_Comp3); // Could have created a copy and use a different return address but saving code size here.2810if (needlecntval != 2) { // Const needlecnt==2?2811if (needlecntval != 3) {2812if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?2813Register ind_reg = tmp4;2814li(ind_reg, 2*2); // First 2 characters are already compared, use index 2.2815mtctr(needlecnt); // Decremented by 2, still > 0.2816//40:2817Label L_CompLoop;2818bind(L_CompLoop);2819lhzx(ch2, needle, ind_reg);2820lhzx(ch1, addr, ind_reg);2821cmpw(CCR1, ch1, ch2);2822bne(CCR1, L_OuterLoop);2823addi(ind_reg, ind_reg, 2);2824bdnz(L_CompLoop);2825} else { // No loop required if there's only one needle character left.2826lhz(ch2, 2*2, needle);2827lhz(ch1, 2*2, addr);2828cmpw(CCR1, ch1, ch2);2829bne(CCR1, L_OuterLoop);2830}2831}2832// Return index ...2833//46:2834bind(L_Found);2835subf(addr, haystack, addr); // relative to haystack, ...2836srdi(result, addr, 1); // in characters.2837//48:2838bind(L_End);2839}28402841// Implementation of Compare for jchar arrays.2842//2843// Kills the registers str1, str2, cnt1, cnt2.2844// Kills cr0, ctr.2845// Assumes that result differes from the input registers.2846void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,2847Register result_reg, Register tmp_reg) {2848assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);28492850Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;2851Register cnt_diff = R0,2852limit_reg = cnt1_reg,2853chr1_reg = result_reg,2854chr2_reg = cnt2_reg,2855addr_diff = str2_reg;28562857// Offset 0 should be 32 byte aligned.2858//-4:2859dcbtct(str1_reg, 0x00); // Indicate R/O access to str1.2860dcbtct(str2_reg, 0x00); // Indicate R/O access to str2.2861//-2:2862// Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).2863subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/22864subf_(addr_diff, str1_reg, str2_reg); // alias?2865beq(CCR0, Ldone); // return cnt difference if both ones are identical2866srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)2867mr(cnt_diff, result_reg);2868andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 02869add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0?2870beq(CCR0, Ldone); // return cnt difference if one has 0 length28712872lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch2873lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch2874addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-12875subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch2876bne(CCR0, Ldone); // optional: early out if first characters mismatch28772878// Set loop counter by scaling down tmp_reg2879srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/42880ble(CCR0, Lslow_case); // need >4 characters for fast loop2881andi(limit_reg, tmp_reg, 4-1); // remaining characters28822883// Adapt str1_reg str2_reg for the first loop iteration2884mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/42885addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop2886//16:2887// Compare the rest of the characters2888bind(Lfast_loop);2889ld(chr1_reg, 0, str1_reg);2890ldx(chr2_reg, str1_reg, addr_diff);2891cmpd(CCR0, chr2_reg, chr1_reg);2892bne(CCR0, Lslow_case); // return chr1_reg2893addi(str1_reg, str1_reg, 4*2);2894bdnz(Lfast_loop);2895addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing2896//23:2897bind(Lslow_case);2898mtctr(limit_reg);2899//24:2900bind(Lslow_loop);2901lhz(chr1_reg, 0, str1_reg);2902lhzx(chr2_reg, str1_reg, addr_diff);2903subf_(result_reg, chr2_reg, chr1_reg);2904bne(CCR0, Ldone); // return chr1_reg2905addi(str1_reg, str1_reg, 1*2);2906bdnz(Lslow_loop);2907//30:2908// If strings are equal up to min length, return the length difference.2909mr(result_reg, cnt_diff);2910nop(); // alignment2911//32:2912// Otherwise, return the difference between the first mismatched chars.2913bind(Ldone);2914}291529162917// Compare char[] arrays.2918//2919// str1_reg USE only2920// str2_reg USE only2921// cnt_reg USE_DEF, due to tmp reg shortage2922// result_reg DEF only, might compromise USE only registers2923void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,2924Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,2925Register tmp5_reg) {29262927// Str1 may be the same register as str2 which can occur e.g. after scalar replacement.2928assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);2929assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);29302931// Offset 0 should be 32 byte aligned.2932Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;2933Register index_reg = tmp5_reg;2934Register cbc_iter = tmp4_reg;29352936//-1:2937dcbtct(str1_reg, 0x00); // Indicate R/O access to str1.2938dcbtct(str2_reg, 0x00); // Indicate R/O access to str2.2939//1:2940andi(cbc_iter, cnt_reg, 4-1); // Remaining iterations after 4 java characters per iteration loop.2941li(index_reg, 0); // init2942li(result_reg, 0); // assume false2943srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).29442945cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0)2946beq(CCR0, Linit_cbc); // too short2947mtctr(tmp2_reg);2948//8:2949bind(Lloop);2950ldx(tmp1_reg, str1_reg, index_reg);2951ldx(tmp2_reg, str2_reg, index_reg);2952cmpd(CCR0, tmp1_reg, tmp2_reg);2953bne(CCR0, Ldone_false); // Unequal char pair found -> done.2954addi(index_reg, index_reg, 4*sizeof(jchar));2955bdnz(Lloop);2956//14:2957bind(Linit_cbc);2958beq(CCR1, Ldone_true);2959mtctr(cbc_iter);2960//16:2961bind(Lcbc);2962lhzx(tmp1_reg, str1_reg, index_reg);2963lhzx(tmp2_reg, str2_reg, index_reg);2964cmpw(CCR0, tmp1_reg, tmp2_reg);2965bne(CCR0, Ldone_false); // Unequal char pair found -> done.2966addi(index_reg, index_reg, 1*sizeof(jchar));2967bdnz(Lcbc);2968nop();2969bind(Ldone_true);2970li(result_reg, 1);2971//24:2972bind(Ldone_false);2973}297429752976void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,2977Register tmp1_reg, Register tmp2_reg) {2978// Str1 may be the same register as str2 which can occur e.g. after scalar replacement.2979assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);2980assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);2981assert(sizeof(jchar) == 2, "must be");2982assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");29832984Label Ldone_false;29852986if (cntval < 16) { // short case2987if (cntval != 0) li(result_reg, 0); // assume false29882989const int num_bytes = cntval*sizeof(jchar);2990int index = 0;2991for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {2992ld(tmp1_reg, index, str1_reg);2993ld(tmp2_reg, index, str2_reg);2994cmpd(CCR0, tmp1_reg, tmp2_reg);2995bne(CCR0, Ldone_false);2996}2997if (cntval & 2) {2998lwz(tmp1_reg, index, str1_reg);2999lwz(tmp2_reg, index, str2_reg);3000cmpw(CCR0, tmp1_reg, tmp2_reg);3001bne(CCR0, Ldone_false);3002index += 4;3003}3004if (cntval & 1) {3005lhz(tmp1_reg, index, str1_reg);3006lhz(tmp2_reg, index, str2_reg);3007cmpw(CCR0, tmp1_reg, tmp2_reg);3008bne(CCR0, Ldone_false);3009}3010// fallthrough: true3011} else {3012Label Lloop;3013Register index_reg = tmp1_reg;3014const int loopcnt = cntval/4;3015assert(loopcnt > 0, "must be");3016// Offset 0 should be 32 byte aligned.3017//2:3018dcbtct(str1_reg, 0x00); // Indicate R/O access to str1.3019dcbtct(str2_reg, 0x00); // Indicate R/O access to str2.3020li(tmp2_reg, loopcnt);3021li(index_reg, 0); // init3022li(result_reg, 0); // assume false3023mtctr(tmp2_reg);3024//8:3025bind(Lloop);3026ldx(R0, str1_reg, index_reg);3027ldx(tmp2_reg, str2_reg, index_reg);3028cmpd(CCR0, R0, tmp2_reg);3029bne(CCR0, Ldone_false); // Unequal char pair found -> done.3030addi(index_reg, index_reg, 4*sizeof(jchar));3031bdnz(Lloop);3032//14:3033if (cntval & 2) {3034lwzx(R0, str1_reg, index_reg);3035lwzx(tmp2_reg, str2_reg, index_reg);3036cmpw(CCR0, R0, tmp2_reg);3037bne(CCR0, Ldone_false);3038if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));3039}3040if (cntval & 1) {3041lhzx(R0, str1_reg, index_reg);3042lhzx(tmp2_reg, str2_reg, index_reg);3043cmpw(CCR0, R0, tmp2_reg);3044bne(CCR0, Ldone_false);3045}3046// fallthru: true3047}3048li(result_reg, 1);3049bind(Ldone_false);3050}30513052// Helpers for Intrinsic Emitters3053//3054// Revert the byte order of a 32bit value in a register3055// src: 0x445566773056// dst: 0x776655443057// Three steps to obtain the result:3058// 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word3059// into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.3060// This value initializes dst.3061// 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost3062// byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.3063// This value is mask inserted into dst with a [0..23] mask of 1s.3064// 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.3065// This value is mask inserted into dst with a [8..15] mask of 1s.3066void MacroAssembler::load_reverse_32(Register dst, Register src) {3067assert_different_registers(dst, src);30683069rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.3070rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.3071rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.3072}30733074// Calculate the column addresses of the crc32 lookup table into distinct registers.3075// This loop-invariant calculation is moved out of the loop body, reducing the loop3076// body size from 20 to 16 instructions.3077// Returns the offset that was used to calculate the address of column tc3.3078// Due to register shortage, setting tc3 may overwrite table. With the return offset3079// at hand, the original table address can be easily reconstructed.3080int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {30813082#ifdef VM_LITTLE_ENDIAN3083// This is what we implement (the DOLIT4 part):3084// ========================================================================= */3085// #define DOLIT4 c ^= *buf4++; \3086// c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \3087// crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]3088// #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT43089// ========================================================================= */3090const int ix0 = 3*(4*CRC32_COLUMN_SIZE);3091const int ix1 = 2*(4*CRC32_COLUMN_SIZE);3092const int ix2 = 1*(4*CRC32_COLUMN_SIZE);3093const int ix3 = 0*(4*CRC32_COLUMN_SIZE);3094#else3095// This is what we implement (the DOBIG4 part):3096// =========================================================================3097// #define DOBIG4 c ^= *++buf4; \3098// c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \3099// crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]3100// #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG43101// =========================================================================3102const int ix0 = 4*(4*CRC32_COLUMN_SIZE);3103const int ix1 = 5*(4*CRC32_COLUMN_SIZE);3104const int ix2 = 6*(4*CRC32_COLUMN_SIZE);3105const int ix3 = 7*(4*CRC32_COLUMN_SIZE);3106#endif3107assert_different_registers(table, tc0, tc1, tc2);3108assert(table == tc3, "must be!");31093110if (ix0 != 0) addi(tc0, table, ix0);3111if (ix1 != 0) addi(tc1, table, ix1);3112if (ix2 != 0) addi(tc2, table, ix2);3113if (ix3 != 0) addi(tc3, table, ix3);31143115return ix3;3116}31173118/**3119* uint32_t crc;3120* timesXtoThe32[crc & 0xFF] ^ (crc >> 8);3121*/3122void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {3123assert_different_registers(crc, table, tmp);3124assert_different_registers(val, table);31253126if (crc == val) { // Must rotate first to use the unmodified value.3127rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.3128// As we use a word (4-byte) instruction, we have to adapt the mask bit positions.3129srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.3130} else {3131srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.3132rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.3133}3134lwzx(tmp, table, tmp);3135xorr(crc, crc, tmp);3136}31373138/**3139* uint32_t crc;3140* timesXtoThe32[crc & 0xFF] ^ (crc >> 8);3141*/3142void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {3143fold_byte_crc32(crc, crc, table, tmp);3144}31453146/**3147* Emits code to update CRC-32 with a byte value according to constants in table.3148*3149* @param [in,out]crc Register containing the crc.3150* @param [in]val Register containing the byte to fold into the CRC.3151* @param [in]table Register containing the table of crc constants.3152*3153* uint32_t crc;3154* val = crc_table[(val ^ crc) & 0xFF];3155* crc = val ^ (crc >> 8);3156*/3157void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {3158BLOCK_COMMENT("update_byte_crc32:");3159xorr(val, val, crc);3160fold_byte_crc32(crc, val, table, val);3161}31623163/**3164* @param crc register containing existing CRC (32-bit)3165* @param buf register pointing to input byte buffer (byte*)3166* @param len register containing number of bytes3167* @param table register pointing to CRC table3168*/3169void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,3170Register data, bool loopAlignment, bool invertCRC) {3171assert_different_registers(crc, buf, len, table, data);31723173Label L_mainLoop, L_done;3174const int mainLoop_stepping = 1;3175const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;31763177// Process all bytes in a single-byte loop.3178cmpdi(CCR0, len, 0); // Anything to do?3179mtctr(len);3180beq(CCR0, L_done);31813182if (invertCRC) {3183nand(crc, crc, crc); // ~c3184}31853186align(mainLoop_alignment);3187BIND(L_mainLoop);3188lbz(data, 0, buf); // Byte from buffer, zero-extended.3189addi(buf, buf, mainLoop_stepping); // Advance buffer position.3190update_byte_crc32(crc, data, table);3191bdnz(L_mainLoop); // Iterate.31923193if (invertCRC) {3194nand(crc, crc, crc); // ~c3195}31963197bind(L_done);3198}31993200/**3201* Emits code to update CRC-32 with a 4-byte value according to constants in table3202* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c3203*/3204// A not on the lookup table address(es):3205// The lookup table consists of two sets of four columns each.3206// The columns {0..3} are used for little-endian machines.3207// The columns {4..7} are used for big-endian machines.3208// To save the effort of adding the column offset to the table address each time3209// a table element is looked up, it is possible to pass the pre-calculated3210// column addresses.3211// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.3212void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,3213Register t0, Register t1, Register t2, Register t3,3214Register tc0, Register tc1, Register tc2, Register tc3) {3215assert_different_registers(crc, t3);32163217// XOR crc with next four bytes of buffer.3218lwz(t3, bufDisp, buf);3219if (bufInc != 0) {3220addi(buf, buf, bufInc);3221}3222xorr(t3, t3, crc);32233224// Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.3225rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 23226rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 23227rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 23228rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 232293230// Use the pre-calculated column addresses.3231// Load pre-calculated table values.3232lwzx(t0, tc0, t0);3233lwzx(t1, tc1, t1);3234lwzx(t2, tc2, t2);3235lwzx(t3, tc3, t3);32363237// Calculate new crc from table values.3238xorr(t0, t0, t1);3239xorr(t2, t2, t3);3240xorr(crc, t0, t2); // Now crc contains the final checksum value.3241}32423243/**3244* @param crc register containing existing CRC (32-bit)3245* @param buf register pointing to input byte buffer (byte*)3246* @param len register containing number of bytes3247* @param table register pointing to CRC table3248*3249* Uses R9..R12 as work register. Must be saved/restored by caller!3250*/3251void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,3252Register t0, Register t1, Register t2, Register t3,3253Register tc0, Register tc1, Register tc2, Register tc3) {3254assert_different_registers(crc, buf, len, table);32553256Label L_mainLoop, L_tail;3257Register tmp = t0;3258Register data = t0;3259Register tmp2 = t1;3260const int mainLoop_stepping = 8;3261const int tailLoop_stepping = 1;3262const int log_stepping = exact_log2(mainLoop_stepping);3263const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;3264const int complexThreshold = 2*mainLoop_stepping;32653266// Don't test for len <= 0 here. This pathological case should not occur anyway.3267// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.3268// The situation itself is detected and handled correctly by the conditional branches3269// following aghi(len, -stepping) and aghi(len, +stepping).3270assert(tailLoop_stepping == 1, "check tailLoop_stepping!");32713272BLOCK_COMMENT("kernel_crc32_2word {");32733274nand(crc, crc, crc); // ~c32753276// Check for short (<mainLoop_stepping) buffer.3277cmpdi(CCR0, len, complexThreshold);3278blt(CCR0, L_tail);32793280// Pre-mainLoop alignment did show a slight (1%) positive effect on performance.3281// We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.3282{3283// Align buf addr to mainLoop_stepping boundary.3284neg(tmp2, buf); // Calculate # preLoop iterations for alignment.3285rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.32863287if (complexThreshold > mainLoop_stepping) {3288sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3289} else {3290sub(tmp, len, tmp2); // Remaining bytes for main loop.3291cmpdi(CCR0, tmp, mainLoop_stepping);3292blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing3293mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3294}3295update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);3296}32973298srdi(tmp2, len, log_stepping); // #iterations for mainLoop3299andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop3300mtctr(tmp2);33013302#ifdef VM_LITTLE_ENDIAN3303Register crc_rv = crc;3304#else3305Register crc_rv = tmp; // Load_reverse needs separate registers to work on.3306// Occupies tmp, but frees up crc.3307load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.3308tmp = crc;3309#endif33103311int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);33123313align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.3314BIND(L_mainLoop);3315update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);3316update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);3317bdnz(L_mainLoop);33183319#ifndef VM_LITTLE_ENDIAN3320load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.3321tmp = crc_rv; // Tmp uses it's original register again.3322#endif33233324// Restore original table address for tailLoop.3325if (reconstructTableOffset != 0) {3326addi(table, table, -reconstructTableOffset);3327}33283329// Process last few (<complexThreshold) bytes of buffer.3330BIND(L_tail);3331update_byteLoop_crc32(crc, buf, len, table, data, false, false);33323333nand(crc, crc, crc); // ~c3334BLOCK_COMMENT("} kernel_crc32_2word");3335}33363337/**3338* @param crc register containing existing CRC (32-bit)3339* @param buf register pointing to input byte buffer (byte*)3340* @param len register containing number of bytes3341* @param table register pointing to CRC table3342*3343* uses R9..R12 as work register. Must be saved/restored by caller!3344*/3345void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,3346Register t0, Register t1, Register t2, Register t3,3347Register tc0, Register tc1, Register tc2, Register tc3) {3348assert_different_registers(crc, buf, len, table);33493350Label L_mainLoop, L_tail;3351Register tmp = t0;3352Register data = t0;3353Register tmp2 = t1;3354const int mainLoop_stepping = 4;3355const int tailLoop_stepping = 1;3356const int log_stepping = exact_log2(mainLoop_stepping);3357const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;3358const int complexThreshold = 2*mainLoop_stepping;33593360// Don't test for len <= 0 here. This pathological case should not occur anyway.3361// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.3362// The situation itself is detected and handled correctly by the conditional branches3363// following aghi(len, -stepping) and aghi(len, +stepping).3364assert(tailLoop_stepping == 1, "check tailLoop_stepping!");33653366BLOCK_COMMENT("kernel_crc32_1word {");33673368nand(crc, crc, crc); // ~c33693370// Check for short (<mainLoop_stepping) buffer.3371cmpdi(CCR0, len, complexThreshold);3372blt(CCR0, L_tail);33733374// Pre-mainLoop alignment did show a slight (1%) positive effect on performance.3375// We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.3376{3377// Align buf addr to mainLoop_stepping boundary.3378neg(tmp2, buf); // Calculate # preLoop iterations for alignment.3379rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.33803381if (complexThreshold > mainLoop_stepping) {3382sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3383} else {3384sub(tmp, len, tmp2); // Remaining bytes for main loop.3385cmpdi(CCR0, tmp, mainLoop_stepping);3386blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing3387mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).3388}3389update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);3390}33913392srdi(tmp2, len, log_stepping); // #iterations for mainLoop3393andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop3394mtctr(tmp2);33953396#ifdef VM_LITTLE_ENDIAN3397Register crc_rv = crc;3398#else3399Register crc_rv = tmp; // Load_reverse needs separate registers to work on.3400// Occupies tmp, but frees up crc.3401load_reverse_32(crc_rv, crc); // evert byte order because we are dealing with big-endian data.3402tmp = crc;3403#endif34043405int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);34063407align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.3408BIND(L_mainLoop);3409update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);3410bdnz(L_mainLoop);34113412#ifndef VM_LITTLE_ENDIAN3413load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.3414tmp = crc_rv; // Tmp uses it's original register again.3415#endif34163417// Restore original table address for tailLoop.3418if (reconstructTableOffset != 0) {3419addi(table, table, -reconstructTableOffset);3420}34213422// Process last few (<complexThreshold) bytes of buffer.3423BIND(L_tail);3424update_byteLoop_crc32(crc, buf, len, table, data, false, false);34253426nand(crc, crc, crc); // ~c3427BLOCK_COMMENT("} kernel_crc32_1word");3428}34293430/**3431* @param crc register containing existing CRC (32-bit)3432* @param buf register pointing to input byte buffer (byte*)3433* @param len register containing number of bytes3434* @param table register pointing to CRC table3435*3436* Uses R7_ARG5, R8_ARG6 as work registers.3437*/3438void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,3439Register t0, Register t1, Register t2, Register t3) {3440assert_different_registers(crc, buf, len, table);34413442Register data = t0; // Holds the current byte to be folded into crc.34433444BLOCK_COMMENT("kernel_crc32_1byte {");34453446// Process all bytes in a single-byte loop.3447update_byteLoop_crc32(crc, buf, len, table, data, true, true);34483449BLOCK_COMMENT("} kernel_crc32_1byte");3450}34513452/**3453* @param crc register containing existing CRC (32-bit)3454* @param buf register pointing to input byte buffer (byte*)3455* @param len register containing number of bytes3456* @param table register pointing to CRC table3457* @param constants register pointing to CRC table for 128-bit aligned memory3458* @param barretConstants register pointing to table for barrett reduction3459* @param t0 volatile register3460* @param t1 volatile register3461* @param t2 volatile register3462* @param t3 volatile register3463*/3464void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,3465Register constants, Register barretConstants,3466Register t0, Register t1, Register t2, Register t3, Register t4) {3467assert_different_registers(crc, buf, len, table);34683469Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;34703471Register prealign = t0;3472Register postalign = t0;34733474BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");34753476// 1. use kernel_crc32_1word for shorter than 384bit3477clrldi(len, len, 32);3478cmpdi(CCR0, len, 384);3479bge(CCR0, L_start);34803481Register tc0 = t4;3482Register tc1 = constants;3483Register tc2 = barretConstants;3484kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table);3485b(L_end);34863487BIND(L_start);34883489// 2. ~c3490nand(crc, crc, crc);34913492// 3. calculate from 0 to first 128bit-aligned address3493clrldi_(prealign, buf, 57);3494beq(CCR0, L_alignedHead);34953496subfic(prealign, prealign, 128);34973498subf(len, prealign, len);3499update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);35003501// 4. calculate from first 128bit-aligned address to last 128bit-aligned address3502BIND(L_alignedHead);35033504clrldi(postalign, len, 57);3505subf(len, postalign, len);35063507// len must be more than 256bit3508kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);35093510// 5. calculate remaining3511cmpdi(CCR0, postalign, 0);3512beq(CCR0, L_tail);35133514update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);35153516BIND(L_tail);35173518// 6. ~c3519nand(crc, crc, crc);35203521BIND(L_end);35223523BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");3524}35253526/**3527* @param crc register containing existing CRC (32-bit)3528* @param buf register pointing to input byte buffer (byte*)3529* @param len register containing number of bytes3530* @param constants register pointing to CRC table for 128-bit aligned memory3531* @param barretConstants register pointing to table for barrett reduction3532* @param t0 volatile register3533* @param t1 volatile register3534* @param t2 volatile register3535*/3536void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,3537Register constants, Register barretConstants, Register t0, Register t1, Register t2) {3538Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;3539Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;3540Label L_1, L_2, L_3, L_4;35413542Register rLoaded = t0;3543Register rTmp1 = t1;3544Register rTmp2 = t2;3545Register off16 = R22;3546Register off32 = R23;3547Register off48 = R24;3548Register off64 = R25;3549Register off80 = R26;3550Register off96 = R27;3551Register off112 = R28;3552Register rIdx = R29;3553Register rMax = R30;3554Register constantsPos = R31;35553556VectorRegister mask_32bit = VR24;3557VectorRegister mask_64bit = VR25;3558VectorRegister zeroes = VR26;3559VectorRegister const1 = VR27;3560VectorRegister const2 = VR28;35613562// Save non-volatile vector registers (frameless).3563Register offset = t1; int offsetInt = 0;3564offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);3565offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);3566offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);3567offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);3568offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);3569offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);3570offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);3571offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);3572offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);3573offsetInt -= 8; std(R22, offsetInt, R1_SP);3574offsetInt -= 8; std(R23, offsetInt, R1_SP);3575offsetInt -= 8; std(R24, offsetInt, R1_SP);3576offsetInt -= 8; std(R25, offsetInt, R1_SP);3577offsetInt -= 8; std(R26, offsetInt, R1_SP);3578offsetInt -= 8; std(R27, offsetInt, R1_SP);3579offsetInt -= 8; std(R28, offsetInt, R1_SP);3580offsetInt -= 8; std(R29, offsetInt, R1_SP);3581offsetInt -= 8; std(R30, offsetInt, R1_SP);3582offsetInt -= 8; std(R31, offsetInt, R1_SP);35833584// Set constants3585li(off16, 16);3586li(off32, 32);3587li(off48, 48);3588li(off64, 64);3589li(off80, 80);3590li(off96, 96);3591li(off112, 112);35923593clrldi(crc, crc, 32);35943595vxor(zeroes, zeroes, zeroes);3596vspltisw(VR0, -1);35973598vsldoi(mask_32bit, zeroes, VR0, 4);3599vsldoi(mask_64bit, zeroes, VR0, 8);36003601// Get the initial value into v83602vxor(VR8, VR8, VR8);3603mtvrd(VR8, crc);3604vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits36053606li (rLoaded, 0);36073608rldicr(rIdx, len, 0, 56);36093610{3611BIND(L_1);3612// Checksum in blocks of MAX_SIZE (32768)3613lis(rMax, 0);3614ori(rMax, rMax, 32768);3615mr(rTmp2, rMax);3616cmpd(CCR0, rIdx, rMax);3617bgt(CCR0, L_2);3618mr(rMax, rIdx);36193620BIND(L_2);3621subf(rIdx, rMax, rIdx);36223623// our main loop does 128 bytes at a time3624srdi(rMax, rMax, 7);36253626/*3627* Work out the offset into the constants table to start at. Each3628* constant is 16 bytes, and it is used against 128 bytes of input3629* data - 128 / 16 = 83630*/3631sldi(rTmp1, rMax, 4);3632srdi(rTmp2, rTmp2, 3);3633subf(rTmp1, rTmp1, rTmp2);36343635// We reduce our final 128 bytes in a separate step3636addi(rMax, rMax, -1);3637mtctr(rMax);36383639// Find the start of our constants3640add(constantsPos, constants, rTmp1);36413642// zero VR0-v7 which will contain our checksums3643vxor(VR0, VR0, VR0);3644vxor(VR1, VR1, VR1);3645vxor(VR2, VR2, VR2);3646vxor(VR3, VR3, VR3);3647vxor(VR4, VR4, VR4);3648vxor(VR5, VR5, VR5);3649vxor(VR6, VR6, VR6);3650vxor(VR7, VR7, VR7);36513652lvx(const1, constantsPos);36533654/*3655* If we are looping back to consume more data we use the values3656* already in VR16-v23.3657*/3658cmpdi(CCR0, rLoaded, 1);3659beq(CCR0, L_3);3660{36613662// First warm up pass3663lvx(VR16, buf);3664lvx(VR17, off16, buf);3665lvx(VR18, off32, buf);3666lvx(VR19, off48, buf);3667lvx(VR20, off64, buf);3668lvx(VR21, off80, buf);3669lvx(VR22, off96, buf);3670lvx(VR23, off112, buf);3671addi(buf, buf, 8*16);36723673// xor in initial value3674vxor(VR16, VR16, VR8);3675}36763677BIND(L_3);3678bdz(L_first_warm_up_done);36793680addi(constantsPos, constantsPos, 16);3681lvx(const2, constantsPos);36823683// Second warm up pass3684vpmsumd(VR8, VR16, const1);3685lvx(VR16, buf);36863687vpmsumd(VR9, VR17, const1);3688lvx(VR17, off16, buf);36893690vpmsumd(VR10, VR18, const1);3691lvx(VR18, off32, buf);36923693vpmsumd(VR11, VR19, const1);3694lvx(VR19, off48, buf);36953696vpmsumd(VR12, VR20, const1);3697lvx(VR20, off64, buf);36983699vpmsumd(VR13, VR21, const1);3700lvx(VR21, off80, buf);37013702vpmsumd(VR14, VR22, const1);3703lvx(VR22, off96, buf);37043705vpmsumd(VR15, VR23, const1);3706lvx(VR23, off112, buf);37073708addi(buf, buf, 8 * 16);37093710bdz(L_first_cool_down);37113712/*3713* main loop. We modulo schedule it such that it takes three iterations3714* to complete - first iteration load, second iteration vpmsum, third3715* iteration xor.3716*/3717{3718BIND(L_4);3719lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);37203721vxor(VR0, VR0, VR8);3722vpmsumd(VR8, VR16, const2);3723lvx(VR16, buf);37243725vxor(VR1, VR1, VR9);3726vpmsumd(VR9, VR17, const2);3727lvx(VR17, off16, buf);37283729vxor(VR2, VR2, VR10);3730vpmsumd(VR10, VR18, const2);3731lvx(VR18, off32, buf);37323733vxor(VR3, VR3, VR11);3734vpmsumd(VR11, VR19, const2);3735lvx(VR19, off48, buf);3736lvx(const2, constantsPos);37373738vxor(VR4, VR4, VR12);3739vpmsumd(VR12, VR20, const1);3740lvx(VR20, off64, buf);37413742vxor(VR5, VR5, VR13);3743vpmsumd(VR13, VR21, const1);3744lvx(VR21, off80, buf);37453746vxor(VR6, VR6, VR14);3747vpmsumd(VR14, VR22, const1);3748lvx(VR22, off96, buf);37493750vxor(VR7, VR7, VR15);3751vpmsumd(VR15, VR23, const1);3752lvx(VR23, off112, buf);37533754addi(buf, buf, 8 * 16);37553756bdnz(L_4);3757}37583759BIND(L_first_cool_down);37603761// First cool down pass3762lvx(const1, constantsPos);3763addi(constantsPos, constantsPos, 16);37643765vxor(VR0, VR0, VR8);3766vpmsumd(VR8, VR16, const1);37673768vxor(VR1, VR1, VR9);3769vpmsumd(VR9, VR17, const1);37703771vxor(VR2, VR2, VR10);3772vpmsumd(VR10, VR18, const1);37733774vxor(VR3, VR3, VR11);3775vpmsumd(VR11, VR19, const1);37763777vxor(VR4, VR4, VR12);3778vpmsumd(VR12, VR20, const1);37793780vxor(VR5, VR5, VR13);3781vpmsumd(VR13, VR21, const1);37823783vxor(VR6, VR6, VR14);3784vpmsumd(VR14, VR22, const1);37853786vxor(VR7, VR7, VR15);3787vpmsumd(VR15, VR23, const1);37883789BIND(L_second_cool_down);3790// Second cool down pass3791vxor(VR0, VR0, VR8);3792vxor(VR1, VR1, VR9);3793vxor(VR2, VR2, VR10);3794vxor(VR3, VR3, VR11);3795vxor(VR4, VR4, VR12);3796vxor(VR5, VR5, VR13);3797vxor(VR6, VR6, VR14);3798vxor(VR7, VR7, VR15);37993800/*3801* vpmsumd produces a 96 bit result in the least significant bits3802* of the register. Since we are bit reflected we have to shift it3803* left 32 bits so it occupies the least significant bits in the3804* bit reflected domain.3805*/3806vsldoi(VR0, VR0, zeroes, 4);3807vsldoi(VR1, VR1, zeroes, 4);3808vsldoi(VR2, VR2, zeroes, 4);3809vsldoi(VR3, VR3, zeroes, 4);3810vsldoi(VR4, VR4, zeroes, 4);3811vsldoi(VR5, VR5, zeroes, 4);3812vsldoi(VR6, VR6, zeroes, 4);3813vsldoi(VR7, VR7, zeroes, 4);38143815// xor with last 1024 bits3816lvx(VR8, buf);3817lvx(VR9, off16, buf);3818lvx(VR10, off32, buf);3819lvx(VR11, off48, buf);3820lvx(VR12, off64, buf);3821lvx(VR13, off80, buf);3822lvx(VR14, off96, buf);3823lvx(VR15, off112, buf);3824addi(buf, buf, 8 * 16);38253826vxor(VR16, VR0, VR8);3827vxor(VR17, VR1, VR9);3828vxor(VR18, VR2, VR10);3829vxor(VR19, VR3, VR11);3830vxor(VR20, VR4, VR12);3831vxor(VR21, VR5, VR13);3832vxor(VR22, VR6, VR14);3833vxor(VR23, VR7, VR15);38343835li(rLoaded, 1);3836cmpdi(CCR0, rIdx, 0);3837addi(rIdx, rIdx, 128);3838bne(CCR0, L_1);3839}38403841// Work out how many bytes we have left3842andi_(len, len, 127);38433844// Calculate where in the constant table we need to start3845subfic(rTmp1, len, 128);3846add(constantsPos, constantsPos, rTmp1);38473848// How many 16 byte chunks are in the tail3849srdi(rIdx, len, 4);3850mtctr(rIdx);38513852/*3853* Reduce the previously calculated 1024 bits to 64 bits, shifting3854* 32 bits to include the trailing 32 bits of zeros3855*/3856lvx(VR0, constantsPos);3857lvx(VR1, off16, constantsPos);3858lvx(VR2, off32, constantsPos);3859lvx(VR3, off48, constantsPos);3860lvx(VR4, off64, constantsPos);3861lvx(VR5, off80, constantsPos);3862lvx(VR6, off96, constantsPos);3863lvx(VR7, off112, constantsPos);3864addi(constantsPos, constantsPos, 8 * 16);38653866vpmsumw(VR0, VR16, VR0);3867vpmsumw(VR1, VR17, VR1);3868vpmsumw(VR2, VR18, VR2);3869vpmsumw(VR3, VR19, VR3);3870vpmsumw(VR4, VR20, VR4);3871vpmsumw(VR5, VR21, VR5);3872vpmsumw(VR6, VR22, VR6);3873vpmsumw(VR7, VR23, VR7);38743875// Now reduce the tail (0 - 112 bytes)3876cmpdi(CCR0, rIdx, 0);3877beq(CCR0, L_XOR);38783879lvx(VR16, buf); addi(buf, buf, 16);3880lvx(VR17, constantsPos);3881vpmsumw(VR16, VR16, VR17);3882vxor(VR0, VR0, VR16);3883beq(CCR0, L_XOR);38843885lvx(VR16, buf); addi(buf, buf, 16);3886lvx(VR17, off16, constantsPos);3887vpmsumw(VR16, VR16, VR17);3888vxor(VR0, VR0, VR16);3889beq(CCR0, L_XOR);38903891lvx(VR16, buf); addi(buf, buf, 16);3892lvx(VR17, off32, constantsPos);3893vpmsumw(VR16, VR16, VR17);3894vxor(VR0, VR0, VR16);3895beq(CCR0, L_XOR);38963897lvx(VR16, buf); addi(buf, buf, 16);3898lvx(VR17, off48,constantsPos);3899vpmsumw(VR16, VR16, VR17);3900vxor(VR0, VR0, VR16);3901beq(CCR0, L_XOR);39023903lvx(VR16, buf); addi(buf, buf, 16);3904lvx(VR17, off64, constantsPos);3905vpmsumw(VR16, VR16, VR17);3906vxor(VR0, VR0, VR16);3907beq(CCR0, L_XOR);39083909lvx(VR16, buf); addi(buf, buf, 16);3910lvx(VR17, off80, constantsPos);3911vpmsumw(VR16, VR16, VR17);3912vxor(VR0, VR0, VR16);3913beq(CCR0, L_XOR);39143915lvx(VR16, buf); addi(buf, buf, 16);3916lvx(VR17, off96, constantsPos);3917vpmsumw(VR16, VR16, VR17);3918vxor(VR0, VR0, VR16);39193920// Now xor all the parallel chunks together3921BIND(L_XOR);3922vxor(VR0, VR0, VR1);3923vxor(VR2, VR2, VR3);3924vxor(VR4, VR4, VR5);3925vxor(VR6, VR6, VR7);39263927vxor(VR0, VR0, VR2);3928vxor(VR4, VR4, VR6);39293930vxor(VR0, VR0, VR4);39313932b(L_barrett_reduction);39333934BIND(L_first_warm_up_done);3935lvx(const1, constantsPos);3936addi(constantsPos, constantsPos, 16);3937vpmsumd(VR8, VR16, const1);3938vpmsumd(VR9, VR17, const1);3939vpmsumd(VR10, VR18, const1);3940vpmsumd(VR11, VR19, const1);3941vpmsumd(VR12, VR20, const1);3942vpmsumd(VR13, VR21, const1);3943vpmsumd(VR14, VR22, const1);3944vpmsumd(VR15, VR23, const1);3945b(L_second_cool_down);39463947BIND(L_barrett_reduction);39483949lvx(const1, barretConstants);3950addi(barretConstants, barretConstants, 16);3951lvx(const2, barretConstants);39523953vsldoi(VR1, VR0, VR0, 8);3954vxor(VR0, VR0, VR1); // xor two 64 bit results together39553956// shift left one bit3957vspltisb(VR1, 1);3958vsl(VR0, VR0, VR1);39593960vand(VR0, VR0, mask_64bit);39613962/*3963* The reflected version of Barrett reduction. Instead of bit3964* reflecting our data (which is expensive to do), we bit reflect our3965* constants and our algorithm, which means the intermediate data in3966* our vector registers goes from 0-63 instead of 63-0. We can reflect3967* the algorithm because we don't carry in mod 2 arithmetic.3968*/3969vand(VR1, VR0, mask_32bit); // bottom 32 bits of a3970vpmsumd(VR1, VR1, const1); // ma3971vand(VR1, VR1, mask_32bit); // bottom 32bits of ma3972vpmsumd(VR1, VR1, const2); // qn */3973vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)39743975/*3976* Since we are bit reflected, the result (ie the low 32 bits) is in3977* the high 32 bits. We just need to shift it left 4 bytes3978* V0 [ 0 1 X 3 ]3979* V0 [ 0 X 2 3 ]3980*/3981vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits of39823983// Get it into r33984mfvrd(crc, VR0);39853986BIND(L_end);39873988offsetInt = 0;3989// Restore non-volatile Vector registers (frameless).3990offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);3991offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);3992offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);3993offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);3994offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);3995offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);3996offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);3997offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);3998offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);3999offsetInt -= 8; ld(R22, offsetInt, R1_SP);4000offsetInt -= 8; ld(R23, offsetInt, R1_SP);4001offsetInt -= 8; ld(R24, offsetInt, R1_SP);4002offsetInt -= 8; ld(R25, offsetInt, R1_SP);4003offsetInt -= 8; ld(R26, offsetInt, R1_SP);4004offsetInt -= 8; ld(R27, offsetInt, R1_SP);4005offsetInt -= 8; ld(R28, offsetInt, R1_SP);4006offsetInt -= 8; ld(R29, offsetInt, R1_SP);4007offsetInt -= 8; ld(R30, offsetInt, R1_SP);4008offsetInt -= 8; ld(R31, offsetInt, R1_SP);4009}40104011void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {4012assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);40134014BLOCK_COMMENT("kernel_crc32_singleByte:");4015nand(crc, crc, crc); // ~c40164017lbz(tmp, 0, buf); // Byte from buffer, zero-extended.4018update_byte_crc32(crc, tmp, table);40194020nand(crc, crc, crc); // ~c4021}402240234024void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {4025#ifdef ASSERT4026Label ok;4027if (check_equal) {4028beq(CCR0, ok);4029} else {4030bne(CCR0, ok);4031}4032stop(msg, id);4033bind(ok);4034#endif4035}40364037void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,4038Register mem_base, const char* msg, int id) {4039#ifdef ASSERT4040switch (size) {4041case 4:4042lwz(R0, mem_offset, mem_base);4043cmpwi(CCR0, R0, 0);4044break;4045case 8:4046ld(R0, mem_offset, mem_base);4047cmpdi(CCR0, R0, 0);4048break;4049default:4050ShouldNotReachHere();4051}4052asm_assert(check_equal, msg, id);4053#endif // ASSERT4054}40554056void MacroAssembler::verify_thread() {4057if (VerifyThread) {4058unimplemented("'VerifyThread' currently not implemented on PPC");4059}4060}40614062// READ: oop. KILL: R0. Volatile floats perhaps.4063void MacroAssembler::verify_oop(Register oop, const char* msg) {4064if (!VerifyOops) {4065return;4066}40674068address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();4069const Register tmp = R11; // Will be preserved.4070const int nbytes_save = 11*8; // Volatile gprs except R0.4071save_volatile_gprs(R1_SP, -nbytes_save); // except R040724073if (oop == tmp) mr(R4_ARG2, oop);4074save_LR_CR(tmp); // save in old frame4075push_frame_reg_args(nbytes_save, tmp);4076// load FunctionDescriptor** / entry_address *4077load_const_optimized(tmp, fd, R0);4078// load FunctionDescriptor* / entry_address4079ld(tmp, 0, tmp);4080if (oop != tmp) mr_if_needed(R4_ARG2, oop);4081load_const_optimized(R3_ARG1, (address)msg, R0);4082// Call destination for its side effect.4083call_c(tmp);40844085pop_frame();4086restore_LR_CR(tmp);4087restore_volatile_gprs(R1_SP, -nbytes_save); // except R04088}40894090const char* stop_types[] = {4091"stop",4092"untested",4093"unimplemented",4094"shouldnotreachhere"4095};40964097static void stop_on_request(int tp, const char* msg) {4098tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);4099guarantee(false, err_msg("PPC assembly code requires stop: %s", msg));4100}41014102// Call a C-function that prints output.4103void MacroAssembler::stop(int type, const char* msg, int id) {4104#ifndef PRODUCT4105block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));4106#else4107block_comment("stop {");4108#endif41094110// setup arguments4111load_const_optimized(R3_ARG1, type);4112load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);4113call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);4114illtrap();4115emit_int32(id);4116block_comment("} stop;");4117}41184119#ifndef PRODUCT4120// Write pattern 0x0101010101010101 in memory region [low-before, high+after].4121// Val, addr are temp registers.4122// If low == addr, addr is killed.4123// High is preserved.4124void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {4125if (!ZapMemory) return;41264127assert_different_registers(low, val);41284129BLOCK_COMMENT("zap memory region {");4130load_const_optimized(val, 0x0101010101010101);4131int size = before + after;4132if (low == high && size < 5 && size > 0) {4133int offset = -before*BytesPerWord;4134for (int i = 0; i < size; ++i) {4135std(val, offset, low);4136offset += (1*BytesPerWord);4137}4138} else {4139addi(addr, low, -before*BytesPerWord);4140assert_different_registers(high, val);4141if (after) addi(high, high, after * BytesPerWord);4142Label loop;4143bind(loop);4144std(val, 0, addr);4145addi(addr, addr, 8);4146cmpd(CCR6, addr, high);4147ble(CCR6, loop);4148if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.4149}4150BLOCK_COMMENT("} zap memory region");4151}41524153#endif // !PRODUCT41544155SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {4156int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);4157assert(sizeof(bool) == 1, "PowerPC ABI");4158masm->lbz(temp, simm16_offset, temp);4159masm->cmpwi(CCR0, temp, 0);4160masm->beq(CCR0, _label);4161}41624163SkipIfEqualZero::~SkipIfEqualZero() {4164_masm->bind(_label);4165}416641674168