Path: blob/master/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
64440 views
/*1* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.2* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.3*4* This code is free software; you can redistribute it and/or modify it5* under the terms of the GNU General Public License version 2 only, as6* published by the Free Software Foundation.7*8* This code is distributed in the hope that it will be useful, but WITHOUT9* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or10* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License11* version 2 for more details (a copy is included in the LICENSE file that12* accompanied this code).13*14* You should have received a copy of the GNU General Public License version15* 2 along with this work; if not, write to the Free Software Foundation,16* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.17*18* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA19* or visit www.oracle.com if you need additional information or have any20* questions.21*22*/2324#include "precompiled.hpp"25#include "asm/assembler.hpp"26#include "asm/assembler.inline.hpp"27#include "oops/methodData.hpp"28#include "opto/c2_MacroAssembler.hpp"29#include "opto/intrinsicnode.hpp"30#include "opto/opcodes.hpp"31#include "opto/subnode.hpp"32#include "runtime/biasedLocking.hpp"33#include "runtime/objectMonitor.hpp"34#include "runtime/stubRoutines.hpp"3536inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {37switch (vlen_in_bytes) {38case 4: // fall-through39case 8: // fall-through40case 16: return Assembler::AVX_128bit;41case 32: return Assembler::AVX_256bit;42case 64: return Assembler::AVX_512bit;4344default: {45ShouldNotReachHere();46return Assembler::AVX_NoVec;47}48}49}5051void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {52guarantee(PostLoopMultiversioning, "must be");53Assembler::movl(dst, 1);54Assembler::shlxl(dst, dst, src);55Assembler::decl(dst);56Assembler::kmovdl(mask, dst);57Assembler::movl(dst, src);58}5960void C2_MacroAssembler::restorevectmask(KRegister mask) {61guarantee(PostLoopMultiversioning, "must be");62Assembler::knotwl(mask, k0);63}6465#if INCLUDE_RTM_OPT6667// Update rtm_counters based on abort status68// input: abort_status69// rtm_counters (RTMLockingCounters*)70// flags are killed71void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {7273atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));74if (PrintPreciseRTMLockingStatistics) {75for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {76Label check_abort;77testl(abort_status, (1<<i));78jccb(Assembler::equal, check_abort);79atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));80bind(check_abort);81}82}83}8485// Branch if (random & (count-1) != 0), count is 2^n86// tmp, scr and flags are killed87void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {88assert(tmp == rax, "");89assert(scr == rdx, "");90rdtsc(); // modifies EDX:EAX91andptr(tmp, count-1);92jccb(Assembler::notZero, brLabel);93}9495// Perform abort ratio calculation, set no_rtm bit if high ratio96// input: rtm_counters_Reg (RTMLockingCounters* address)97// tmpReg, rtm_counters_Reg and flags are killed98void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,99Register rtm_counters_Reg,100RTMLockingCounters* rtm_counters,101Metadata* method_data) {102Label L_done, L_check_always_rtm1, L_check_always_rtm2;103104if (RTMLockingCalculationDelay > 0) {105// Delay calculation106movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);107testptr(tmpReg, tmpReg);108jccb(Assembler::equal, L_done);109}110// Abort ratio calculation only if abort_count > RTMAbortThreshold111// Aborted transactions = abort_count * 100112// All transactions = total_count * RTMTotalCountIncrRate113// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)114115movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));116cmpptr(tmpReg, RTMAbortThreshold);117jccb(Assembler::below, L_check_always_rtm2);118imulptr(tmpReg, tmpReg, 100);119120Register scrReg = rtm_counters_Reg;121movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));122imulptr(scrReg, scrReg, RTMTotalCountIncrRate);123imulptr(scrReg, scrReg, RTMAbortRatio);124cmpptr(tmpReg, scrReg);125jccb(Assembler::below, L_check_always_rtm1);126if (method_data != NULL) {127// set rtm_state to "no rtm" in MDO128mov_metadata(tmpReg, method_data);129lock();130orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);131}132jmpb(L_done);133bind(L_check_always_rtm1);134// Reload RTMLockingCounters* address135lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));136bind(L_check_always_rtm2);137movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));138cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);139jccb(Assembler::below, L_done);140if (method_data != NULL) {141// set rtm_state to "always rtm" in MDO142mov_metadata(tmpReg, method_data);143lock();144orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);145}146bind(L_done);147}148149// Update counters and perform abort ratio calculation150// input: abort_status_Reg151// rtm_counters_Reg, flags are killed152void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,153Register rtm_counters_Reg,154RTMLockingCounters* rtm_counters,155Metadata* method_data,156bool profile_rtm) {157158assert(rtm_counters != NULL, "should not be NULL when profiling RTM");159// update rtm counters based on rax value at abort160// reads abort_status_Reg, updates flags161lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));162rtm_counters_update(abort_status_Reg, rtm_counters_Reg);163if (profile_rtm) {164// Save abort status because abort_status_Reg is used by following code.165if (RTMRetryCount > 0) {166push(abort_status_Reg);167}168assert(rtm_counters != NULL, "should not be NULL when profiling RTM");169rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);170// restore abort status171if (RTMRetryCount > 0) {172pop(abort_status_Reg);173}174}175}176177// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)178// inputs: retry_count_Reg179// : abort_status_Reg180// output: retry_count_Reg decremented by 1181// flags are killed182void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {183Label doneRetry;184assert(abort_status_Reg == rax, "");185// The abort reason bits are in eax (see all states in rtmLocking.hpp)186// 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)187// if reason is in 0x6 and retry count != 0 then retry188andptr(abort_status_Reg, 0x6);189jccb(Assembler::zero, doneRetry);190testl(retry_count_Reg, retry_count_Reg);191jccb(Assembler::zero, doneRetry);192pause();193decrementl(retry_count_Reg);194jmp(retryLabel);195bind(doneRetry);196}197198// Spin and retry if lock is busy,199// inputs: box_Reg (monitor address)200// : retry_count_Reg201// output: retry_count_Reg decremented by 1202// : clear z flag if retry count exceeded203// tmp_Reg, scr_Reg, flags are killed204void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,205Register tmp_Reg, Register scr_Reg, Label& retryLabel) {206Label SpinLoop, SpinExit, doneRetry;207int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);208209testl(retry_count_Reg, retry_count_Reg);210jccb(Assembler::zero, doneRetry);211decrementl(retry_count_Reg);212movptr(scr_Reg, RTMSpinLoopCount);213214bind(SpinLoop);215pause();216decrementl(scr_Reg);217jccb(Assembler::lessEqual, SpinExit);218movptr(tmp_Reg, Address(box_Reg, owner_offset));219testptr(tmp_Reg, tmp_Reg);220jccb(Assembler::notZero, SpinLoop);221222bind(SpinExit);223jmp(retryLabel);224bind(doneRetry);225incrementl(retry_count_Reg); // clear z flag226}227228// Use RTM for normal stack locks229// Input: objReg (object to lock)230void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,231Register retry_on_abort_count_Reg,232RTMLockingCounters* stack_rtm_counters,233Metadata* method_data, bool profile_rtm,234Label& DONE_LABEL, Label& IsInflated) {235assert(UseRTMForStackLocks, "why call this otherwise?");236assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");237assert(tmpReg == rax, "");238assert(scrReg == rdx, "");239Label L_rtm_retry, L_decrement_retry, L_on_abort;240241if (RTMRetryCount > 0) {242movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort243bind(L_rtm_retry);244}245movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));246testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased247jcc(Assembler::notZero, IsInflated);248249if (PrintPreciseRTMLockingStatistics || profile_rtm) {250Label L_noincrement;251if (RTMTotalCountIncrRate > 1) {252// tmpReg, scrReg and flags are killed253branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);254}255assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");256atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);257bind(L_noincrement);258}259xbegin(L_on_abort);260movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword261andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits262cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked263jcc(Assembler::equal, DONE_LABEL); // all done if unlocked264265Register abort_status_Reg = tmpReg; // status of abort is stored in RAX266if (UseRTMXendForLockBusy) {267xend();268movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)269jmp(L_decrement_retry);270}271else {272xabort(0);273}274bind(L_on_abort);275if (PrintPreciseRTMLockingStatistics || profile_rtm) {276rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);277}278bind(L_decrement_retry);279if (RTMRetryCount > 0) {280// retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)281rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);282}283}284285// Use RTM for inflating locks286// inputs: objReg (object to lock)287// boxReg (on-stack box address (displaced header location) - KILLED)288// tmpReg (ObjectMonitor address + markWord::monitor_value)289void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,290Register scrReg, Register retry_on_busy_count_Reg,291Register retry_on_abort_count_Reg,292RTMLockingCounters* rtm_counters,293Metadata* method_data, bool profile_rtm,294Label& DONE_LABEL) {295assert(UseRTMLocking, "why call this otherwise?");296assert(tmpReg == rax, "");297assert(scrReg == rdx, "");298Label L_rtm_retry, L_decrement_retry, L_on_abort;299int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);300301// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.302movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));303movptr(boxReg, tmpReg); // Save ObjectMonitor address304305if (RTMRetryCount > 0) {306movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy307movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort308bind(L_rtm_retry);309}310if (PrintPreciseRTMLockingStatistics || profile_rtm) {311Label L_noincrement;312if (RTMTotalCountIncrRate > 1) {313// tmpReg, scrReg and flags are killed314branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);315}316assert(rtm_counters != NULL, "should not be NULL when profiling RTM");317atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);318bind(L_noincrement);319}320xbegin(L_on_abort);321movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));322movptr(tmpReg, Address(tmpReg, owner_offset));323testptr(tmpReg, tmpReg);324jcc(Assembler::zero, DONE_LABEL);325if (UseRTMXendForLockBusy) {326xend();327jmp(L_decrement_retry);328}329else {330xabort(0);331}332bind(L_on_abort);333Register abort_status_Reg = tmpReg; // status of abort is stored in RAX334if (PrintPreciseRTMLockingStatistics || profile_rtm) {335rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);336}337if (RTMRetryCount > 0) {338// retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)339rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);340}341342movptr(tmpReg, Address(boxReg, owner_offset)) ;343testptr(tmpReg, tmpReg) ;344jccb(Assembler::notZero, L_decrement_retry) ;345346// Appears unlocked - try to swing _owner from null to non-null.347// Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.348#ifdef _LP64349Register threadReg = r15_thread;350#else351get_thread(scrReg);352Register threadReg = scrReg;353#endif354lock();355cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg356357if (RTMRetryCount > 0) {358// success done else retry359jccb(Assembler::equal, DONE_LABEL) ;360bind(L_decrement_retry);361// Spin and retry if lock is busy.362rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);363}364else {365bind(L_decrement_retry);366}367}368369#endif // INCLUDE_RTM_OPT370371// fast_lock and fast_unlock used by C2372373// Because the transitions from emitted code to the runtime374// monitorenter/exit helper stubs are so slow it's critical that375// we inline both the stack-locking fast path and the inflated fast path.376//377// See also: cmpFastLock and cmpFastUnlock.378//379// What follows is a specialized inline transliteration of the code380// in enter() and exit(). If we're concerned about I$ bloat another381// option would be to emit TrySlowEnter and TrySlowExit methods382// at startup-time. These methods would accept arguments as383// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure384// indications in the icc.ZFlag. fast_lock and fast_unlock would simply385// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.386// In practice, however, the # of lock sites is bounded and is usually small.387// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer388// if the processor uses simple bimodal branch predictors keyed by EIP389// Since the helper routines would be called from multiple synchronization390// sites.391//392// An even better approach would be write "MonitorEnter()" and "MonitorExit()"393// in java - using j.u.c and unsafe - and just bind the lock and unlock sites394// to those specialized methods. That'd give us a mostly platform-independent395// implementation that the JITs could optimize and inline at their pleasure.396// Done correctly, the only time we'd need to cross to native could would be397// to park() or unpark() threads. We'd also need a few more unsafe operators398// to (a) prevent compiler-JIT reordering of non-volatile accesses, and399// (b) explicit barriers or fence operations.400//401// TODO:402//403// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).404// This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.405// Given TLAB allocation, Self is usually manifested in a register, so passing it into406// the lock operators would typically be faster than reifying Self.407//408// * Ideally I'd define the primitives as:409// fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.410// fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED411// Unfortunately ADLC bugs prevent us from expressing the ideal form.412// Instead, we're stuck with a rather awkward and brittle register assignments below.413// Furthermore the register assignments are overconstrained, possibly resulting in414// sub-optimal code near the synchronization site.415//416// * Eliminate the sp-proximity tests and just use "== Self" tests instead.417// Alternately, use a better sp-proximity test.418//419// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.420// Either one is sufficient to uniquely identify a thread.421// TODO: eliminate use of sp in _owner and use get_thread(tr) instead.422//423// * Intrinsify notify() and notifyAll() for the common cases where the424// object is locked by the calling thread but the waitlist is empty.425// avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().426//427// * use jccb and jmpb instead of jcc and jmp to improve code density.428// But beware of excessive branch density on AMD Opterons.429//430// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success431// or failure of the fast path. If the fast path fails then we pass432// control to the slow path, typically in C. In fast_lock and433// fast_unlock we often branch to DONE_LABEL, just to find that C2434// will emit a conditional branch immediately after the node.435// So we have branches to branches and lots of ICC.ZF games.436// Instead, it might be better to have C2 pass a "FailureLabel"437// into fast_lock and fast_unlock. In the case of success, control438// will drop through the node. ICC.ZF is undefined at exit.439// In the case of failure, the node will branch directly to the440// FailureLabel441442443// obj: object to lock444// box: on-stack box address (displaced header location) - KILLED445// rax,: tmp -- KILLED446// scr: tmp -- KILLED447void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,448Register scrReg, Register cx1Reg, Register cx2Reg,449BiasedLockingCounters* counters,450RTMLockingCounters* rtm_counters,451RTMLockingCounters* stack_rtm_counters,452Metadata* method_data,453bool use_rtm, bool profile_rtm) {454// Ensure the register assignments are disjoint455assert(tmpReg == rax, "");456457if (use_rtm) {458assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);459} else {460assert(cx2Reg == noreg, "");461assert_different_registers(objReg, boxReg, tmpReg, scrReg);462}463464if (counters != NULL) {465atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);466}467468// Possible cases that we'll encounter in fast_lock469// ------------------------------------------------470// * Inflated471// -- unlocked472// -- Locked473// = by self474// = by other475// * biased476// -- by Self477// -- by other478// * neutral479// * stack-locked480// -- by self481// = sp-proximity test hits482// = sp-proximity test generates false-negative483// -- by other484//485486Label IsInflated, DONE_LABEL;487488if (DiagnoseSyncOnValueBasedClasses != 0) {489load_klass(tmpReg, objReg, cx1Reg);490movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));491testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);492jcc(Assembler::notZero, DONE_LABEL);493}494495// it's stack-locked, biased or neutral496// TODO: optimize away redundant LDs of obj->mark and improve the markword triage497// order to reduce the number of conditional branches in the most common cases.498// Beware -- there's a subtle invariant that fetch of the markword499// at [FETCH], below, will never observe a biased encoding (*101b).500// If this invariant is not held we risk exclusion (safety) failure.501if (UseBiasedLocking && !UseOptoBiasInlining) {502biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);503}504505#if INCLUDE_RTM_OPT506if (UseRTMForStackLocks && use_rtm) {507rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,508stack_rtm_counters, method_data, profile_rtm,509DONE_LABEL, IsInflated);510}511#endif // INCLUDE_RTM_OPT512513movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]514testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased515jccb(Assembler::notZero, IsInflated);516517// Attempt stack-locking ...518orptr (tmpReg, markWord::unlocked_value);519movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS520lock();521cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg522if (counters != NULL) {523cond_inc32(Assembler::equal,524ExternalAddress((address)counters->fast_path_entry_count_addr()));525}526jcc(Assembler::equal, DONE_LABEL); // Success527528// Recursive locking.529// The object is stack-locked: markword contains stack pointer to BasicLock.530// Locked by current thread if difference with current SP is less than one page.531subptr(tmpReg, rsp);532// Next instruction set ZFlag == 1 (Success) if difference is less then one page.533andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );534movptr(Address(boxReg, 0), tmpReg);535if (counters != NULL) {536cond_inc32(Assembler::equal,537ExternalAddress((address)counters->fast_path_entry_count_addr()));538}539jmp(DONE_LABEL);540541bind(IsInflated);542// The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value543544#if INCLUDE_RTM_OPT545// Use the same RTM locking code in 32- and 64-bit VM.546if (use_rtm) {547rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,548rtm_counters, method_data, profile_rtm, DONE_LABEL);549} else {550#endif // INCLUDE_RTM_OPT551552#ifndef _LP64553// The object is inflated.554555// boxReg refers to the on-stack BasicLock in the current frame.556// We'd like to write:557// set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.558// This is convenient but results a ST-before-CAS penalty. The following CAS suffers559// additional latency as we have another ST in the store buffer that must drain.560561// avoid ST-before-CAS562// register juggle because we need tmpReg for cmpxchgptr below563movptr(scrReg, boxReg);564movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]565566// Optimistic form: consider XORL tmpReg,tmpReg567movptr(tmpReg, NULL_WORD);568569// Appears unlocked - try to swing _owner from null to non-null.570// Ideally, I'd manifest "Self" with get_thread and then attempt571// to CAS the register containing Self into m->Owner.572// But we don't have enough registers, so instead we can either try to CAS573// rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds574// we later store "Self" into m->Owner. Transiently storing a stack address575// (rsp or the address of the box) into m->owner is harmless.576// Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.577lock();578cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));579movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3580// If we weren't able to swing _owner from NULL to the BasicLock581// then take the slow path.582jccb (Assembler::notZero, DONE_LABEL);583// update _owner from BasicLock to thread584get_thread (scrReg); // beware: clobbers ICCs585movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);586xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success587588// If the CAS fails we can either retry or pass control to the slow path.589// We use the latter tactic.590// Pass the CAS result in the icc.ZFlag into DONE_LABEL591// If the CAS was successful ...592// Self has acquired the lock593// Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.594// Intentional fall-through into DONE_LABEL ...595#else // _LP64596// It's inflated and we use scrReg for ObjectMonitor* in this section.597movq(scrReg, tmpReg);598xorq(tmpReg, tmpReg);599lock();600cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));601// Unconditionally set box->_displaced_header = markWord::unused_mark().602// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.603movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));604// Propagate ICC.ZF from CAS above into DONE_LABEL.605jcc(Assembler::equal, DONE_LABEL); // CAS above succeeded; propagate ZF = 1 (success)606607cmpptr(r15_thread, rax); // Check if we are already the owner (recursive lock)608jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)609incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));610xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success611#endif // _LP64612#if INCLUDE_RTM_OPT613} // use_rtm()614#endif615// DONE_LABEL is a hot target - we'd really like to place it at the616// start of cache line by padding with NOPs.617// See the AMD and Intel software optimization manuals for the618// most efficient "long" NOP encodings.619// Unfortunately none of our alignment mechanisms suffice.620bind(DONE_LABEL);621622// At DONE_LABEL the icc ZFlag is set as follows ...623// fast_unlock uses the same protocol.624// ZFlag == 1 -> Success625// ZFlag == 0 -> Failure - force control through the slow path626}627628// obj: object to unlock629// box: box address (displaced header location), killed. Must be EAX.630// tmp: killed, cannot be obj nor box.631//632// Some commentary on balanced locking:633//634// fast_lock and fast_unlock are emitted only for provably balanced lock sites.635// Methods that don't have provably balanced locking are forced to run in the636// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.637// The interpreter provides two properties:638// I1: At return-time the interpreter automatically and quietly unlocks any639// objects acquired the current activation (frame). Recall that the640// interpreter maintains an on-stack list of locks currently held by641// a frame.642// I2: If a method attempts to unlock an object that is not held by the643// the frame the interpreter throws IMSX.644//645// Lets say A(), which has provably balanced locking, acquires O and then calls B().646// B() doesn't have provably balanced locking so it runs in the interpreter.647// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O648// is still locked by A().649//650// The only other source of unbalanced locking would be JNI. The "Java Native Interface:651// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter652// should not be unlocked by "normal" java-level locking and vice-versa. The specification653// doesn't specify what will occur if a program engages in such mixed-mode locking, however.654// Arguably given that the spec legislates the JNI case as undefined our implementation655// could reasonably *avoid* checking owner in fast_unlock().656// In the interest of performance we elide m->Owner==Self check in unlock.657// A perfectly viable alternative is to elide the owner check except when658// Xcheck:jni is enabled.659660void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {661assert(boxReg == rax, "");662assert_different_registers(objReg, boxReg, tmpReg);663664Label DONE_LABEL, Stacked, CheckSucc;665666// Critically, the biased locking test must have precedence over667// and appear before the (box->dhw == 0) recursive stack-lock test.668if (UseBiasedLocking && !UseOptoBiasInlining) {669biased_locking_exit(objReg, tmpReg, DONE_LABEL);670}671672#if INCLUDE_RTM_OPT673if (UseRTMForStackLocks && use_rtm) {674assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");675Label L_regular_unlock;676movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword677andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits678cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked679jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock680xend(); // otherwise end...681jmp(DONE_LABEL); // ... and we're done682bind(L_regular_unlock);683}684#endif685686cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header687jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock688movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword689testptr(tmpReg, markWord::monitor_value); // Inflated?690jccb (Assembler::zero, Stacked);691692// It's inflated.693#if INCLUDE_RTM_OPT694if (use_rtm) {695Label L_regular_inflated_unlock;696int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);697movptr(boxReg, Address(tmpReg, owner_offset));698testptr(boxReg, boxReg);699jccb(Assembler::notZero, L_regular_inflated_unlock);700xend();701jmpb(DONE_LABEL);702bind(L_regular_inflated_unlock);703}704#endif705706// Despite our balanced locking property we still check that m->_owner == Self707// as java routines or native JNI code called by this thread might708// have released the lock.709// Refer to the comments in synchronizer.cpp for how we might encode extra710// state in _succ so we can avoid fetching EntryList|cxq.711//712// If there's no contention try a 1-0 exit. That is, exit without713// a costly MEMBAR or CAS. See synchronizer.cpp for details on how714// we detect and recover from the race that the 1-0 exit admits.715//716// Conceptually fast_unlock() must execute a STST|LDST "release" barrier717// before it STs null into _owner, releasing the lock. Updates718// to data protected by the critical section must be visible before719// we drop the lock (and thus before any other thread could acquire720// the lock and observe the fields protected by the lock).721// IA32's memory-model is SPO, so STs are ordered with respect to722// each other and there's no need for an explicit barrier (fence).723// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.724#ifndef _LP64725get_thread (boxReg);726727// Note that we could employ various encoding schemes to reduce728// the number of loads below (currently 4) to just 2 or 3.729// Refer to the comments in synchronizer.cpp.730// In practice the chain of fetches doesn't seem to impact performance, however.731xorptr(boxReg, boxReg);732orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));733jccb (Assembler::notZero, DONE_LABEL);734movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));735orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));736jccb (Assembler::notZero, CheckSucc);737movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);738jmpb (DONE_LABEL);739740bind (Stacked);741// It's not inflated and it's not recursively stack-locked and it's not biased.742// It must be stack-locked.743// Try to reset the header to displaced header.744// The "box" value on the stack is stable, so we can reload745// and be assured we observe the same value as above.746movptr(tmpReg, Address(boxReg, 0));747lock();748cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box749// Intention fall-thru into DONE_LABEL750751// DONE_LABEL is a hot target - we'd really like to place it at the752// start of cache line by padding with NOPs.753// See the AMD and Intel software optimization manuals for the754// most efficient "long" NOP encodings.755// Unfortunately none of our alignment mechanisms suffice.756bind (CheckSucc);757#else // _LP64758// It's inflated759Label LNotRecursive, LSuccess, LGoSlowPath;760761cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);762jccb(Assembler::equal, LNotRecursive);763764// Recursive inflated unlock765decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));766jmpb(LSuccess);767768bind(LNotRecursive);769movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));770orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));771jccb (Assembler::notZero, CheckSucc);772// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.773movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);774jmpb (DONE_LABEL);775776// Try to avoid passing control into the slow_path ...777bind (CheckSucc);778779// The following optional optimization can be elided if necessary780// Effectively: if (succ == null) goto slow path781// The code reduces the window for a race, however,782// and thus benefits performance.783cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);784jccb (Assembler::zero, LGoSlowPath);785786xorptr(boxReg, boxReg);787// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.788movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);789790// Memory barrier/fence791// Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ792// Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.793// This is faster on Nehalem and AMD Shanghai/Barcelona.794// See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences795// We might also restructure (ST Owner=0;barrier;LD _Succ) to796// (mov box,0; xchgq box, &m->Owner; LD _succ) .797lock(); addl(Address(rsp, 0), 0);798799cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);800jccb (Assembler::notZero, LSuccess);801802// Rare inopportune interleaving - race.803// The successor vanished in the small window above.804// The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.805// We need to ensure progress and succession.806// Try to reacquire the lock.807// If that fails then the new owner is responsible for succession and this808// thread needs to take no further action and can exit via the fast path (success).809// If the re-acquire succeeds then pass control into the slow path.810// As implemented, this latter mode is horrible because we generated more811// coherence traffic on the lock *and* artifically extended the critical section812// length while by virtue of passing control into the slow path.813814// box is really RAX -- the following CMPXCHG depends on that binding815// cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)816lock();817cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));818// There's no successor so we tried to regrab the lock.819// If that didn't work, then another thread grabbed the820// lock so we're done (and exit was a success).821jccb (Assembler::notEqual, LSuccess);822// Intentional fall-through into slow path823824bind (LGoSlowPath);825orl (boxReg, 1); // set ICC.ZF=0 to indicate failure826jmpb (DONE_LABEL);827828bind (LSuccess);829testl (boxReg, 0); // set ICC.ZF=1 to indicate success830jmpb (DONE_LABEL);831832bind (Stacked);833movptr(tmpReg, Address (boxReg, 0)); // re-fetch834lock();835cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box836837#endif838bind(DONE_LABEL);839}840841//-------------------------------------------------------------------------------------------842// Generic instructions support for use in .ad files C2 code generation843844void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {845if (dst != src) {846movdqu(dst, src);847}848if (opcode == Op_AbsVD) {849andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);850} else {851assert((opcode == Op_NegVD),"opcode should be Op_NegD");852xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);853}854}855856void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {857if (opcode == Op_AbsVD) {858vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);859} else {860assert((opcode == Op_NegVD),"opcode should be Op_NegD");861vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);862}863}864865void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {866if (dst != src) {867movdqu(dst, src);868}869if (opcode == Op_AbsVF) {870andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);871} else {872assert((opcode == Op_NegVF),"opcode should be Op_NegF");873xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);874}875}876877void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {878if (opcode == Op_AbsVF) {879vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);880} else {881assert((opcode == Op_NegVF),"opcode should be Op_NegF");882vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);883}884}885886void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {887assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");888assert(tmp == xnoreg || elem_bt == T_LONG, "unused");889890if (opcode == Op_MinV) {891if (elem_bt == T_BYTE) {892pminsb(dst, src);893} else if (elem_bt == T_SHORT) {894pminsw(dst, src);895} else if (elem_bt == T_INT) {896pminsd(dst, src);897} else {898assert(elem_bt == T_LONG, "required");899assert(tmp == xmm0, "required");900assert_different_registers(dst, src, tmp);901movdqu(xmm0, dst);902pcmpgtq(xmm0, src);903blendvpd(dst, src); // xmm0 as mask904}905} else { // opcode == Op_MaxV906if (elem_bt == T_BYTE) {907pmaxsb(dst, src);908} else if (elem_bt == T_SHORT) {909pmaxsw(dst, src);910} else if (elem_bt == T_INT) {911pmaxsd(dst, src);912} else {913assert(elem_bt == T_LONG, "required");914assert(tmp == xmm0, "required");915assert_different_registers(dst, src, tmp);916movdqu(xmm0, src);917pcmpgtq(xmm0, dst);918blendvpd(dst, src); // xmm0 as mask919}920}921}922923void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,924XMMRegister dst, XMMRegister src1, XMMRegister src2,925int vlen_enc) {926assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");927928if (opcode == Op_MinV) {929if (elem_bt == T_BYTE) {930vpminsb(dst, src1, src2, vlen_enc);931} else if (elem_bt == T_SHORT) {932vpminsw(dst, src1, src2, vlen_enc);933} else if (elem_bt == T_INT) {934vpminsd(dst, src1, src2, vlen_enc);935} else {936assert(elem_bt == T_LONG, "required");937if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {938vpminsq(dst, src1, src2, vlen_enc);939} else {940assert_different_registers(dst, src1, src2);941vpcmpgtq(dst, src1, src2, vlen_enc);942vblendvpd(dst, src1, src2, dst, vlen_enc);943}944}945} else { // opcode == Op_MaxV946if (elem_bt == T_BYTE) {947vpmaxsb(dst, src1, src2, vlen_enc);948} else if (elem_bt == T_SHORT) {949vpmaxsw(dst, src1, src2, vlen_enc);950} else if (elem_bt == T_INT) {951vpmaxsd(dst, src1, src2, vlen_enc);952} else {953assert(elem_bt == T_LONG, "required");954if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {955vpmaxsq(dst, src1, src2, vlen_enc);956} else {957assert_different_registers(dst, src1, src2);958vpcmpgtq(dst, src1, src2, vlen_enc);959vblendvpd(dst, src2, src1, dst, vlen_enc);960}961}962}963}964965// Float/Double min max966967void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,968XMMRegister dst, XMMRegister a, XMMRegister b,969XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,970int vlen_enc) {971assert(UseAVX > 0, "required");972assert(opcode == Op_MinV || opcode == Op_MinReductionV ||973opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");974assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");975assert_different_registers(a, b, tmp, atmp, btmp);976977bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);978bool is_double_word = is_double_word_type(elem_bt);979980if (!is_double_word && is_min) {981vblendvps(atmp, a, b, a, vlen_enc);982vblendvps(btmp, b, a, a, vlen_enc);983vminps(tmp, atmp, btmp, vlen_enc);984vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);985vblendvps(dst, tmp, atmp, btmp, vlen_enc);986} else if (!is_double_word && !is_min) {987vblendvps(btmp, b, a, b, vlen_enc);988vblendvps(atmp, a, b, b, vlen_enc);989vmaxps(tmp, atmp, btmp, vlen_enc);990vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);991vblendvps(dst, tmp, atmp, btmp, vlen_enc);992} else if (is_double_word && is_min) {993vblendvpd(atmp, a, b, a, vlen_enc);994vblendvpd(btmp, b, a, a, vlen_enc);995vminpd(tmp, atmp, btmp, vlen_enc);996vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);997vblendvpd(dst, tmp, atmp, btmp, vlen_enc);998} else {999assert(is_double_word && !is_min, "sanity");1000vblendvpd(btmp, b, a, b, vlen_enc);1001vblendvpd(atmp, a, b, b, vlen_enc);1002vmaxpd(tmp, atmp, btmp, vlen_enc);1003vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);1004vblendvpd(dst, tmp, atmp, btmp, vlen_enc);1005}1006}10071008void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,1009XMMRegister dst, XMMRegister a, XMMRegister b,1010KRegister ktmp, XMMRegister atmp, XMMRegister btmp,1011int vlen_enc) {1012assert(UseAVX > 2, "required");1013assert(opcode == Op_MinV || opcode == Op_MinReductionV ||1014opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");1015assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");1016assert_different_registers(dst, a, b, atmp, btmp);10171018bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);1019bool is_double_word = is_double_word_type(elem_bt);1020bool merge = true;10211022if (!is_double_word && is_min) {1023evpmovd2m(ktmp, a, vlen_enc);1024evblendmps(atmp, ktmp, a, b, merge, vlen_enc);1025evblendmps(btmp, ktmp, b, a, merge, vlen_enc);1026vminps(dst, atmp, btmp, vlen_enc);1027evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);1028evmovdqul(dst, ktmp, atmp, merge, vlen_enc);1029} else if (!is_double_word && !is_min) {1030evpmovd2m(ktmp, b, vlen_enc);1031evblendmps(atmp, ktmp, a, b, merge, vlen_enc);1032evblendmps(btmp, ktmp, b, a, merge, vlen_enc);1033vmaxps(dst, atmp, btmp, vlen_enc);1034evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);1035evmovdqul(dst, ktmp, atmp, merge, vlen_enc);1036} else if (is_double_word && is_min) {1037evpmovq2m(ktmp, a, vlen_enc);1038evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);1039evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);1040vminpd(dst, atmp, btmp, vlen_enc);1041evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);1042evmovdquq(dst, ktmp, atmp, merge, vlen_enc);1043} else {1044assert(is_double_word && !is_min, "sanity");1045evpmovq2m(ktmp, b, vlen_enc);1046evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);1047evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);1048vmaxpd(dst, atmp, btmp, vlen_enc);1049evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);1050evmovdquq(dst, ktmp, atmp, merge, vlen_enc);1051}1052}10531054// Float/Double signum1055void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,1056XMMRegister zero, XMMRegister one,1057Register scratch) {1058assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");10591060Label DONE_LABEL;10611062if (opcode == Op_SignumF) {1063assert(UseSSE > 0, "required");1064ucomiss(dst, zero);1065jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument1066jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN1067movflt(dst, one);1068jcc(Assembler::above, DONE_LABEL);1069xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);1070} else if (opcode == Op_SignumD) {1071assert(UseSSE > 1, "required");1072ucomisd(dst, zero);1073jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument1074jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN1075movdbl(dst, one);1076jcc(Assembler::above, DONE_LABEL);1077xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);1078}10791080bind(DONE_LABEL);1081}10821083void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {1084if (sign) {1085pmovsxbw(dst, src);1086} else {1087pmovzxbw(dst, src);1088}1089}10901091void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {1092if (sign) {1093vpmovsxbw(dst, src, vector_len);1094} else {1095vpmovzxbw(dst, src, vector_len);1096}1097}10981099void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {1100if (sign) {1101vpmovsxbd(dst, src, vector_len);1102} else {1103vpmovzxbd(dst, src, vector_len);1104}1105}11061107void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {1108if (sign) {1109vpmovsxwd(dst, src, vector_len);1110} else {1111vpmovzxwd(dst, src, vector_len);1112}1113}11141115void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,1116int shift, int vector_len) {1117if (opcode == Op_RotateLeftV) {1118if (etype == T_INT) {1119evprold(dst, src, shift, vector_len);1120} else {1121assert(etype == T_LONG, "expected type T_LONG");1122evprolq(dst, src, shift, vector_len);1123}1124} else {1125assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");1126if (etype == T_INT) {1127evprord(dst, src, shift, vector_len);1128} else {1129assert(etype == T_LONG, "expected type T_LONG");1130evprorq(dst, src, shift, vector_len);1131}1132}1133}11341135void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,1136XMMRegister shift, int vector_len) {1137if (opcode == Op_RotateLeftV) {1138if (etype == T_INT) {1139evprolvd(dst, src, shift, vector_len);1140} else {1141assert(etype == T_LONG, "expected type T_LONG");1142evprolvq(dst, src, shift, vector_len);1143}1144} else {1145assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");1146if (etype == T_INT) {1147evprorvd(dst, src, shift, vector_len);1148} else {1149assert(etype == T_LONG, "expected type T_LONG");1150evprorvq(dst, src, shift, vector_len);1151}1152}1153}11541155void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {1156if (opcode == Op_RShiftVI) {1157psrad(dst, shift);1158} else if (opcode == Op_LShiftVI) {1159pslld(dst, shift);1160} else {1161assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");1162psrld(dst, shift);1163}1164}11651166void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {1167switch (opcode) {1168case Op_RShiftVI: psrad(dst, shift); break;1169case Op_LShiftVI: pslld(dst, shift); break;1170case Op_URShiftVI: psrld(dst, shift); break;11711172default: assert(false, "%s", NodeClassNames[opcode]);1173}1174}11751176void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {1177if (opcode == Op_RShiftVI) {1178vpsrad(dst, nds, shift, vector_len);1179} else if (opcode == Op_LShiftVI) {1180vpslld(dst, nds, shift, vector_len);1181} else {1182assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");1183vpsrld(dst, nds, shift, vector_len);1184}1185}11861187void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {1188switch (opcode) {1189case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;1190case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;1191case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;11921193default: assert(false, "%s", NodeClassNames[opcode]);1194}1195}11961197void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {1198switch (opcode) {1199case Op_RShiftVB: // fall-through1200case Op_RShiftVS: psraw(dst, shift); break;12011202case Op_LShiftVB: // fall-through1203case Op_LShiftVS: psllw(dst, shift); break;12041205case Op_URShiftVS: // fall-through1206case Op_URShiftVB: psrlw(dst, shift); break;12071208default: assert(false, "%s", NodeClassNames[opcode]);1209}1210}12111212void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {1213switch (opcode) {1214case Op_RShiftVB: // fall-through1215case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;12161217case Op_LShiftVB: // fall-through1218case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;12191220case Op_URShiftVS: // fall-through1221case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;12221223default: assert(false, "%s", NodeClassNames[opcode]);1224}1225}12261227void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {1228switch (opcode) {1229case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems1230case Op_LShiftVL: psllq(dst, shift); break;1231case Op_URShiftVL: psrlq(dst, shift); break;12321233default: assert(false, "%s", NodeClassNames[opcode]);1234}1235}12361237void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {1238if (opcode == Op_RShiftVL) {1239psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems1240} else if (opcode == Op_LShiftVL) {1241psllq(dst, shift);1242} else {1243assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");1244psrlq(dst, shift);1245}1246}12471248void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {1249switch (opcode) {1250case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;1251case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;1252case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;12531254default: assert(false, "%s", NodeClassNames[opcode]);1255}1256}12571258void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {1259if (opcode == Op_RShiftVL) {1260evpsraq(dst, nds, shift, vector_len);1261} else if (opcode == Op_LShiftVL) {1262vpsllq(dst, nds, shift, vector_len);1263} else {1264assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");1265vpsrlq(dst, nds, shift, vector_len);1266}1267}12681269void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {1270switch (opcode) {1271case Op_RShiftVB: // fall-through1272case Op_RShiftVS: // fall-through1273case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;12741275case Op_LShiftVB: // fall-through1276case Op_LShiftVS: // fall-through1277case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;12781279case Op_URShiftVB: // fall-through1280case Op_URShiftVS: // fall-through1281case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;12821283default: assert(false, "%s", NodeClassNames[opcode]);1284}1285}12861287void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {1288switch (opcode) {1289case Op_RShiftVB: // fall-through1290case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;12911292case Op_LShiftVB: // fall-through1293case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;12941295case Op_URShiftVB: // fall-through1296case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;12971298default: assert(false, "%s", NodeClassNames[opcode]);1299}1300}13011302void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {1303assert(UseAVX >= 2, "required");1304switch (opcode) {1305case Op_RShiftVL: {1306if (UseAVX > 2) {1307assert(tmp == xnoreg, "not used");1308if (!VM_Version::supports_avx512vl()) {1309vlen_enc = Assembler::AVX_512bit;1310}1311evpsravq(dst, src, shift, vlen_enc);1312} else {1313vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));1314vpsrlvq(dst, src, shift, vlen_enc);1315vpsrlvq(tmp, tmp, shift, vlen_enc);1316vpxor(dst, dst, tmp, vlen_enc);1317vpsubq(dst, dst, tmp, vlen_enc);1318}1319break;1320}1321case Op_LShiftVL: {1322assert(tmp == xnoreg, "not used");1323vpsllvq(dst, src, shift, vlen_enc);1324break;1325}1326case Op_URShiftVL: {1327assert(tmp == xnoreg, "not used");1328vpsrlvq(dst, src, shift, vlen_enc);1329break;1330}1331default: assert(false, "%s", NodeClassNames[opcode]);1332}1333}13341335// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst1336void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {1337assert(opcode == Op_LShiftVB ||1338opcode == Op_RShiftVB ||1339opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);1340bool sign = (opcode != Op_URShiftVB);1341assert(vector_len == 0, "required");1342vextendbd(sign, dst, src, 1);1343vpmovzxbd(vtmp, shift, 1);1344varshiftd(opcode, dst, dst, vtmp, 1);1345vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);1346vextracti128_high(vtmp, dst);1347vpackusdw(dst, dst, vtmp, 0);1348}13491350// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst1351void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {1352assert(opcode == Op_LShiftVB ||1353opcode == Op_RShiftVB ||1354opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);1355bool sign = (opcode != Op_URShiftVB);1356int ext_vector_len = vector_len + 1;1357vextendbw(sign, dst, src, ext_vector_len);1358vpmovzxbw(vtmp, shift, ext_vector_len);1359varshiftw(opcode, dst, dst, vtmp, ext_vector_len);1360vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);1361if (vector_len == 0) {1362vextracti128_high(vtmp, dst);1363vpackuswb(dst, dst, vtmp, vector_len);1364} else {1365vextracti64x4_high(vtmp, dst);1366vpackuswb(dst, dst, vtmp, vector_len);1367vpermq(dst, dst, 0xD8, vector_len);1368}1369}13701371void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {1372switch(typ) {1373case T_BYTE:1374pinsrb(dst, val, idx);1375break;1376case T_SHORT:1377pinsrw(dst, val, idx);1378break;1379case T_INT:1380pinsrd(dst, val, idx);1381break;1382case T_LONG:1383pinsrq(dst, val, idx);1384break;1385default:1386assert(false,"Should not reach here.");1387break;1388}1389}13901391void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {1392switch(typ) {1393case T_BYTE:1394vpinsrb(dst, src, val, idx);1395break;1396case T_SHORT:1397vpinsrw(dst, src, val, idx);1398break;1399case T_INT:1400vpinsrd(dst, src, val, idx);1401break;1402case T_LONG:1403vpinsrq(dst, src, val, idx);1404break;1405default:1406assert(false,"Should not reach here.");1407break;1408}1409}14101411void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {1412switch(typ) {1413case T_INT:1414vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);1415break;1416case T_FLOAT:1417vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);1418break;1419case T_LONG:1420vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);1421break;1422case T_DOUBLE:1423vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);1424break;1425default:1426assert(false,"Should not reach here.");1427break;1428}1429}14301431void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {1432switch(typ) {1433case T_INT:1434evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);1435break;1436case T_FLOAT:1437evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);1438break;1439case T_LONG:1440evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);1441break;1442case T_DOUBLE:1443evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);1444break;1445default:1446assert(false,"Should not reach here.");1447break;1448}1449}14501451void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {1452switch(typ) {1453case T_INT:1454evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);1455break;1456case T_FLOAT:1457evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);1458break;1459case T_LONG:1460evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);1461break;1462case T_DOUBLE:1463evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);1464break;1465default:1466assert(false,"Should not reach here.");1467break;1468}1469}14701471void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {1472if (vlen_in_bytes <= 16) {1473pxor (dst, dst);1474psubb(dst, src);1475switch (elem_bt) {1476case T_BYTE: /* nothing to do */ break;1477case T_SHORT: pmovsxbw(dst, dst); break;1478case T_INT: pmovsxbd(dst, dst); break;1479case T_FLOAT: pmovsxbd(dst, dst); break;1480case T_LONG: pmovsxbq(dst, dst); break;1481case T_DOUBLE: pmovsxbq(dst, dst); break;14821483default: assert(false, "%s", type2name(elem_bt));1484}1485} else {1486assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");1487int vlen_enc = vector_length_encoding(vlen_in_bytes);14881489vpxor (dst, dst, dst, vlen_enc);1490vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);14911492switch (elem_bt) {1493case T_BYTE: /* nothing to do */ break;1494case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;1495case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;1496case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;1497case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;1498case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;14991500default: assert(false, "%s", type2name(elem_bt));1501}1502}1503}15041505void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {1506ExternalAddress addr(StubRoutines::x86::vector_iota_indices());1507if (vlen_in_bytes == 4) {1508movdl(dst, addr);1509} else if (vlen_in_bytes == 8) {1510movq(dst, addr);1511} else if (vlen_in_bytes == 16) {1512movdqu(dst, addr, scratch);1513} else if (vlen_in_bytes == 32) {1514vmovdqu(dst, addr, scratch);1515} else {1516assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);1517evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);1518}1519}15201521// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.15221523void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {1524int vector_len = Assembler::AVX_128bit;15251526switch (opcode) {1527case Op_AndReductionV: pand(dst, src); break;1528case Op_OrReductionV: por (dst, src); break;1529case Op_XorReductionV: pxor(dst, src); break;1530case Op_MinReductionV:1531switch (typ) {1532case T_BYTE: pminsb(dst, src); break;1533case T_SHORT: pminsw(dst, src); break;1534case T_INT: pminsd(dst, src); break;1535case T_LONG: assert(UseAVX > 2, "required");1536vpminsq(dst, dst, src, Assembler::AVX_128bit); break;1537default: assert(false, "wrong type");1538}1539break;1540case Op_MaxReductionV:1541switch (typ) {1542case T_BYTE: pmaxsb(dst, src); break;1543case T_SHORT: pmaxsw(dst, src); break;1544case T_INT: pmaxsd(dst, src); break;1545case T_LONG: assert(UseAVX > 2, "required");1546vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;1547default: assert(false, "wrong type");1548}1549break;1550case Op_AddReductionVF: addss(dst, src); break;1551case Op_AddReductionVD: addsd(dst, src); break;1552case Op_AddReductionVI:1553switch (typ) {1554case T_BYTE: paddb(dst, src); break;1555case T_SHORT: paddw(dst, src); break;1556case T_INT: paddd(dst, src); break;1557default: assert(false, "wrong type");1558}1559break;1560case Op_AddReductionVL: paddq(dst, src); break;1561case Op_MulReductionVF: mulss(dst, src); break;1562case Op_MulReductionVD: mulsd(dst, src); break;1563case Op_MulReductionVI:1564switch (typ) {1565case T_SHORT: pmullw(dst, src); break;1566case T_INT: pmulld(dst, src); break;1567default: assert(false, "wrong type");1568}1569break;1570case Op_MulReductionVL: assert(UseAVX > 2, "required");1571vpmullq(dst, dst, src, vector_len); break;1572default: assert(false, "wrong opcode");1573}1574}15751576void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {1577int vector_len = Assembler::AVX_256bit;15781579switch (opcode) {1580case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;1581case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;1582case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;1583case Op_MinReductionV:1584switch (typ) {1585case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;1586case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;1587case T_INT: vpminsd(dst, src1, src2, vector_len); break;1588case T_LONG: assert(UseAVX > 2, "required");1589vpminsq(dst, src1, src2, vector_len); break;1590default: assert(false, "wrong type");1591}1592break;1593case Op_MaxReductionV:1594switch (typ) {1595case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;1596case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;1597case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;1598case T_LONG: assert(UseAVX > 2, "required");1599vpmaxsq(dst, src1, src2, vector_len); break;1600default: assert(false, "wrong type");1601}1602break;1603case Op_AddReductionVI:1604switch (typ) {1605case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;1606case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;1607case T_INT: vpaddd(dst, src1, src2, vector_len); break;1608default: assert(false, "wrong type");1609}1610break;1611case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;1612case Op_MulReductionVI:1613switch (typ) {1614case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;1615case T_INT: vpmulld(dst, src1, src2, vector_len); break;1616default: assert(false, "wrong type");1617}1618break;1619case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;1620default: assert(false, "wrong opcode");1621}1622}16231624void C2_MacroAssembler::reduce_fp(int opcode, int vlen,1625XMMRegister dst, XMMRegister src,1626XMMRegister vtmp1, XMMRegister vtmp2) {1627switch (opcode) {1628case Op_AddReductionVF:1629case Op_MulReductionVF:1630reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);1631break;16321633case Op_AddReductionVD:1634case Op_MulReductionVD:1635reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);1636break;16371638default: assert(false, "wrong opcode");1639}1640}16411642void C2_MacroAssembler::reduceB(int opcode, int vlen,1643Register dst, Register src1, XMMRegister src2,1644XMMRegister vtmp1, XMMRegister vtmp2) {1645switch (vlen) {1646case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;1647case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1648case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1649case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;16501651default: assert(false, "wrong vector length");1652}1653}16541655void C2_MacroAssembler::mulreduceB(int opcode, int vlen,1656Register dst, Register src1, XMMRegister src2,1657XMMRegister vtmp1, XMMRegister vtmp2) {1658switch (vlen) {1659case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;1660case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1661case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1662case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;16631664default: assert(false, "wrong vector length");1665}1666}16671668void C2_MacroAssembler::reduceS(int opcode, int vlen,1669Register dst, Register src1, XMMRegister src2,1670XMMRegister vtmp1, XMMRegister vtmp2) {1671switch (vlen) {1672case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;1673case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;1674case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;1675case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;16761677default: assert(false, "wrong vector length");1678}1679}16801681void C2_MacroAssembler::reduceI(int opcode, int vlen,1682Register dst, Register src1, XMMRegister src2,1683XMMRegister vtmp1, XMMRegister vtmp2) {1684switch (vlen) {1685case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1686case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1687case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1688case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;16891690default: assert(false, "wrong vector length");1691}1692}16931694#ifdef _LP641695void C2_MacroAssembler::reduceL(int opcode, int vlen,1696Register dst, Register src1, XMMRegister src2,1697XMMRegister vtmp1, XMMRegister vtmp2) {1698switch (vlen) {1699case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;1700case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;1701case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;17021703default: assert(false, "wrong vector length");1704}1705}1706#endif // _LP6417071708void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1709switch (vlen) {1710case 2:1711assert(vtmp2 == xnoreg, "");1712reduce2F(opcode, dst, src, vtmp1);1713break;1714case 4:1715assert(vtmp2 == xnoreg, "");1716reduce4F(opcode, dst, src, vtmp1);1717break;1718case 8:1719reduce8F(opcode, dst, src, vtmp1, vtmp2);1720break;1721case 16:1722reduce16F(opcode, dst, src, vtmp1, vtmp2);1723break;1724default: assert(false, "wrong vector length");1725}1726}17271728void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1729switch (vlen) {1730case 2:1731assert(vtmp2 == xnoreg, "");1732reduce2D(opcode, dst, src, vtmp1);1733break;1734case 4:1735reduce4D(opcode, dst, src, vtmp1, vtmp2);1736break;1737case 8:1738reduce8D(opcode, dst, src, vtmp1, vtmp2);1739break;1740default: assert(false, "wrong vector length");1741}1742}17431744void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1745if (opcode == Op_AddReductionVI) {1746if (vtmp1 != src2) {1747movdqu(vtmp1, src2);1748}1749phaddd(vtmp1, vtmp1);1750} else {1751pshufd(vtmp1, src2, 0x1);1752reduce_operation_128(T_INT, opcode, vtmp1, src2);1753}1754movdl(vtmp2, src1);1755reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);1756movdl(dst, vtmp1);1757}17581759void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1760if (opcode == Op_AddReductionVI) {1761if (vtmp1 != src2) {1762movdqu(vtmp1, src2);1763}1764phaddd(vtmp1, src2);1765reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1766} else {1767pshufd(vtmp2, src2, 0xE);1768reduce_operation_128(T_INT, opcode, vtmp2, src2);1769reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1770}1771}17721773void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1774if (opcode == Op_AddReductionVI) {1775vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);1776vextracti128_high(vtmp2, vtmp1);1777vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);1778reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1779} else {1780vextracti128_high(vtmp1, src2);1781reduce_operation_128(T_INT, opcode, vtmp1, src2);1782reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1783}1784}17851786void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1787vextracti64x4_high(vtmp2, src2);1788reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);1789reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1790}17911792void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1793pshufd(vtmp2, src2, 0x1);1794reduce_operation_128(T_BYTE, opcode, vtmp2, src2);1795movdqu(vtmp1, vtmp2);1796psrldq(vtmp1, 2);1797reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);1798movdqu(vtmp2, vtmp1);1799psrldq(vtmp2, 1);1800reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);1801movdl(vtmp2, src1);1802pmovsxbd(vtmp1, vtmp1);1803reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);1804pextrb(dst, vtmp1, 0x0);1805movsbl(dst, dst);1806}18071808void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1809pshufd(vtmp1, src2, 0xE);1810reduce_operation_128(T_BYTE, opcode, vtmp1, src2);1811reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1812}18131814void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1815vextracti128_high(vtmp2, src2);1816reduce_operation_128(T_BYTE, opcode, vtmp2, src2);1817reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1818}18191820void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1821vextracti64x4_high(vtmp1, src2);1822reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);1823reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1824}18251826void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1827pmovsxbw(vtmp2, src2);1828reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1829}18301831void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1832if (UseAVX > 1) {1833int vector_len = Assembler::AVX_256bit;1834vpmovsxbw(vtmp1, src2, vector_len);1835reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1836} else {1837pmovsxbw(vtmp2, src2);1838reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1839pshufd(vtmp2, src2, 0x1);1840pmovsxbw(vtmp2, src2);1841reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);1842}1843}18441845void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1846if (UseAVX > 2 && VM_Version::supports_avx512bw()) {1847int vector_len = Assembler::AVX_512bit;1848vpmovsxbw(vtmp1, src2, vector_len);1849reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1850} else {1851assert(UseAVX >= 2,"Should not reach here.");1852mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);1853vextracti128_high(vtmp2, src2);1854mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);1855}1856}18571858void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1859mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);1860vextracti64x4_high(vtmp2, src2);1861mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);1862}18631864void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1865if (opcode == Op_AddReductionVI) {1866if (vtmp1 != src2) {1867movdqu(vtmp1, src2);1868}1869phaddw(vtmp1, vtmp1);1870phaddw(vtmp1, vtmp1);1871} else {1872pshufd(vtmp2, src2, 0x1);1873reduce_operation_128(T_SHORT, opcode, vtmp2, src2);1874movdqu(vtmp1, vtmp2);1875psrldq(vtmp1, 2);1876reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);1877}1878movdl(vtmp2, src1);1879pmovsxwd(vtmp1, vtmp1);1880reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);1881pextrw(dst, vtmp1, 0x0);1882movswl(dst, dst);1883}18841885void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1886if (opcode == Op_AddReductionVI) {1887if (vtmp1 != src2) {1888movdqu(vtmp1, src2);1889}1890phaddw(vtmp1, src2);1891} else {1892pshufd(vtmp1, src2, 0xE);1893reduce_operation_128(T_SHORT, opcode, vtmp1, src2);1894}1895reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1896}18971898void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1899if (opcode == Op_AddReductionVI) {1900int vector_len = Assembler::AVX_256bit;1901vphaddw(vtmp2, src2, src2, vector_len);1902vpermq(vtmp2, vtmp2, 0xD8, vector_len);1903} else {1904vextracti128_high(vtmp2, src2);1905reduce_operation_128(T_SHORT, opcode, vtmp2, src2);1906}1907reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1908}19091910void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1911int vector_len = Assembler::AVX_256bit;1912vextracti64x4_high(vtmp1, src2);1913reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);1914reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1915}19161917#ifdef _LP641918void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1919pshufd(vtmp2, src2, 0xE);1920reduce_operation_128(T_LONG, opcode, vtmp2, src2);1921movdq(vtmp1, src1);1922reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);1923movdq(dst, vtmp1);1924}19251926void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1927vextracti128_high(vtmp1, src2);1928reduce_operation_128(T_LONG, opcode, vtmp1, src2);1929reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1930}19311932void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1933vextracti64x4_high(vtmp2, src2);1934reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);1935reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1936}19371938void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {1939assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");1940mov64(temp, -1L);1941bzhiq(temp, temp, len);1942kmovql(dst, temp);1943}1944#endif // _LP6419451946void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {1947reduce_operation_128(T_FLOAT, opcode, dst, src);1948pshufd(vtmp, src, 0x1);1949reduce_operation_128(T_FLOAT, opcode, dst, vtmp);1950}19511952void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {1953reduce2F(opcode, dst, src, vtmp);1954pshufd(vtmp, src, 0x2);1955reduce_operation_128(T_FLOAT, opcode, dst, vtmp);1956pshufd(vtmp, src, 0x3);1957reduce_operation_128(T_FLOAT, opcode, dst, vtmp);1958}19591960void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1961reduce4F(opcode, dst, src, vtmp2);1962vextractf128_high(vtmp2, src);1963reduce4F(opcode, dst, vtmp2, vtmp1);1964}19651966void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1967reduce8F(opcode, dst, src, vtmp1, vtmp2);1968vextracti64x4_high(vtmp1, src);1969reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);1970}19711972void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {1973reduce_operation_128(T_DOUBLE, opcode, dst, src);1974pshufd(vtmp, src, 0xE);1975reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);1976}19771978void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1979reduce2D(opcode, dst, src, vtmp2);1980vextractf128_high(vtmp2, src);1981reduce2D(opcode, dst, vtmp2, vtmp1);1982}19831984void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1985reduce4D(opcode, dst, src, vtmp1, vtmp2);1986vextracti64x4_high(vtmp1, src);1987reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);1988}19891990void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {1991MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);1992}19931994void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {1995MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);1996}199719981999void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,2000XMMRegister dst, XMMRegister src,2001XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,2002XMMRegister xmm_0, XMMRegister xmm_1) {2003int permconst[] = {1, 14};2004XMMRegister wsrc = src;2005XMMRegister wdst = xmm_0;2006XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;20072008int vlen_enc = Assembler::AVX_128bit;2009if (vlen == 16) {2010vlen_enc = Assembler::AVX_256bit;2011}20122013for (int i = log2(vlen) - 1; i >=0; i--) {2014if (i == 0 && !is_dst_valid) {2015wdst = dst;2016}2017if (i == 3) {2018vextracti64x4_high(wtmp, wsrc);2019} else if (i == 2) {2020vextracti128_high(wtmp, wsrc);2021} else { // i = [0,1]2022vpermilps(wtmp, wsrc, permconst[i], vlen_enc);2023}2024vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);2025wsrc = wdst;2026vlen_enc = Assembler::AVX_128bit;2027}2028if (is_dst_valid) {2029vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);2030}2031}20322033void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,2034XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,2035XMMRegister xmm_0, XMMRegister xmm_1) {2036XMMRegister wsrc = src;2037XMMRegister wdst = xmm_0;2038XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;2039int vlen_enc = Assembler::AVX_128bit;2040if (vlen == 8) {2041vlen_enc = Assembler::AVX_256bit;2042}2043for (int i = log2(vlen) - 1; i >=0; i--) {2044if (i == 0 && !is_dst_valid) {2045wdst = dst;2046}2047if (i == 1) {2048vextracti128_high(wtmp, wsrc);2049} else if (i == 2) {2050vextracti64x4_high(wtmp, wsrc);2051} else {2052assert(i == 0, "%d", i);2053vpermilpd(wtmp, wsrc, 1, vlen_enc);2054}2055vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);2056wsrc = wdst;2057vlen_enc = Assembler::AVX_128bit;2058}2059if (is_dst_valid) {2060vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);2061}2062}20632064void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {2065switch (bt) {2066case T_BYTE: pextrb(dst, src, idx); break;2067case T_SHORT: pextrw(dst, src, idx); break;2068case T_INT: pextrd(dst, src, idx); break;2069case T_LONG: pextrq(dst, src, idx); break;20702071default:2072assert(false,"Should not reach here.");2073break;2074}2075}20762077XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {2078int esize = type2aelembytes(typ);2079int elem_per_lane = 16/esize;2080int lane = elemindex / elem_per_lane;2081int eindex = elemindex % elem_per_lane;20822083if (lane >= 2) {2084assert(UseAVX > 2, "required");2085vextractf32x4(dst, src, lane & 3);2086return dst;2087} else if (lane > 0) {2088assert(UseAVX > 0, "required");2089vextractf128(dst, src, lane);2090return dst;2091} else {2092return src;2093}2094}20952096void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {2097int esize = type2aelembytes(typ);2098int elem_per_lane = 16/esize;2099int eindex = elemindex % elem_per_lane;2100assert(is_integral_type(typ),"required");21012102if (eindex == 0) {2103if (typ == T_LONG) {2104movq(dst, src);2105} else {2106movdl(dst, src);2107if (typ == T_BYTE)2108movsbl(dst, dst);2109else if (typ == T_SHORT)2110movswl(dst, dst);2111}2112} else {2113extract(typ, dst, src, eindex);2114}2115}21162117void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {2118int esize = type2aelembytes(typ);2119int elem_per_lane = 16/esize;2120int eindex = elemindex % elem_per_lane;2121assert((typ == T_FLOAT || typ == T_DOUBLE),"required");21222123if (eindex == 0) {2124movq(dst, src);2125} else {2126if (typ == T_FLOAT) {2127if (UseAVX == 0) {2128movdqu(dst, src);2129pshufps(dst, dst, eindex);2130} else {2131vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);2132}2133} else {2134if (UseAVX == 0) {2135movdqu(dst, src);2136psrldq(dst, eindex*esize);2137} else {2138vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);2139}2140movq(dst, dst);2141}2142}2143// Zero upper bits2144if (typ == T_FLOAT) {2145if (UseAVX == 0) {2146assert((vtmp != xnoreg) && (tmp != noreg), "required.");2147movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);2148pand(dst, vtmp);2149} else {2150assert((tmp != noreg), "required.");2151vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);2152}2153}2154}21552156void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {2157switch(typ) {2158case T_BYTE:2159case T_BOOLEAN:2160evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);2161break;2162case T_SHORT:2163case T_CHAR:2164evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);2165break;2166case T_INT:2167case T_FLOAT:2168evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);2169break;2170case T_LONG:2171case T_DOUBLE:2172evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);2173break;2174default:2175assert(false,"Should not reach here.");2176break;2177}2178}21792180void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {2181switch(typ) {2182case T_BOOLEAN:2183case T_BYTE:2184evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);2185break;2186case T_CHAR:2187case T_SHORT:2188evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);2189break;2190case T_INT:2191case T_FLOAT:2192evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);2193break;2194case T_LONG:2195case T_DOUBLE:2196evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);2197break;2198default:2199assert(false,"Should not reach here.");2200break;2201}2202}22032204void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,2205int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {2206int vlen_enc = vector_length_encoding(vlen_in_bytes*2);2207switch (typ) {2208case T_BYTE:2209vpmovzxbw(vtmp1, src1, vlen_enc);2210vpmovzxbw(vtmp2, src2, vlen_enc);2211vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);2212vpacksswb(dst, dst, dst, vlen_enc);2213break;2214case T_SHORT:2215vpmovzxwd(vtmp1, src1, vlen_enc);2216vpmovzxwd(vtmp2, src2, vlen_enc);2217vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);2218vpackssdw(dst, dst, dst, vlen_enc);2219break;2220case T_INT:2221vpmovzxdq(vtmp1, src1, vlen_enc);2222vpmovzxdq(vtmp2, src2, vlen_enc);2223vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);2224vpermilps(dst, dst, 8, vlen_enc);2225break;2226default:2227assert(false, "Should not reach here");2228}2229if (vlen_in_bytes == 16) {2230vpermpd(dst, dst, 0x8, vlen_enc);2231}2232}22332234void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,2235XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {2236int vlen_enc = vector_length_encoding(vlen_in_bytes);2237switch (typ) {2238case T_BYTE:2239vpmovzxbw(vtmp1, src1, vlen_enc);2240vpmovzxbw(vtmp2, src2, vlen_enc);2241vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);2242vextracti128(vtmp1, src1, 1);2243vextracti128(vtmp2, src2, 1);2244vpmovzxbw(vtmp1, vtmp1, vlen_enc);2245vpmovzxbw(vtmp2, vtmp2, vlen_enc);2246vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);2247vpacksswb(dst, dst, vtmp3, vlen_enc);2248vpermpd(dst, dst, 0xd8, vlen_enc);2249break;2250case T_SHORT:2251vpmovzxwd(vtmp1, src1, vlen_enc);2252vpmovzxwd(vtmp2, src2, vlen_enc);2253vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);2254vextracti128(vtmp1, src1, 1);2255vextracti128(vtmp2, src2, 1);2256vpmovzxwd(vtmp1, vtmp1, vlen_enc);2257vpmovzxwd(vtmp2, vtmp2, vlen_enc);2258vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);2259vpackssdw(dst, dst, vtmp3, vlen_enc);2260vpermpd(dst, dst, 0xd8, vlen_enc);2261break;2262case T_INT:2263vpmovzxdq(vtmp1, src1, vlen_enc);2264vpmovzxdq(vtmp2, src2, vlen_enc);2265vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);2266vpshufd(dst, dst, 8, vlen_enc);2267vpermq(dst, dst, 8, vlen_enc);2268vextracti128(vtmp1, src1, 1);2269vextracti128(vtmp2, src2, 1);2270vpmovzxdq(vtmp1, vtmp1, vlen_enc);2271vpmovzxdq(vtmp2, vtmp2, vlen_enc);2272vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);2273vpshufd(vtmp3, vtmp3, 8, vlen_enc);2274vpermq(vtmp3, vtmp3, 0x80, vlen_enc);2275vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);2276break;2277default:2278assert(false, "Should not reach here");2279}2280}22812282void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {2283switch(typ) {2284case T_BYTE:2285evpblendmb(dst, kmask, src1, src2, merge, vector_len);2286break;2287case T_SHORT:2288evpblendmw(dst, kmask, src1, src2, merge, vector_len);2289break;2290case T_INT:2291case T_FLOAT:2292evpblendmd(dst, kmask, src1, src2, merge, vector_len);2293break;2294case T_LONG:2295case T_DOUBLE:2296evpblendmq(dst, kmask, src1, src2, merge, vector_len);2297break;2298default:2299assert(false,"Should not reach here.");2300break;2301}2302}23032304void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,2305XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {2306switch(vlen) {2307case 4:2308assert(vtmp1 != xnoreg, "required.");2309// Broadcast lower 32 bits to 128 bits before ptest2310pshufd(vtmp1, src1, 0x0);2311if (bt == BoolTest::overflow) {2312assert(vtmp2 != xnoreg, "required.");2313pshufd(vtmp2, src2, 0x0);2314} else {2315assert(vtmp2 == xnoreg, "required.");2316vtmp2 = src2;2317}2318ptest(vtmp1, vtmp2);2319break;2320case 8:2321assert(vtmp1 != xnoreg, "required.");2322// Broadcast lower 64 bits to 128 bits before ptest2323pshufd(vtmp1, src1, 0x4);2324if (bt == BoolTest::overflow) {2325assert(vtmp2 != xnoreg, "required.");2326pshufd(vtmp2, src2, 0x4);2327} else {2328assert(vtmp2 == xnoreg, "required.");2329vtmp2 = src2;2330}2331ptest(vtmp1, vtmp2);2332break;2333case 16:2334assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");2335ptest(src1, src2);2336break;2337case 32:2338assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");2339vptest(src1, src2, Assembler::AVX_256bit);2340break;2341case 64:2342{2343assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");2344evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);2345if (bt == BoolTest::ne) {2346ktestql(mask, mask);2347} else {2348assert(bt == BoolTest::overflow, "required");2349kortestql(mask, mask);2350}2351}2352break;2353default:2354assert(false,"Should not reach here.");2355break;2356}2357}23582359//-------------------------------------------------------------------------------------------23602361// IndexOf for constant substrings with size >= 8 chars2362// which don't need to be loaded through stack.2363void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,2364Register cnt1, Register cnt2,2365int int_cnt2, Register result,2366XMMRegister vec, Register tmp,2367int ae) {2368ShortBranchVerifier sbv(this);2369assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");2370assert(ae != StrIntrinsicNode::LU, "Invalid encoding");23712372// This method uses the pcmpestri instruction with bound registers2373// inputs:2374// xmm - substring2375// rax - substring length (elements count)2376// mem - scanned string2377// rdx - string length (elements count)2378// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)2379// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)2380// outputs:2381// rcx - matched index in string2382assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");2383int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts2384int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 82385Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;2386Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;23872388Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,2389RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,2390MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;23912392// Note, inline_string_indexOf() generates checks:2393// if (substr.count > string.count) return -1;2394// if (substr.count == 0) return 0;2395assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");23962397// Load substring.2398if (ae == StrIntrinsicNode::UL) {2399pmovzxbw(vec, Address(str2, 0));2400} else {2401movdqu(vec, Address(str2, 0));2402}2403movl(cnt2, int_cnt2);2404movptr(result, str1); // string addr24052406if (int_cnt2 > stride) {2407jmpb(SCAN_TO_SUBSTR);24082409// Reload substr for rescan, this code2410// is executed only for large substrings (> 8 chars)2411bind(RELOAD_SUBSTR);2412if (ae == StrIntrinsicNode::UL) {2413pmovzxbw(vec, Address(str2, 0));2414} else {2415movdqu(vec, Address(str2, 0));2416}2417negptr(cnt2); // Jumped here with negative cnt2, convert to positive24182419bind(RELOAD_STR);2420// We came here after the beginning of the substring was2421// matched but the rest of it was not so we need to search2422// again. Start from the next element after the previous match.24232424// cnt2 is number of substring reminding elements and2425// cnt1 is number of string reminding elements when cmp failed.2426// Restored cnt1 = cnt1 - cnt2 + int_cnt22427subl(cnt1, cnt2);2428addl(cnt1, int_cnt2);2429movl(cnt2, int_cnt2); // Now restore cnt224302431decrementl(cnt1); // Shift to next element2432cmpl(cnt1, cnt2);2433jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring24342435addptr(result, (1<<scale1));24362437} // (int_cnt2 > 8)24382439// Scan string for start of substr in 16-byte vectors2440bind(SCAN_TO_SUBSTR);2441pcmpestri(vec, Address(result, 0), mode);2442jccb(Assembler::below, FOUND_CANDIDATE); // CF == 12443subl(cnt1, stride);2444jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string2445cmpl(cnt1, cnt2);2446jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring2447addptr(result, 16);2448jmpb(SCAN_TO_SUBSTR);24492450// Found a potential substr2451bind(FOUND_CANDIDATE);2452// Matched whole vector if first element matched (tmp(rcx) == 0).2453if (int_cnt2 == stride) {2454jccb(Assembler::overflow, RET_FOUND); // OF == 12455} else { // int_cnt2 > 82456jccb(Assembler::overflow, FOUND_SUBSTR);2457}2458// After pcmpestri tmp(rcx) contains matched element index2459// Compute start addr of substr2460lea(result, Address(result, tmp, scale1));24612462// Make sure string is still long enough2463subl(cnt1, tmp);2464cmpl(cnt1, cnt2);2465if (int_cnt2 == stride) {2466jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);2467} else { // int_cnt2 > 82468jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);2469}2470// Left less then substring.24712472bind(RET_NOT_FOUND);2473movl(result, -1);2474jmp(EXIT);24752476if (int_cnt2 > stride) {2477// This code is optimized for the case when whole substring2478// is matched if its head is matched.2479bind(MATCH_SUBSTR_HEAD);2480pcmpestri(vec, Address(result, 0), mode);2481// Reload only string if does not match2482jcc(Assembler::noOverflow, RELOAD_STR); // OF == 024832484Label CONT_SCAN_SUBSTR;2485// Compare the rest of substring (> 8 chars).2486bind(FOUND_SUBSTR);2487// First 8 chars are already matched.2488negptr(cnt2);2489addptr(cnt2, stride);24902491bind(SCAN_SUBSTR);2492subl(cnt1, stride);2493cmpl(cnt2, -stride); // Do not read beyond substring2494jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);2495// Back-up strings to avoid reading beyond substring:2496// cnt1 = cnt1 - cnt2 + 82497addl(cnt1, cnt2); // cnt2 is negative2498addl(cnt1, stride);2499movl(cnt2, stride); negptr(cnt2);2500bind(CONT_SCAN_SUBSTR);2501if (int_cnt2 < (int)G) {2502int tail_off1 = int_cnt2<<scale1;2503int tail_off2 = int_cnt2<<scale2;2504if (ae == StrIntrinsicNode::UL) {2505pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));2506} else {2507movdqu(vec, Address(str2, cnt2, scale2, tail_off2));2508}2509pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);2510} else {2511// calculate index in register to avoid integer overflow (int_cnt2*2)2512movl(tmp, int_cnt2);2513addptr(tmp, cnt2);2514if (ae == StrIntrinsicNode::UL) {2515pmovzxbw(vec, Address(str2, tmp, scale2, 0));2516} else {2517movdqu(vec, Address(str2, tmp, scale2, 0));2518}2519pcmpestri(vec, Address(result, tmp, scale1, 0), mode);2520}2521// Need to reload strings pointers if not matched whole vector2522jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 02523addptr(cnt2, stride);2524jcc(Assembler::negative, SCAN_SUBSTR);2525// Fall through if found full substring25262527} // (int_cnt2 > 8)25282529bind(RET_FOUND);2530// Found result if we matched full small substring.2531// Compute substr offset2532subptr(result, str1);2533if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {2534shrl(result, 1); // index2535}2536bind(EXIT);25372538} // string_indexofC825392540// Small strings are loaded through stack if they cross page boundary.2541void C2_MacroAssembler::string_indexof(Register str1, Register str2,2542Register cnt1, Register cnt2,2543int int_cnt2, Register result,2544XMMRegister vec, Register tmp,2545int ae) {2546ShortBranchVerifier sbv(this);2547assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");2548assert(ae != StrIntrinsicNode::LU, "Invalid encoding");25492550//2551// int_cnt2 is length of small (< 8 chars) constant substring2552// or (-1) for non constant substring in which case its length2553// is in cnt2 register.2554//2555// Note, inline_string_indexOf() generates checks:2556// if (substr.count > string.count) return -1;2557// if (substr.count == 0) return 0;2558//2559int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 82560assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");2561// This method uses the pcmpestri instruction with bound registers2562// inputs:2563// xmm - substring2564// rax - substring length (elements count)2565// mem - scanned string2566// rdx - string length (elements count)2567// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)2568// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)2569// outputs:2570// rcx - matched index in string2571assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");2572int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts2573Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;2574Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;25752576Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,2577RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,2578FOUND_CANDIDATE;25792580{ //========================================================2581// We don't know where these strings are located2582// and we can't read beyond them. Load them through stack.2583Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;25842585movptr(tmp, rsp); // save old SP25862587if (int_cnt2 > 0) { // small (< 8 chars) constant substring2588if (int_cnt2 == (1>>scale2)) { // One byte2589assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");2590load_unsigned_byte(result, Address(str2, 0));2591movdl(vec, result); // move 32 bits2592} else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes2593// Not enough header space in 32-bit VM: 12+3 = 15.2594movl(result, Address(str2, -1));2595shrl(result, 8);2596movdl(vec, result); // move 32 bits2597} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char2598load_unsigned_short(result, Address(str2, 0));2599movdl(vec, result); // move 32 bits2600} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars2601movdl(vec, Address(str2, 0)); // move 32 bits2602} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars2603movq(vec, Address(str2, 0)); // move 64 bits2604} else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})2605// Array header size is 12 bytes in 32-bit VM2606// + 6 bytes for 3 chars == 18 bytes,2607// enough space to load vec and shift.2608assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");2609if (ae == StrIntrinsicNode::UL) {2610int tail_off = int_cnt2-8;2611pmovzxbw(vec, Address(str2, tail_off));2612psrldq(vec, -2*tail_off);2613}2614else {2615int tail_off = int_cnt2*(1<<scale2);2616movdqu(vec, Address(str2, tail_off-16));2617psrldq(vec, 16-tail_off);2618}2619}2620} else { // not constant substring2621cmpl(cnt2, stride);2622jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough26232624// We can read beyond string if srt+16 does not cross page boundary2625// since heaps are aligned and mapped by pages.2626assert(os::vm_page_size() < (int)G, "default page should be small");2627movl(result, str2); // We need only low 32 bits2628andl(result, (os::vm_page_size()-1));2629cmpl(result, (os::vm_page_size()-16));2630jccb(Assembler::belowEqual, CHECK_STR);26312632// Move small strings to stack to allow load 16 bytes into vec.2633subptr(rsp, 16);2634int stk_offset = wordSize-(1<<scale2);2635push(cnt2);26362637bind(COPY_SUBSTR);2638if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {2639load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));2640movb(Address(rsp, cnt2, scale2, stk_offset), result);2641} else if (ae == StrIntrinsicNode::UU) {2642load_unsigned_short(result, Address(str2, cnt2, scale2, -2));2643movw(Address(rsp, cnt2, scale2, stk_offset), result);2644}2645decrement(cnt2);2646jccb(Assembler::notZero, COPY_SUBSTR);26472648pop(cnt2);2649movptr(str2, rsp); // New substring address2650} // non constant26512652bind(CHECK_STR);2653cmpl(cnt1, stride);2654jccb(Assembler::aboveEqual, BIG_STRINGS);26552656// Check cross page boundary.2657movl(result, str1); // We need only low 32 bits2658andl(result, (os::vm_page_size()-1));2659cmpl(result, (os::vm_page_size()-16));2660jccb(Assembler::belowEqual, BIG_STRINGS);26612662subptr(rsp, 16);2663int stk_offset = -(1<<scale1);2664if (int_cnt2 < 0) { // not constant2665push(cnt2);2666stk_offset += wordSize;2667}2668movl(cnt2, cnt1);26692670bind(COPY_STR);2671if (ae == StrIntrinsicNode::LL) {2672load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));2673movb(Address(rsp, cnt2, scale1, stk_offset), result);2674} else {2675load_unsigned_short(result, Address(str1, cnt2, scale1, -2));2676movw(Address(rsp, cnt2, scale1, stk_offset), result);2677}2678decrement(cnt2);2679jccb(Assembler::notZero, COPY_STR);26802681if (int_cnt2 < 0) { // not constant2682pop(cnt2);2683}2684movptr(str1, rsp); // New string address26852686bind(BIG_STRINGS);2687// Load substring.2688if (int_cnt2 < 0) { // -12689if (ae == StrIntrinsicNode::UL) {2690pmovzxbw(vec, Address(str2, 0));2691} else {2692movdqu(vec, Address(str2, 0));2693}2694push(cnt2); // substr count2695push(str2); // substr addr2696push(str1); // string addr2697} else {2698// Small (< 8 chars) constant substrings are loaded already.2699movl(cnt2, int_cnt2);2700}2701push(tmp); // original SP27022703} // Finished loading27042705//========================================================2706// Start search2707//27082709movptr(result, str1); // string addr27102711if (int_cnt2 < 0) { // Only for non constant substring2712jmpb(SCAN_TO_SUBSTR);27132714// SP saved at sp+02715// String saved at sp+1*wordSize2716// Substr saved at sp+2*wordSize2717// Substr count saved at sp+3*wordSize27182719// Reload substr for rescan, this code2720// is executed only for large substrings (> 8 chars)2721bind(RELOAD_SUBSTR);2722movptr(str2, Address(rsp, 2*wordSize));2723movl(cnt2, Address(rsp, 3*wordSize));2724if (ae == StrIntrinsicNode::UL) {2725pmovzxbw(vec, Address(str2, 0));2726} else {2727movdqu(vec, Address(str2, 0));2728}2729// We came here after the beginning of the substring was2730// matched but the rest of it was not so we need to search2731// again. Start from the next element after the previous match.2732subptr(str1, result); // Restore counter2733if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {2734shrl(str1, 1);2735}2736addl(cnt1, str1);2737decrementl(cnt1); // Shift to next element2738cmpl(cnt1, cnt2);2739jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring27402741addptr(result, (1<<scale1));2742} // non constant27432744// Scan string for start of substr in 16-byte vectors2745bind(SCAN_TO_SUBSTR);2746assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");2747pcmpestri(vec, Address(result, 0), mode);2748jccb(Assembler::below, FOUND_CANDIDATE); // CF == 12749subl(cnt1, stride);2750jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string2751cmpl(cnt1, cnt2);2752jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring2753addptr(result, 16);27542755bind(ADJUST_STR);2756cmpl(cnt1, stride); // Do not read beyond string2757jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);2758// Back-up string to avoid reading beyond string.2759lea(result, Address(result, cnt1, scale1, -16));2760movl(cnt1, stride);2761jmpb(SCAN_TO_SUBSTR);27622763// Found a potential substr2764bind(FOUND_CANDIDATE);2765// After pcmpestri tmp(rcx) contains matched element index27662767// Make sure string is still long enough2768subl(cnt1, tmp);2769cmpl(cnt1, cnt2);2770jccb(Assembler::greaterEqual, FOUND_SUBSTR);2771// Left less then substring.27722773bind(RET_NOT_FOUND);2774movl(result, -1);2775jmp(CLEANUP);27762777bind(FOUND_SUBSTR);2778// Compute start addr of substr2779lea(result, Address(result, tmp, scale1));2780if (int_cnt2 > 0) { // Constant substring2781// Repeat search for small substring (< 8 chars)2782// from new point without reloading substring.2783// Have to check that we don't read beyond string.2784cmpl(tmp, stride-int_cnt2);2785jccb(Assembler::greater, ADJUST_STR);2786// Fall through if matched whole substring.2787} else { // non constant2788assert(int_cnt2 == -1, "should be != 0");27892790addl(tmp, cnt2);2791// Found result if we matched whole substring.2792cmpl(tmp, stride);2793jcc(Assembler::lessEqual, RET_FOUND);27942795// Repeat search for small substring (<= 8 chars)2796// from new point 'str1' without reloading substring.2797cmpl(cnt2, stride);2798// Have to check that we don't read beyond string.2799jccb(Assembler::lessEqual, ADJUST_STR);28002801Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;2802// Compare the rest of substring (> 8 chars).2803movptr(str1, result);28042805cmpl(tmp, cnt2);2806// First 8 chars are already matched.2807jccb(Assembler::equal, CHECK_NEXT);28082809bind(SCAN_SUBSTR);2810pcmpestri(vec, Address(str1, 0), mode);2811// Need to reload strings pointers if not matched whole vector2812jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 028132814bind(CHECK_NEXT);2815subl(cnt2, stride);2816jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring2817addptr(str1, 16);2818if (ae == StrIntrinsicNode::UL) {2819addptr(str2, 8);2820} else {2821addptr(str2, 16);2822}2823subl(cnt1, stride);2824cmpl(cnt2, stride); // Do not read beyond substring2825jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);2826// Back-up strings to avoid reading beyond substring.28272828if (ae == StrIntrinsicNode::UL) {2829lea(str2, Address(str2, cnt2, scale2, -8));2830lea(str1, Address(str1, cnt2, scale1, -16));2831} else {2832lea(str2, Address(str2, cnt2, scale2, -16));2833lea(str1, Address(str1, cnt2, scale1, -16));2834}2835subl(cnt1, cnt2);2836movl(cnt2, stride);2837addl(cnt1, stride);2838bind(CONT_SCAN_SUBSTR);2839if (ae == StrIntrinsicNode::UL) {2840pmovzxbw(vec, Address(str2, 0));2841} else {2842movdqu(vec, Address(str2, 0));2843}2844jmp(SCAN_SUBSTR);28452846bind(RET_FOUND_LONG);2847movptr(str1, Address(rsp, wordSize));2848} // non constant28492850bind(RET_FOUND);2851// Compute substr offset2852subptr(result, str1);2853if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {2854shrl(result, 1); // index2855}2856bind(CLEANUP);2857pop(rsp); // restore SP28582859} // string_indexof28602861void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,2862XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {2863ShortBranchVerifier sbv(this);2864assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");28652866int stride = 8;28672868Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,2869SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,2870RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,2871FOUND_SEQ_CHAR, DONE_LABEL;28722873movptr(result, str1);2874if (UseAVX >= 2) {2875cmpl(cnt1, stride);2876jcc(Assembler::less, SCAN_TO_CHAR);2877cmpl(cnt1, 2*stride);2878jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);2879movdl(vec1, ch);2880vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);2881vpxor(vec2, vec2);2882movl(tmp, cnt1);2883andl(tmp, 0xFFFFFFF0); //vector count (in chars)2884andl(cnt1,0x0000000F); //tail count (in chars)28852886bind(SCAN_TO_16_CHAR_LOOP);2887vmovdqu(vec3, Address(result, 0));2888vpcmpeqw(vec3, vec3, vec1, 1);2889vptest(vec2, vec3);2890jcc(Assembler::carryClear, FOUND_CHAR);2891addptr(result, 32);2892subl(tmp, 2*stride);2893jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);2894jmp(SCAN_TO_8_CHAR);2895bind(SCAN_TO_8_CHAR_INIT);2896movdl(vec1, ch);2897pshuflw(vec1, vec1, 0x00);2898pshufd(vec1, vec1, 0);2899pxor(vec2, vec2);2900}2901bind(SCAN_TO_8_CHAR);2902cmpl(cnt1, stride);2903jcc(Assembler::less, SCAN_TO_CHAR);2904if (UseAVX < 2) {2905movdl(vec1, ch);2906pshuflw(vec1, vec1, 0x00);2907pshufd(vec1, vec1, 0);2908pxor(vec2, vec2);2909}2910movl(tmp, cnt1);2911andl(tmp, 0xFFFFFFF8); //vector count (in chars)2912andl(cnt1,0x00000007); //tail count (in chars)29132914bind(SCAN_TO_8_CHAR_LOOP);2915movdqu(vec3, Address(result, 0));2916pcmpeqw(vec3, vec1);2917ptest(vec2, vec3);2918jcc(Assembler::carryClear, FOUND_CHAR);2919addptr(result, 16);2920subl(tmp, stride);2921jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);2922bind(SCAN_TO_CHAR);2923testl(cnt1, cnt1);2924jcc(Assembler::zero, RET_NOT_FOUND);2925bind(SCAN_TO_CHAR_LOOP);2926load_unsigned_short(tmp, Address(result, 0));2927cmpl(ch, tmp);2928jccb(Assembler::equal, FOUND_SEQ_CHAR);2929addptr(result, 2);2930subl(cnt1, 1);2931jccb(Assembler::zero, RET_NOT_FOUND);2932jmp(SCAN_TO_CHAR_LOOP);29332934bind(RET_NOT_FOUND);2935movl(result, -1);2936jmpb(DONE_LABEL);29372938bind(FOUND_CHAR);2939if (UseAVX >= 2) {2940vpmovmskb(tmp, vec3);2941} else {2942pmovmskb(tmp, vec3);2943}2944bsfl(ch, tmp);2945addptr(result, ch);29462947bind(FOUND_SEQ_CHAR);2948subptr(result, str1);2949shrl(result, 1);29502951bind(DONE_LABEL);2952} // string_indexof_char29532954void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,2955XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {2956ShortBranchVerifier sbv(this);2957assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");29582959int stride = 16;29602961Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,2962SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,2963RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,2964FOUND_SEQ_CHAR, DONE_LABEL;29652966movptr(result, str1);2967if (UseAVX >= 2) {2968cmpl(cnt1, stride);2969jcc(Assembler::less, SCAN_TO_CHAR_INIT);2970cmpl(cnt1, stride*2);2971jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);2972movdl(vec1, ch);2973vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);2974vpxor(vec2, vec2);2975movl(tmp, cnt1);2976andl(tmp, 0xFFFFFFE0); //vector count (in chars)2977andl(cnt1,0x0000001F); //tail count (in chars)29782979bind(SCAN_TO_32_CHAR_LOOP);2980vmovdqu(vec3, Address(result, 0));2981vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);2982vptest(vec2, vec3);2983jcc(Assembler::carryClear, FOUND_CHAR);2984addptr(result, 32);2985subl(tmp, stride*2);2986jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);2987jmp(SCAN_TO_16_CHAR);29882989bind(SCAN_TO_16_CHAR_INIT);2990movdl(vec1, ch);2991pxor(vec2, vec2);2992pshufb(vec1, vec2);2993}29942995bind(SCAN_TO_16_CHAR);2996cmpl(cnt1, stride);2997jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entires left2998if (UseAVX < 2) {2999movdl(vec1, ch);3000pxor(vec2, vec2);3001pshufb(vec1, vec2);3002}3003movl(tmp, cnt1);3004andl(tmp, 0xFFFFFFF0); //vector count (in bytes)3005andl(cnt1,0x0000000F); //tail count (in bytes)30063007bind(SCAN_TO_16_CHAR_LOOP);3008movdqu(vec3, Address(result, 0));3009pcmpeqb(vec3, vec1);3010ptest(vec2, vec3);3011jcc(Assembler::carryClear, FOUND_CHAR);3012addptr(result, 16);3013subl(tmp, stride);3014jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...30153016bind(SCAN_TO_CHAR_INIT);3017testl(cnt1, cnt1);3018jcc(Assembler::zero, RET_NOT_FOUND);3019bind(SCAN_TO_CHAR_LOOP);3020load_unsigned_byte(tmp, Address(result, 0));3021cmpl(ch, tmp);3022jccb(Assembler::equal, FOUND_SEQ_CHAR);3023addptr(result, 1);3024subl(cnt1, 1);3025jccb(Assembler::zero, RET_NOT_FOUND);3026jmp(SCAN_TO_CHAR_LOOP);30273028bind(RET_NOT_FOUND);3029movl(result, -1);3030jmpb(DONE_LABEL);30313032bind(FOUND_CHAR);3033if (UseAVX >= 2) {3034vpmovmskb(tmp, vec3);3035} else {3036pmovmskb(tmp, vec3);3037}3038bsfl(ch, tmp);3039addptr(result, ch);30403041bind(FOUND_SEQ_CHAR);3042subptr(result, str1);30433044bind(DONE_LABEL);3045} // stringL_indexof_char30463047// helper function for string_compare3048void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,3049Address::ScaleFactor scale, Address::ScaleFactor scale1,3050Address::ScaleFactor scale2, Register index, int ae) {3051if (ae == StrIntrinsicNode::LL) {3052load_unsigned_byte(elem1, Address(str1, index, scale, 0));3053load_unsigned_byte(elem2, Address(str2, index, scale, 0));3054} else if (ae == StrIntrinsicNode::UU) {3055load_unsigned_short(elem1, Address(str1, index, scale, 0));3056load_unsigned_short(elem2, Address(str2, index, scale, 0));3057} else {3058load_unsigned_byte(elem1, Address(str1, index, scale1, 0));3059load_unsigned_short(elem2, Address(str2, index, scale2, 0));3060}3061}30623063// Compare strings, used for char[] and byte[].3064void C2_MacroAssembler::string_compare(Register str1, Register str2,3065Register cnt1, Register cnt2, Register result,3066XMMRegister vec1, int ae, KRegister mask) {3067ShortBranchVerifier sbv(this);3068Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;3069Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX33070int stride, stride2, adr_stride, adr_stride1, adr_stride2;3071int stride2x2 = 0x40;3072Address::ScaleFactor scale = Address::no_scale;3073Address::ScaleFactor scale1 = Address::no_scale;3074Address::ScaleFactor scale2 = Address::no_scale;30753076if (ae != StrIntrinsicNode::LL) {3077stride2x2 = 0x20;3078}30793080if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {3081shrl(cnt2, 1);3082}3083// Compute the minimum of the string lengths and the3084// difference of the string lengths (stack).3085// Do the conditional move stuff3086movl(result, cnt1);3087subl(cnt1, cnt2);3088push(cnt1);3089cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)30903091// Is the minimum length zero?3092testl(cnt2, cnt2);3093jcc(Assembler::zero, LENGTH_DIFF_LABEL);3094if (ae == StrIntrinsicNode::LL) {3095// Load first bytes3096load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]3097load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]3098} else if (ae == StrIntrinsicNode::UU) {3099// Load first characters3100load_unsigned_short(result, Address(str1, 0));3101load_unsigned_short(cnt1, Address(str2, 0));3102} else {3103load_unsigned_byte(result, Address(str1, 0));3104load_unsigned_short(cnt1, Address(str2, 0));3105}3106subl(result, cnt1);3107jcc(Assembler::notZero, POP_LABEL);31083109if (ae == StrIntrinsicNode::UU) {3110// Divide length by 2 to get number of chars3111shrl(cnt2, 1);3112}3113cmpl(cnt2, 1);3114jcc(Assembler::equal, LENGTH_DIFF_LABEL);31153116// Check if the strings start at the same location and setup scale and stride3117if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3118cmpptr(str1, str2);3119jcc(Assembler::equal, LENGTH_DIFF_LABEL);3120if (ae == StrIntrinsicNode::LL) {3121scale = Address::times_1;3122stride = 16;3123} else {3124scale = Address::times_2;3125stride = 8;3126}3127} else {3128scale1 = Address::times_1;3129scale2 = Address::times_2;3130// scale not used3131stride = 8;3132}31333134if (UseAVX >= 2 && UseSSE42Intrinsics) {3135Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;3136Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;3137Label COMPARE_WIDE_VECTORS_LOOP_AVX2;3138Label COMPARE_TAIL_LONG;3139Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX331403141int pcmpmask = 0x19;3142if (ae == StrIntrinsicNode::LL) {3143pcmpmask &= ~0x01;3144}31453146// Setup to compare 16-chars (32-bytes) vectors,3147// start from first character again because it has aligned address.3148if (ae == StrIntrinsicNode::LL) {3149stride2 = 32;3150} else {3151stride2 = 16;3152}3153if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3154adr_stride = stride << scale;3155} else {3156adr_stride1 = 8; //stride << scale1;3157adr_stride2 = 16; //stride << scale2;3158}31593160assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");3161// rax and rdx are used by pcmpestri as elements counters3162movl(result, cnt2);3163andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count3164jcc(Assembler::zero, COMPARE_TAIL_LONG);31653166// fast path : compare first 2 8-char vectors.3167bind(COMPARE_16_CHARS);3168if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3169movdqu(vec1, Address(str1, 0));3170} else {3171pmovzxbw(vec1, Address(str1, 0));3172}3173pcmpestri(vec1, Address(str2, 0), pcmpmask);3174jccb(Assembler::below, COMPARE_INDEX_CHAR);31753176if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3177movdqu(vec1, Address(str1, adr_stride));3178pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);3179} else {3180pmovzxbw(vec1, Address(str1, adr_stride1));3181pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);3182}3183jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);3184addl(cnt1, stride);31853186// Compare the characters at index in cnt13187bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character3188load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);3189subl(result, cnt2);3190jmp(POP_LABEL);31913192// Setup the registers to start vector comparison loop3193bind(COMPARE_WIDE_VECTORS);3194if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3195lea(str1, Address(str1, result, scale));3196lea(str2, Address(str2, result, scale));3197} else {3198lea(str1, Address(str1, result, scale1));3199lea(str2, Address(str2, result, scale2));3200}3201subl(result, stride2);3202subl(cnt2, stride2);3203jcc(Assembler::zero, COMPARE_WIDE_TAIL);3204negptr(result);32053206// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)3207bind(COMPARE_WIDE_VECTORS_LOOP);32083209#ifdef _LP643210if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop3211cmpl(cnt2, stride2x2);3212jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);3213testl(cnt2, stride2x2-1); // cnt2 holds the vector count3214jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x4032153216bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop3217if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3218evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);3219evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 03220} else {3221vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);3222evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 03223}3224kortestql(mask, mask);3225jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare3226addptr(result, stride2x2); // update since we already compared at this addr3227subl(cnt2, stride2x2); // and sub the size too3228jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);32293230vpxor(vec1, vec1);3231jmpb(COMPARE_WIDE_TAIL);3232}//if (VM_Version::supports_avx512vlbw())3233#endif // _LP64323432353236bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);3237if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3238vmovdqu(vec1, Address(str1, result, scale));3239vpxor(vec1, Address(str2, result, scale));3240} else {3241vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);3242vpxor(vec1, Address(str2, result, scale2));3243}3244vptest(vec1, vec1);3245jcc(Assembler::notZero, VECTOR_NOT_EQUAL);3246addptr(result, stride2);3247subl(cnt2, stride2);3248jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);3249// clean upper bits of YMM registers3250vpxor(vec1, vec1);32513252// compare wide vectors tail3253bind(COMPARE_WIDE_TAIL);3254testptr(result, result);3255jcc(Assembler::zero, LENGTH_DIFF_LABEL);32563257movl(result, stride2);3258movl(cnt2, result);3259negptr(result);3260jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);32613262// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.3263bind(VECTOR_NOT_EQUAL);3264// clean upper bits of YMM registers3265vpxor(vec1, vec1);3266if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3267lea(str1, Address(str1, result, scale));3268lea(str2, Address(str2, result, scale));3269} else {3270lea(str1, Address(str1, result, scale1));3271lea(str2, Address(str2, result, scale2));3272}3273jmp(COMPARE_16_CHARS);32743275// Compare tail chars, length between 1 to 15 chars3276bind(COMPARE_TAIL_LONG);3277movl(cnt2, result);3278cmpl(cnt2, stride);3279jcc(Assembler::less, COMPARE_SMALL_STR);32803281if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3282movdqu(vec1, Address(str1, 0));3283} else {3284pmovzxbw(vec1, Address(str1, 0));3285}3286pcmpestri(vec1, Address(str2, 0), pcmpmask);3287jcc(Assembler::below, COMPARE_INDEX_CHAR);3288subptr(cnt2, stride);3289jcc(Assembler::zero, LENGTH_DIFF_LABEL);3290if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3291lea(str1, Address(str1, result, scale));3292lea(str2, Address(str2, result, scale));3293} else {3294lea(str1, Address(str1, result, scale1));3295lea(str2, Address(str2, result, scale2));3296}3297negptr(cnt2);3298jmpb(WHILE_HEAD_LABEL);32993300bind(COMPARE_SMALL_STR);3301} else if (UseSSE42Intrinsics) {3302Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;3303int pcmpmask = 0x19;3304// Setup to compare 8-char (16-byte) vectors,3305// start from first character again because it has aligned address.3306movl(result, cnt2);3307andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count3308if (ae == StrIntrinsicNode::LL) {3309pcmpmask &= ~0x01;3310}3311jcc(Assembler::zero, COMPARE_TAIL);3312if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3313lea(str1, Address(str1, result, scale));3314lea(str2, Address(str2, result, scale));3315} else {3316lea(str1, Address(str1, result, scale1));3317lea(str2, Address(str2, result, scale2));3318}3319negptr(result);33203321// pcmpestri3322// inputs:3323// vec1- substring3324// rax - negative string length (elements count)3325// mem - scanned string3326// rdx - string length (elements count)3327// pcmpmask - cmp mode: 11000 (string compare with negated result)3328// + 00 (unsigned bytes) or + 01 (unsigned shorts)3329// outputs:3330// rcx - first mismatched element index3331assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");33323333bind(COMPARE_WIDE_VECTORS);3334if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3335movdqu(vec1, Address(str1, result, scale));3336pcmpestri(vec1, Address(str2, result, scale), pcmpmask);3337} else {3338pmovzxbw(vec1, Address(str1, result, scale1));3339pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);3340}3341// After pcmpestri cnt1(rcx) contains mismatched element index33423343jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==13344addptr(result, stride);3345subptr(cnt2, stride);3346jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);33473348// compare wide vectors tail3349testptr(result, result);3350jcc(Assembler::zero, LENGTH_DIFF_LABEL);33513352movl(cnt2, stride);3353movl(result, stride);3354negptr(result);3355if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3356movdqu(vec1, Address(str1, result, scale));3357pcmpestri(vec1, Address(str2, result, scale), pcmpmask);3358} else {3359pmovzxbw(vec1, Address(str1, result, scale1));3360pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);3361}3362jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);33633364// Mismatched characters in the vectors3365bind(VECTOR_NOT_EQUAL);3366addptr(cnt1, result);3367load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);3368subl(result, cnt2);3369jmpb(POP_LABEL);33703371bind(COMPARE_TAIL); // limit is zero3372movl(cnt2, result);3373// Fallthru to tail compare3374}3375// Shift str2 and str1 to the end of the arrays, negate min3376if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {3377lea(str1, Address(str1, cnt2, scale));3378lea(str2, Address(str2, cnt2, scale));3379} else {3380lea(str1, Address(str1, cnt2, scale1));3381lea(str2, Address(str2, cnt2, scale2));3382}3383decrementl(cnt2); // first character was compared already3384negptr(cnt2);33853386// Compare the rest of the elements3387bind(WHILE_HEAD_LABEL);3388load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);3389subl(result, cnt1);3390jccb(Assembler::notZero, POP_LABEL);3391increment(cnt2);3392jccb(Assembler::notZero, WHILE_HEAD_LABEL);33933394// Strings are equal up to min length. Return the length difference.3395bind(LENGTH_DIFF_LABEL);3396pop(result);3397if (ae == StrIntrinsicNode::UU) {3398// Divide diff by 2 to get number of chars3399sarl(result, 1);3400}3401jmpb(DONE_LABEL);34023403#ifdef _LP643404if (VM_Version::supports_avx512vlbw()) {34053406bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);34073408kmovql(cnt1, mask);3409notq(cnt1);3410bsfq(cnt2, cnt1);3411if (ae != StrIntrinsicNode::LL) {3412// Divide diff by 2 to get number of chars3413sarl(cnt2, 1);3414}3415addq(result, cnt2);3416if (ae == StrIntrinsicNode::LL) {3417load_unsigned_byte(cnt1, Address(str2, result));3418load_unsigned_byte(result, Address(str1, result));3419} else if (ae == StrIntrinsicNode::UU) {3420load_unsigned_short(cnt1, Address(str2, result, scale));3421load_unsigned_short(result, Address(str1, result, scale));3422} else {3423load_unsigned_short(cnt1, Address(str2, result, scale2));3424load_unsigned_byte(result, Address(str1, result, scale1));3425}3426subl(result, cnt1);3427jmpb(POP_LABEL);3428}//if (VM_Version::supports_avx512vlbw())3429#endif // _LP6434303431// Discard the stored length difference3432bind(POP_LABEL);3433pop(cnt1);34343435// That's it3436bind(DONE_LABEL);3437if(ae == StrIntrinsicNode::UL) {3438negl(result);3439}34403441}34423443// Search for Non-ASCII character (Negative byte value) in a byte array,3444// return true if it has any and false otherwise.3445// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java3446// @IntrinsicCandidate3447// private static boolean hasNegatives(byte[] ba, int off, int len) {3448// for (int i = off; i < off + len; i++) {3449// if (ba[i] < 0) {3450// return true;3451// }3452// }3453// return false;3454// }3455void C2_MacroAssembler::has_negatives(Register ary1, Register len,3456Register result, Register tmp1,3457XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {3458// rsi: byte array3459// rcx: len3460// rax: result3461ShortBranchVerifier sbv(this);3462assert_different_registers(ary1, len, result, tmp1);3463assert_different_registers(vec1, vec2);3464Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;34653466// len == 03467testl(len, len);3468jcc(Assembler::zero, FALSE_LABEL);34693470if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX5123471VM_Version::supports_avx512vlbw() &&3472VM_Version::supports_bmi2()) {34733474Label test_64_loop, test_tail;3475Register tmp3_aliased = len;34763477movl(tmp1, len);3478vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);34793480andl(tmp1, 64 - 1); // tail count (in chars) 0x3F3481andl(len, ~(64 - 1)); // vector count (in chars)3482jccb(Assembler::zero, test_tail);34833484lea(ary1, Address(ary1, len, Address::times_1));3485negptr(len);34863487bind(test_64_loop);3488// Check whether our 64 elements of size byte contain negatives3489evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);3490kortestql(mask1, mask1);3491jcc(Assembler::notZero, TRUE_LABEL);34923493addptr(len, 64);3494jccb(Assembler::notZero, test_64_loop);349534963497bind(test_tail);3498// bail out when there is nothing to be done3499testl(tmp1, -1);3500jcc(Assembler::zero, FALSE_LABEL);35013502// ~(~0 << len) applied up to two times (for 32-bit scenario)3503#ifdef _LP643504mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);3505shlxq(tmp3_aliased, tmp3_aliased, tmp1);3506notq(tmp3_aliased);3507kmovql(mask2, tmp3_aliased);3508#else3509Label k_init;3510jmp(k_init);35113512// We could not read 64-bits from a general purpose register thus we move3513// data required to compose 64 1's to the instruction stream3514// We emit 64 byte wide series of elements from 0..63 which later on would3515// be used as a compare targets with tail count contained in tmp1 register.3516// Result would be a k register having tmp1 consecutive number or 13517// counting from least significant bit.3518address tmp = pc();3519emit_int64(0x0706050403020100);3520emit_int64(0x0F0E0D0C0B0A0908);3521emit_int64(0x1716151413121110);3522emit_int64(0x1F1E1D1C1B1A1918);3523emit_int64(0x2726252423222120);3524emit_int64(0x2F2E2D2C2B2A2928);3525emit_int64(0x3736353433323130);3526emit_int64(0x3F3E3D3C3B3A3938);35273528bind(k_init);3529lea(len, InternalAddress(tmp));3530// create mask to test for negative byte inside a vector3531evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);3532evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);35333534#endif3535evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);3536ktestq(mask1, mask2);3537jcc(Assembler::notZero, TRUE_LABEL);35383539jmp(FALSE_LABEL);3540} else {3541movl(result, len); // copy35423543if (UseAVX >= 2 && UseSSE >= 2) {3544// With AVX2, use 32-byte vector compare3545Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;35463547// Compare 32-byte vectors3548andl(result, 0x0000001f); // tail count (in bytes)3549andl(len, 0xffffffe0); // vector count (in bytes)3550jccb(Assembler::zero, COMPARE_TAIL);35513552lea(ary1, Address(ary1, len, Address::times_1));3553negptr(len);35543555movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector3556movdl(vec2, tmp1);3557vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);35583559bind(COMPARE_WIDE_VECTORS);3560vmovdqu(vec1, Address(ary1, len, Address::times_1));3561vptest(vec1, vec2);3562jccb(Assembler::notZero, TRUE_LABEL);3563addptr(len, 32);3564jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);35653566testl(result, result);3567jccb(Assembler::zero, FALSE_LABEL);35683569vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));3570vptest(vec1, vec2);3571jccb(Assembler::notZero, TRUE_LABEL);3572jmpb(FALSE_LABEL);35733574bind(COMPARE_TAIL); // len is zero3575movl(len, result);3576// Fallthru to tail compare3577} else if (UseSSE42Intrinsics) {3578// With SSE4.2, use double quad vector compare3579Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;35803581// Compare 16-byte vectors3582andl(result, 0x0000000f); // tail count (in bytes)3583andl(len, 0xfffffff0); // vector count (in bytes)3584jcc(Assembler::zero, COMPARE_TAIL);35853586lea(ary1, Address(ary1, len, Address::times_1));3587negptr(len);35883589movl(tmp1, 0x80808080);3590movdl(vec2, tmp1);3591pshufd(vec2, vec2, 0);35923593bind(COMPARE_WIDE_VECTORS);3594movdqu(vec1, Address(ary1, len, Address::times_1));3595ptest(vec1, vec2);3596jcc(Assembler::notZero, TRUE_LABEL);3597addptr(len, 16);3598jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);35993600testl(result, result);3601jcc(Assembler::zero, FALSE_LABEL);36023603movdqu(vec1, Address(ary1, result, Address::times_1, -16));3604ptest(vec1, vec2);3605jccb(Assembler::notZero, TRUE_LABEL);3606jmpb(FALSE_LABEL);36073608bind(COMPARE_TAIL); // len is zero3609movl(len, result);3610// Fallthru to tail compare3611}3612}3613// Compare 4-byte vectors3614andl(len, 0xfffffffc); // vector count (in bytes)3615jccb(Assembler::zero, COMPARE_CHAR);36163617lea(ary1, Address(ary1, len, Address::times_1));3618negptr(len);36193620bind(COMPARE_VECTORS);3621movl(tmp1, Address(ary1, len, Address::times_1));3622andl(tmp1, 0x80808080);3623jccb(Assembler::notZero, TRUE_LABEL);3624addptr(len, 4);3625jcc(Assembler::notZero, COMPARE_VECTORS);36263627// Compare trailing char (final 2 bytes), if any3628bind(COMPARE_CHAR);3629testl(result, 0x2); // tail char3630jccb(Assembler::zero, COMPARE_BYTE);3631load_unsigned_short(tmp1, Address(ary1, 0));3632andl(tmp1, 0x00008080);3633jccb(Assembler::notZero, TRUE_LABEL);3634subptr(result, 2);3635lea(ary1, Address(ary1, 2));36363637bind(COMPARE_BYTE);3638testl(result, 0x1); // tail byte3639jccb(Assembler::zero, FALSE_LABEL);3640load_unsigned_byte(tmp1, Address(ary1, 0));3641andl(tmp1, 0x00000080);3642jccb(Assembler::notEqual, TRUE_LABEL);3643jmpb(FALSE_LABEL);36443645bind(TRUE_LABEL);3646movl(result, 1); // return true3647jmpb(DONE);36483649bind(FALSE_LABEL);3650xorl(result, result); // return false36513652// That's it3653bind(DONE);3654if (UseAVX >= 2 && UseSSE >= 2) {3655// clean upper bits of YMM registers3656vpxor(vec1, vec1);3657vpxor(vec2, vec2);3658}3659}3660// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.3661void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,3662Register limit, Register result, Register chr,3663XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {3664ShortBranchVerifier sbv(this);3665Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;36663667int length_offset = arrayOopDesc::length_offset_in_bytes();3668int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);36693670if (is_array_equ) {3671// Check the input args3672cmpoop(ary1, ary2);3673jcc(Assembler::equal, TRUE_LABEL);36743675// Need additional checks for arrays_equals.3676testptr(ary1, ary1);3677jcc(Assembler::zero, FALSE_LABEL);3678testptr(ary2, ary2);3679jcc(Assembler::zero, FALSE_LABEL);36803681// Check the lengths3682movl(limit, Address(ary1, length_offset));3683cmpl(limit, Address(ary2, length_offset));3684jcc(Assembler::notEqual, FALSE_LABEL);3685}36863687// count == 03688testl(limit, limit);3689jcc(Assembler::zero, TRUE_LABEL);36903691if (is_array_equ) {3692// Load array address3693lea(ary1, Address(ary1, base_offset));3694lea(ary2, Address(ary2, base_offset));3695}36963697if (is_array_equ && is_char) {3698// arrays_equals when used for char[].3699shll(limit, 1); // byte count != 03700}3701movl(result, limit); // copy37023703if (UseAVX >= 2) {3704// With AVX2, use 32-byte vector compare3705Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;37063707// Compare 32-byte vectors3708andl(result, 0x0000001f); // tail count (in bytes)3709andl(limit, 0xffffffe0); // vector count (in bytes)3710jcc(Assembler::zero, COMPARE_TAIL);37113712lea(ary1, Address(ary1, limit, Address::times_1));3713lea(ary2, Address(ary2, limit, Address::times_1));3714negptr(limit);37153716#ifdef _LP643717if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop3718Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;37193720cmpl(limit, -64);3721jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);37223723bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop37243725evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);3726evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);3727kortestql(mask, mask);3728jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare3729addptr(limit, 64); // update since we already compared at this addr3730cmpl(limit, -64);3731jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);37323733// At this point we may still need to compare -limit+result bytes.3734// We could execute the next two instruction and just continue via non-wide path:3735// cmpl(limit, 0);3736// jcc(Assembler::equal, COMPARE_TAIL); // true3737// But since we stopped at the points ary{1,2}+limit which are3738// not farther than 64 bytes from the ends of arrays ary{1,2}+result3739// (|limit| <= 32 and result < 32),3740// we may just compare the last 64 bytes.3741//3742addptr(result, -64); // it is safe, bc we just came from this area3743evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);3744evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);3745kortestql(mask, mask);3746jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare37473748jmp(TRUE_LABEL);37493750bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);37513752}//if (VM_Version::supports_avx512vlbw())3753#endif //_LP643754bind(COMPARE_WIDE_VECTORS);3755vmovdqu(vec1, Address(ary1, limit, Address::times_1));3756vmovdqu(vec2, Address(ary2, limit, Address::times_1));3757vpxor(vec1, vec2);37583759vptest(vec1, vec1);3760jcc(Assembler::notZero, FALSE_LABEL);3761addptr(limit, 32);3762jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);37633764testl(result, result);3765jcc(Assembler::zero, TRUE_LABEL);37663767vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));3768vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));3769vpxor(vec1, vec2);37703771vptest(vec1, vec1);3772jccb(Assembler::notZero, FALSE_LABEL);3773jmpb(TRUE_LABEL);37743775bind(COMPARE_TAIL); // limit is zero3776movl(limit, result);3777// Fallthru to tail compare3778} else if (UseSSE42Intrinsics) {3779// With SSE4.2, use double quad vector compare3780Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;37813782// Compare 16-byte vectors3783andl(result, 0x0000000f); // tail count (in bytes)3784andl(limit, 0xfffffff0); // vector count (in bytes)3785jcc(Assembler::zero, COMPARE_TAIL);37863787lea(ary1, Address(ary1, limit, Address::times_1));3788lea(ary2, Address(ary2, limit, Address::times_1));3789negptr(limit);37903791bind(COMPARE_WIDE_VECTORS);3792movdqu(vec1, Address(ary1, limit, Address::times_1));3793movdqu(vec2, Address(ary2, limit, Address::times_1));3794pxor(vec1, vec2);37953796ptest(vec1, vec1);3797jcc(Assembler::notZero, FALSE_LABEL);3798addptr(limit, 16);3799jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);38003801testl(result, result);3802jcc(Assembler::zero, TRUE_LABEL);38033804movdqu(vec1, Address(ary1, result, Address::times_1, -16));3805movdqu(vec2, Address(ary2, result, Address::times_1, -16));3806pxor(vec1, vec2);38073808ptest(vec1, vec1);3809jccb(Assembler::notZero, FALSE_LABEL);3810jmpb(TRUE_LABEL);38113812bind(COMPARE_TAIL); // limit is zero3813movl(limit, result);3814// Fallthru to tail compare3815}38163817// Compare 4-byte vectors3818andl(limit, 0xfffffffc); // vector count (in bytes)3819jccb(Assembler::zero, COMPARE_CHAR);38203821lea(ary1, Address(ary1, limit, Address::times_1));3822lea(ary2, Address(ary2, limit, Address::times_1));3823negptr(limit);38243825bind(COMPARE_VECTORS);3826movl(chr, Address(ary1, limit, Address::times_1));3827cmpl(chr, Address(ary2, limit, Address::times_1));3828jccb(Assembler::notEqual, FALSE_LABEL);3829addptr(limit, 4);3830jcc(Assembler::notZero, COMPARE_VECTORS);38313832// Compare trailing char (final 2 bytes), if any3833bind(COMPARE_CHAR);3834testl(result, 0x2); // tail char3835jccb(Assembler::zero, COMPARE_BYTE);3836load_unsigned_short(chr, Address(ary1, 0));3837load_unsigned_short(limit, Address(ary2, 0));3838cmpl(chr, limit);3839jccb(Assembler::notEqual, FALSE_LABEL);38403841if (is_array_equ && is_char) {3842bind(COMPARE_BYTE);3843} else {3844lea(ary1, Address(ary1, 2));3845lea(ary2, Address(ary2, 2));38463847bind(COMPARE_BYTE);3848testl(result, 0x1); // tail byte3849jccb(Assembler::zero, TRUE_LABEL);3850load_unsigned_byte(chr, Address(ary1, 0));3851load_unsigned_byte(limit, Address(ary2, 0));3852cmpl(chr, limit);3853jccb(Assembler::notEqual, FALSE_LABEL);3854}3855bind(TRUE_LABEL);3856movl(result, 1); // return true3857jmpb(DONE);38583859bind(FALSE_LABEL);3860xorl(result, result); // return false38613862// That's it3863bind(DONE);3864if (UseAVX >= 2) {3865// clean upper bits of YMM registers3866vpxor(vec1, vec1);3867vpxor(vec2, vec2);3868}3869}38703871#ifdef _LP643872void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,3873Register tmp, KRegister ktmp, int masklen, int vec_enc) {3874assert(VM_Version::supports_avx512vlbw(), "");3875vpxor(xtmp, xtmp, xtmp, vec_enc);3876vpsubb(xtmp, xtmp, mask, vec_enc);3877evpmovb2m(ktmp, xtmp, vec_enc);3878kmovql(tmp, ktmp);3879switch(opc) {3880case Op_VectorMaskTrueCount:3881popcntq(dst, tmp);3882break;3883case Op_VectorMaskLastTrue:3884mov64(dst, -1);3885bsrq(tmp, tmp);3886cmov(Assembler::notZero, dst, tmp);3887break;3888case Op_VectorMaskFirstTrue:3889mov64(dst, masklen);3890bsfq(tmp, tmp);3891cmov(Assembler::notZero, dst, tmp);3892break;3893default: assert(false, "Unhandled mask operation");3894}3895}38963897void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,3898XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {3899assert(VM_Version::supports_avx(), "");3900vpxor(xtmp, xtmp, xtmp, vec_enc);3901vpsubb(xtmp, xtmp, mask, vec_enc);3902vpmovmskb(tmp, xtmp, vec_enc);3903if (masklen < 64) {3904andq(tmp, (((jlong)1 << masklen) - 1));3905}3906switch(opc) {3907case Op_VectorMaskTrueCount:3908popcntq(dst, tmp);3909break;3910case Op_VectorMaskLastTrue:3911mov64(dst, -1);3912bsrq(tmp, tmp);3913cmov(Assembler::notZero, dst, tmp);3914break;3915case Op_VectorMaskFirstTrue:3916mov64(dst, masklen);3917bsfq(tmp, tmp);3918cmov(Assembler::notZero, dst, tmp);3919break;3920default: assert(false, "Unhandled mask operation");3921}3922}3923#endif392439253926