Path: blob/master/dep/vixl/src/aarch64/macro-assembler-sve-aarch64.cc
// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "macro-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

void MacroAssembler::AddSubHelper(AddSubHelperOption option,
                                  const ZRegister& zd,
                                  const ZRegister& zn,
                                  IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  // Simple, encodable cases.
  if (TrySingleAddSub(option, zd, zn, imm)) return;

  VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
  bool add_imm = (option == kAddImmediate);

  // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
  // instruction. Also interpret the immediate as signed, so we can convert
  // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
  IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
  if (signed_imm.IsNegative()) {
    AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
    IntegerOperand n_imm(signed_imm.GetMagnitude());
    // IntegerOperand can represent -INT_MIN, so this is always safe.
    VIXL_ASSERT(n_imm.IsPositiveOrZero());
    if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
  }

  // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
  UseScratchRegisterScope temps(this);
  ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(scratch, imm);

  SingleEmissionCheckScope guard(this);
  if (add_imm) {
    add(zd, zn, scratch);
  } else {
    sub(zd, zn, scratch);
  }
}
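
// A minimal usage sketch for the helper above (illustrative only): it assumes
// a MacroAssembler `masm` reached via the Add/Sub entry points that route
// here, and arbitrary register choices. The exact sequence emitted depends on
// TrySingleAddSub below.
//
//   masm.Add(z0.VnH(), z0.VnH(), 3);       // add z0.h, z0.h, #3
//   masm.Add(z0.VnH(), z0.VnH(), 0xffff);  // 0xffff is -1 in an H lane, so
//                                          // this folds to sub z0.h, z0.h, #1
//   masm.Add(z0.VnH(), z1.VnH(), 0x1234);  // Not encodable as a shifted imm8:
//                                          // falls back to Dup(ztmp, 0x1234)
//                                          // (itself a mov + dup here) and an
//                                          // unpredicated add z0.h, z1.h, ztmp.h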

bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zn);
    switch (option) {
      case kAddImmediate:
        add(zd, zd, imm8, shift);
        return true;
      case kSubImmediate:
        sub(zd, zd, imm8, shift);
        return true;
    }
  }
  return false;
}

void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
                                      SVEArithPredicatedFn reg_macro,
                                      const ZRegister& zd,
                                      const ZRegister& zn,
                                      IntegerOperand imm,
                                      bool is_signed) {
  if (is_signed) {
    // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
    if (imm.IsInt8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsInt8());
      return;
    }
  } else {
    // E.g. UMIN_z_zi, UMAX_z_zi
    if (imm.IsUint8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsUint8());
      return;
    }
  }

  UseScratchRegisterScope temps(this);
  PRegister pg = temps.AcquireGoverningP();
  Ptrue(pg.WithSameLaneSizeAs(zd));

  // Try to re-use zd if we can, so we can avoid a movprfx.
  ZRegister scratch =
      zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
                     : zd;
  Dup(scratch, imm);

  // The vector-form macro for commutative operations will swap the arguments to
  // avoid movprfx, if necessary.
  (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}
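
// Illustrative expansions of the wide-immediate helper above (a sketch only;
// predicate and scratch register numbers are whatever the active pools hand
// out, written here as `pg` and `ztmp`):
//
//   masm.Mul(z0.VnS(), z0.VnS(), 3);    // Fits in int8: mul z0.s, z0.s, #3
//   masm.Mul(z0.VnS(), z0.VnS(), 256);  // Does not fit in int8, so roughly:
//                                       //   Ptrue(pg.VnS())
//                                       //   Dup(ztmp.VnS(), 256)
//                                       //   mul z0.s, pg/m, z0.s, ztmp.s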

void MacroAssembler::Mul(const ZRegister& zd,
                         const ZRegister& zn,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  IntArithImmFn imm_fn = &Assembler::mul;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::smin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::smax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Umax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::umax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Umin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntArithImmFn imm_fn = &Assembler::umin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}
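
// The wrappers above differ only in which immediate form they try first
// (signed for Mul/Smin/Smax, unsigned for Umax/Umin). A sketch of the
// resulting single-instruction cases, with illustrative register choices:
//
//   masm.Smin(z0.VnB(), z0.VnB(), -1);   // smin z0.b, z0.b, #-1
//   masm.Umin(z0.VnB(), z0.VnB(), 255);  // umin z0.b, z0.b, #255
//   masm.Smax(z0.VnH(), z0.VnH(), 400);  // Out of int8 range: falls back to
//                                        // the ptrue/dup sequence above.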

void MacroAssembler::Addpl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);

  // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
  // `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary.
    VIXL_UNIMPLEMENTED();
  }

  // Handling xzr requires an extra move, so defer it until later so we can try
  // to use `rdvl` instead (via `Addvl`).
  if (IsInt6(multiplier) && !xn.IsZero()) {
    SingleEmissionCheckScope guard(this);
    addpl(xd, xn, static_cast<int>(multiplier));
    return;
  }

  // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
  if ((multiplier % kZRegBitsPerPRegBit) == 0) {
    Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
    return;
  }

  if (IsInt6(multiplier)) {
    VIXL_ASSERT(xn.IsZero());  // Other cases were handled with `addpl`.
    // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
    // materialise a zero.
    MacroEmissionCheckScope guard(this);
    movz(xd, 0);
    addpl(xd, xd, static_cast<int>(multiplier));
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (PL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register scratch = temps.AcquireX();
  // There is no `rdpl`, so we have to calculate PL from VL. We can't
  // scale the multiplier because (we already know) it isn't a multiple of 8.
  Rdvl(scratch, multiplier);

  MacroEmissionCheckScope guard(this);
  if (xn.IsZero()) {
    asr(xd, scratch, kZRegBitsPerPRegBitLog2);
  } else if (xd.IsSP() || xn.IsSP()) {
    // TODO: MacroAssembler::Add should be able to handle this.
    asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
    add(xd, xn, scratch);
  } else {
    add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
  }
}

void MacroAssembler::Addvl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(xd.IsX());
  VIXL_ASSERT(xn.IsX());

  // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary. `rdvl` cannot write into `sp`.
    VIXL_UNIMPLEMENTED();
  }

  if (IsInt6(multiplier)) {
    SingleEmissionCheckScope guard(this);
    if (xn.IsZero()) {
      rdvl(xd, static_cast<int>(multiplier));
    } else {
      addvl(xd, xn, static_cast<int>(multiplier));
    }
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (VL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // we try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register a = temps.AcquireX();
  Mov(a, multiplier);

  MacroEmissionCheckScope guard(this);
  Register b = temps.AcquireX();
  rdvl(b, 1);
  if (xn.IsZero()) {
    mul(xd, a, b);
  } else if (xd.IsSP() || xn.IsSP()) {
    mul(a, a, b);
    add(xd, xn, a);
  } else {
    madd(xd, a, b, xn);
  }
}
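
// Illustrative expansions of Addpl/Addvl (a sketch; the scratch register is
// whatever the pool provides, shown here as `xs`). Recall that PL = VL / 8.
//
//   masm.Addvl(x0, x1, 31);   // addvl x0, x1, #31  (multiplier fits in 6 bits)
//   masm.Addvl(x0, xzr, 4);   // rdvl x0, #4
//   masm.Addpl(x0, x1, 40);   // Multiple of 8, so: addvl x0, x1, #5
//   masm.Addpl(x0, xzr, 5);   // addpl cannot take xzr, so:
//                             //   movz x0, #0
//                             //   addpl x0, x0, #5
//   masm.Addpl(x0, x1, 33);   // General case: Rdvl(xs, 33) followed by
//                             //   add x0, x1, xs, asr #3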

void MacroAssembler::CalculateSVEAddress(const Register& xd,
                                         const SVEMemOperand& addr,
                                         int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(!addr.IsScatterGather());
  VIXL_ASSERT(xd.IsX());

  // The lower bound is where a whole Z register is accessed.
  VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
  // The upper bound is for P register accesses, and for instructions like
  // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
  VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));

  SVEOffsetModifier mod = addr.GetOffsetModifier();
  Register base = addr.GetScalarBase();

  if (addr.IsEquivalentToScalar()) {
    // For example:
    //   [x0]
    //   [x0, #0]
    //   [x0, xzr, LSL 2]
    Mov(xd, base);
  } else if (addr.IsScalarPlusImmediate()) {
    // For example:
    //   [x0, #42]
    //   [x0, #42, MUL VL]
    int64_t offset = addr.GetImmediateOffset();
    VIXL_ASSERT(offset != 0);  // Handled by IsEquivalentToScalar.
    if (addr.IsMulVl()) {
      int vl_divisor = 1 << vl_divisor_log2;
      // For all possible values of vl_divisor, we can simply use `Addpl`. This
      // will select `addvl` if necessary.
      VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
      Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
    } else {
      // IsScalarPlusImmediate() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else if (addr.IsScalarPlusScalar()) {
    // For example:
    //   [x0, x1]
    //   [x0, x1, LSL #4]
    Register offset = addr.GetScalarOffset();
    VIXL_ASSERT(!offset.IsZero());  // Handled by IsEquivalentToScalar.
    if (mod == SVE_LSL) {
      Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
    } else {
      // IsScalarPlusScalar() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else {
    // All other forms are scatter-gather addresses, which cannot be evaluated
    // into an X register.
    VIXL_UNREACHABLE();
  }
}

void MacroAssembler::Cpy(const ZRegister& zd,
                         const PRegister& pg,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    cpy(zd, pg, imm8, shift);
    return;
  }

  // The fallbacks rely on `cpy` variants that only support merging predication.
  // If zeroing predication was requested, zero the destination first.
  if (pg.IsZeroing()) {
    SingleEmissionCheckScope guard(this);
    dup(zd, 0);
  }
  PRegisterM pg_m = pg.Merging();

  // Try to encode the immediate using fcpy.
  VIXL_ASSERT(imm.FitsInLane(zd));
  if (zd.GetLaneSizeInBits() >= kHRegSize) {
    double fp_imm = 0.0;
    switch (zd.GetLaneSizeInBits()) {
      case kHRegSize:
        fp_imm =
            FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
        break;
      case
kSRegSize:394fp_imm = RawbitsToFloat(imm.AsUint32());395break;396case kDRegSize:397fp_imm = RawbitsToDouble(imm.AsUint64());398break;399default:400VIXL_UNREACHABLE();401break;402}403// IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so404// we can use IsImmFP64 for all lane sizes.405if (IsImmFP64(fp_imm)) {406SingleEmissionCheckScope guard(this);407fcpy(zd, pg_m, fp_imm);408return;409}410}411412// Fall back to using a scratch register.413UseScratchRegisterScope temps(this);414Register scratch = temps.AcquireRegisterToHoldLane(zd);415Mov(scratch, imm);416417SingleEmissionCheckScope guard(this);418cpy(zd, pg_m, scratch);419}420421// TODO: We implement Fcpy (amongst other things) for all FP types because it422// allows us to preserve user-specified NaNs. We should come up with some423// FPImmediate type to abstract this, and avoid all the duplication below (and424// elsewhere).425426void MacroAssembler::Fcpy(const ZRegister& zd,427const PRegisterM& pg,428double imm) {429VIXL_ASSERT(allow_macro_instructions_);430VIXL_ASSERT(pg.IsMerging());431432if (IsImmFP64(imm)) {433SingleEmissionCheckScope guard(this);434fcpy(zd, pg, imm);435return;436}437438// As a fall-back, cast the immediate to the required lane size, and try to439// encode the bit pattern using `Cpy`.440Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));441}442443void MacroAssembler::Fcpy(const ZRegister& zd,444const PRegisterM& pg,445float imm) {446VIXL_ASSERT(allow_macro_instructions_);447VIXL_ASSERT(pg.IsMerging());448449if (IsImmFP32(imm)) {450SingleEmissionCheckScope guard(this);451fcpy(zd, pg, imm);452return;453}454455// As a fall-back, cast the immediate to the required lane size, and try to456// encode the bit pattern using `Cpy`.457Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));458}459460void MacroAssembler::Fcpy(const ZRegister& zd,461const PRegisterM& pg,462Float16 imm) {463VIXL_ASSERT(allow_macro_instructions_);464VIXL_ASSERT(pg.IsMerging());465466if (IsImmFP16(imm)) {467SingleEmissionCheckScope guard(this);468fcpy(zd, pg, imm);469return;470}471472// As a fall-back, cast the immediate to the required lane size, and try to473// encode the bit pattern using `Cpy`.474Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));475}476477void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {478VIXL_ASSERT(allow_macro_instructions_);479VIXL_ASSERT(imm.FitsInLane(zd));480unsigned lane_size = zd.GetLaneSizeInBits();481int imm8;482int shift;483if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||484imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {485SingleEmissionCheckScope guard(this);486dup(zd, imm8, shift);487} else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {488SingleEmissionCheckScope guard(this);489dupm(zd, imm.AsUintN(lane_size));490} else {491UseScratchRegisterScope temps(this);492Register scratch = temps.AcquireRegisterToHoldLane(zd);493Mov(scratch, imm);494495SingleEmissionCheckScope guard(this);496dup(zd, scratch);497}498}499500void MacroAssembler::NoncommutativeArithmeticHelper(501const ZRegister& zd,502const PRegisterM& pg,503const ZRegister& zn,504const ZRegister& zm,505SVEArithPredicatedFn fn,506SVEArithPredicatedFn rev_fn) {507if (zd.Aliases(zn)) {508// E.g. zd = zd / zm509SingleEmissionCheckScope guard(this);510(this->*fn)(zd, pg, zn, zm);511} else if (zd.Aliases(zm)) {512// E.g. zd = zn / zd513SingleEmissionCheckScope guard(this);514(this->*rev_fn)(zd, pg, zm, zn);515} else {516// E.g. 
zd = zn / zm517MovprfxHelperScope guard(this, zd, pg, zn);518(this->*fn)(zd, pg, zd, zm);519}520}521522void MacroAssembler::FPCommutativeArithmeticHelper(523const ZRegister& zd,524const PRegisterM& pg,525const ZRegister& zn,526const ZRegister& zm,527SVEArithPredicatedFn fn,528FPMacroNaNPropagationOption nan_option) {529ResolveFPNaNPropagationOption(&nan_option);530531if (zd.Aliases(zn)) {532SingleEmissionCheckScope guard(this);533(this->*fn)(zd, pg, zd, zm);534} else if (zd.Aliases(zm)) {535switch (nan_option) {536case FastNaNPropagation: {537// Swap the arguments.538SingleEmissionCheckScope guard(this);539(this->*fn)(zd, pg, zd, zn);540return;541}542case StrictNaNPropagation: {543UseScratchRegisterScope temps(this);544// Use a scratch register to keep the argument order exactly as545// specified.546ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);547{548MovprfxHelperScope guard(this, scratch, pg, zn);549(this->*fn)(scratch, pg, scratch, zm);550}551Mov(zd, scratch);552return;553}554case NoFPMacroNaNPropagationSelected:555VIXL_UNREACHABLE();556return;557}558} else {559MovprfxHelperScope guard(this, zd, pg, zn);560(this->*fn)(zd, pg, zd, zm);561}562}563564// Instructions of the form "inst zda, zn, zm, #num", where they are565// non-commutative and no reversed form is provided.566#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \567V(Cmla, cmla) \568V(Sqrdcmlah, sqrdcmlah)569570#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \571void MacroAssembler::MASMFN(const ZRegister& zd, \572const ZRegister& za, \573const ZRegister& zn, \574const ZRegister& zm, \575int imm) { \576if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \577UseScratchRegisterScope temps(this); \578VIXL_ASSERT(AreSameLaneSize(zn, zm)); \579ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \580Mov(ztmp, zd.Aliases(zn) ? zn : zm); \581MovprfxHelperScope guard(this, zd, za); \582ASMFN(zd, \583(zd.Aliases(zn) ? ztmp : zn), \584(zd.Aliases(zm) ? ztmp : zm), \585imm); \586} else { \587MovprfxHelperScope guard(this, zd, za); \588ASMFN(zd, zn, zm, imm); \589} \590}591VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)592#undef VIXL_DEFINE_MASM_FUNC593594// Instructions of the form "inst zda, zn, zm, #num, #num", where they are595// non-commutative and no reversed form is provided.596#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \597V(Cmla, cmla) \598V(Sqrdcmlah, sqrdcmlah)599600// This doesn't handle zm when it's out of the range that can be encoded in601// instruction. 
The range depends on element size: z0-z7 for H, z0-15 for S.602#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \603void MacroAssembler::MASMFN(const ZRegister& zd, \604const ZRegister& za, \605const ZRegister& zn, \606const ZRegister& zm, \607int index, \608int rot) { \609if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \610UseScratchRegisterScope temps(this); \611ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \612{ \613MovprfxHelperScope guard(this, ztmp, za); \614ASMFN(ztmp, zn, zm, index, rot); \615} \616Mov(zd, ztmp); \617} else { \618MovprfxHelperScope guard(this, zd, za); \619ASMFN(zd, zn, zm, index, rot); \620} \621}622VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)623#undef VIXL_DEFINE_MASM_FUNC624625// Instructions of the form "inst zda, pg, zda, zn", where they are626// non-commutative and no reversed form is provided.627#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \628V(Addp, addp) \629V(Bic, bic) \630V(Faddp, faddp) \631V(Fmaxnmp, fmaxnmp) \632V(Fminnmp, fminnmp) \633V(Fmaxp, fmaxp) \634V(Fminp, fminp) \635V(Fscale, fscale) \636V(Smaxp, smaxp) \637V(Sminp, sminp) \638V(Suqadd, suqadd) \639V(Umaxp, umaxp) \640V(Uminp, uminp) \641V(Usqadd, usqadd)642643#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \644void MacroAssembler::MASMFN(const ZRegister& zd, \645const PRegisterM& pg, \646const ZRegister& zn, \647const ZRegister& zm) { \648VIXL_ASSERT(allow_macro_instructions_); \649if (zd.Aliases(zm) && !zd.Aliases(zn)) { \650UseScratchRegisterScope temps(this); \651ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \652Mov(scratch, zm); \653MovprfxHelperScope guard(this, zd, pg, zn); \654ASMFN(zd, pg, zd, scratch); \655} else { \656MovprfxHelperScope guard(this, zd, pg, zn); \657ASMFN(zd, pg, zd, zm); \658} \659}660VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)661#undef VIXL_DEFINE_MASM_FUNC662663// Instructions of the form "inst zda, pg, zda, zn", where they are664// non-commutative and a reversed form is provided.665#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \666V(Asr, asr) \667V(Fdiv, fdiv) \668V(Fsub, fsub) \669V(Lsl, lsl) \670V(Lsr, lsr) \671V(Sdiv, sdiv) \672V(Shsub, shsub) \673V(Sqrshl, sqrshl) \674V(Sqshl, sqshl) \675V(Sqsub, sqsub) \676V(Srshl, srshl) \677V(Sub, sub) \678V(Udiv, udiv) \679V(Uhsub, uhsub) \680V(Uqrshl, uqrshl) \681V(Uqshl, uqshl) \682V(Uqsub, uqsub) \683V(Urshl, urshl)684685#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \686void MacroAssembler::MASMFN(const ZRegister& zd, \687const PRegisterM& pg, \688const ZRegister& zn, \689const ZRegister& zm) { \690VIXL_ASSERT(allow_macro_instructions_); \691NoncommutativeArithmeticHelper(zd, \692pg, \693zn, \694zm, \695static_cast<SVEArithPredicatedFn>( \696&Assembler::ASMFN), \697static_cast<SVEArithPredicatedFn>( \698&Assembler::ASMFN##r)); \699}700VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)701#undef VIXL_DEFINE_MASM_FUNC702703void MacroAssembler::Fadd(const ZRegister& zd,704const PRegisterM& pg,705const ZRegister& zn,706const ZRegister& zm,707FPMacroNaNPropagationOption nan_option) {708VIXL_ASSERT(allow_macro_instructions_);709FPCommutativeArithmeticHelper(zd,710pg,711zn,712zm,713static_cast<SVEArithPredicatedFn>(714&Assembler::fadd),715nan_option);716}717718void MacroAssembler::Fabd(const ZRegister& zd,719const PRegisterM& pg,720const ZRegister& zn,721const ZRegister& zm,722FPMacroNaNPropagationOption nan_option) 
{723VIXL_ASSERT(allow_macro_instructions_);724FPCommutativeArithmeticHelper(zd,725pg,726zn,727zm,728static_cast<SVEArithPredicatedFn>(729&Assembler::fabd),730nan_option);731}732733void MacroAssembler::Fmul(const ZRegister& zd,734const PRegisterM& pg,735const ZRegister& zn,736const ZRegister& zm,737FPMacroNaNPropagationOption nan_option) {738VIXL_ASSERT(allow_macro_instructions_);739FPCommutativeArithmeticHelper(zd,740pg,741zn,742zm,743static_cast<SVEArithPredicatedFn>(744&Assembler::fmul),745nan_option);746}747748void MacroAssembler::Fmulx(const ZRegister& zd,749const PRegisterM& pg,750const ZRegister& zn,751const ZRegister& zm,752FPMacroNaNPropagationOption nan_option) {753VIXL_ASSERT(allow_macro_instructions_);754FPCommutativeArithmeticHelper(zd,755pg,756zn,757zm,758static_cast<SVEArithPredicatedFn>(759&Assembler::fmulx),760nan_option);761}762763void MacroAssembler::Fmax(const ZRegister& zd,764const PRegisterM& pg,765const ZRegister& zn,766const ZRegister& zm,767FPMacroNaNPropagationOption nan_option) {768VIXL_ASSERT(allow_macro_instructions_);769FPCommutativeArithmeticHelper(zd,770pg,771zn,772zm,773static_cast<SVEArithPredicatedFn>(774&Assembler::fmax),775nan_option);776}777778void MacroAssembler::Fmin(const ZRegister& zd,779const PRegisterM& pg,780const ZRegister& zn,781const ZRegister& zm,782FPMacroNaNPropagationOption nan_option) {783VIXL_ASSERT(allow_macro_instructions_);784FPCommutativeArithmeticHelper(zd,785pg,786zn,787zm,788static_cast<SVEArithPredicatedFn>(789&Assembler::fmin),790nan_option);791}792793void MacroAssembler::Fmaxnm(const ZRegister& zd,794const PRegisterM& pg,795const ZRegister& zn,796const ZRegister& zm,797FPMacroNaNPropagationOption nan_option) {798VIXL_ASSERT(allow_macro_instructions_);799FPCommutativeArithmeticHelper(zd,800pg,801zn,802zm,803static_cast<SVEArithPredicatedFn>(804&Assembler::fmaxnm),805nan_option);806}807808void MacroAssembler::Fminnm(const ZRegister& zd,809const PRegisterM& pg,810const ZRegister& zn,811const ZRegister& zm,812FPMacroNaNPropagationOption nan_option) {813VIXL_ASSERT(allow_macro_instructions_);814FPCommutativeArithmeticHelper(zd,815pg,816zn,817zm,818static_cast<SVEArithPredicatedFn>(819&Assembler::fminnm),820nan_option);821}822823void MacroAssembler::Fdup(const ZRegister& zd, double imm) {824VIXL_ASSERT(allow_macro_instructions_);825826switch (zd.GetLaneSizeInBits()) {827case kHRegSize:828Fdup(zd, Float16(imm));829break;830case kSRegSize:831Fdup(zd, static_cast<float>(imm));832break;833case kDRegSize:834uint64_t bits = DoubleToRawbits(imm);835if (IsImmFP64(bits)) {836SingleEmissionCheckScope guard(this);837fdup(zd, imm);838} else {839Dup(zd, bits);840}841break;842}843}844845void MacroAssembler::Fdup(const ZRegister& zd, float imm) {846VIXL_ASSERT(allow_macro_instructions_);847848switch (zd.GetLaneSizeInBits()) {849case kHRegSize:850Fdup(zd, Float16(imm));851break;852case kSRegSize:853if (IsImmFP32(imm)) {854SingleEmissionCheckScope guard(this);855fdup(zd, imm);856} else {857Dup(zd, FloatToRawbits(imm));858}859break;860case kDRegSize:861Fdup(zd, static_cast<double>(imm));862break;863}864}865866void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {867VIXL_ASSERT(allow_macro_instructions_);868869switch (zd.GetLaneSizeInBits()) {870case kHRegSize:871if (IsImmFP16(imm)) {872SingleEmissionCheckScope guard(this);873fdup(zd, imm);874} else {875Dup(zd, Float16ToRawbits(imm));876}877break;878case kSRegSize:879Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));880break;881case kDRegSize:882Fdup(zd, FPToDouble(imm, 
kIgnoreDefaultNaN));883break;884}885}886887void MacroAssembler::Index(const ZRegister& zd,888const Operand& start,889const Operand& step) {890class IndexOperand : public Operand {891public:892static IndexOperand Prepare(MacroAssembler* masm,893UseScratchRegisterScope* temps,894const Operand& op,895const ZRegister& zd_inner) {896// Look for encodable immediates.897int imm;898if (op.IsImmediate()) {899if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {900return IndexOperand(imm);901}902Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);903masm->Mov(scratch, op);904return IndexOperand(scratch);905} else {906// Plain registers can be encoded directly.907VIXL_ASSERT(op.IsPlainRegister());908return IndexOperand(op.GetRegister());909}910}911912int GetImm5() const {913int64_t imm = GetImmediate();914VIXL_ASSERT(IsInt5(imm));915return static_cast<int>(imm);916}917918private:919explicit IndexOperand(const Register& reg) : Operand(reg) {}920explicit IndexOperand(int64_t imm) : Operand(imm) {}921};922923UseScratchRegisterScope temps(this);924IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);925IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);926927SingleEmissionCheckScope guard(this);928if (start_enc.IsImmediate()) {929if (step_enc.IsImmediate()) {930index(zd, start_enc.GetImm5(), step_enc.GetImm5());931} else {932index(zd, start_enc.GetImm5(), step_enc.GetRegister());933}934} else {935if (step_enc.IsImmediate()) {936index(zd, start_enc.GetRegister(), step_enc.GetImm5());937} else {938index(zd, start_enc.GetRegister(), step_enc.GetRegister());939}940}941}942943void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {944VIXL_ASSERT(allow_macro_instructions_);945VIXL_ASSERT(imm.FitsInLane(zdn));946947if (imm.IsZero()) {948SingleEmissionCheckScope guard(this);949insr(zdn, xzr);950return;951}952953UseScratchRegisterScope temps(this);954Register scratch = temps.AcquireRegisterToHoldLane(zdn);955956// TODO: There are many cases where we could optimise immediates, such as by957// detecting repeating patterns or FP immediates. 
We should optimise and958// abstract this for use in other SVE mov-immediate-like macros.959Mov(scratch, imm);960961SingleEmissionCheckScope guard(this);962insr(zdn, scratch);963}964965void MacroAssembler::Mla(const ZRegister& zd,966const PRegisterM& pg,967const ZRegister& za,968const ZRegister& zn,969const ZRegister& zm) {970VIXL_ASSERT(allow_macro_instructions_);971if (zd.Aliases(za)) {972// zda = zda + (zn * zm)973SingleEmissionCheckScope guard(this);974mla(zd, pg, zn, zm);975} else if (zd.Aliases(zn)) {976// zdn = za + (zdn * zm)977SingleEmissionCheckScope guard(this);978mad(zd, pg, zm, za);979} else if (zd.Aliases(zm)) {980// Multiplication is commutative, so we can swap zn and zm.981// zdm = za + (zdm * zn)982SingleEmissionCheckScope guard(this);983mad(zd, pg, zn, za);984} else {985// zd = za + (zn * zm)986ExactAssemblyScope guard(this, 2 * kInstructionSize);987movprfx(zd, pg, za);988mla(zd, pg, zn, zm);989}990}991992void MacroAssembler::Mls(const ZRegister& zd,993const PRegisterM& pg,994const ZRegister& za,995const ZRegister& zn,996const ZRegister& zm) {997VIXL_ASSERT(allow_macro_instructions_);998if (zd.Aliases(za)) {999// zda = zda - (zn * zm)1000SingleEmissionCheckScope guard(this);1001mls(zd, pg, zn, zm);1002} else if (zd.Aliases(zn)) {1003// zdn = za - (zdn * zm)1004SingleEmissionCheckScope guard(this);1005msb(zd, pg, zm, za);1006} else if (zd.Aliases(zm)) {1007// Multiplication is commutative, so we can swap zn and zm.1008// zdm = za - (zdm * zn)1009SingleEmissionCheckScope guard(this);1010msb(zd, pg, zn, za);1011} else {1012// zd = za - (zn * zm)1013ExactAssemblyScope guard(this, 2 * kInstructionSize);1014movprfx(zd, pg, za);1015mls(zd, pg, zn, zm);1016}1017}10181019void MacroAssembler::CompareHelper(Condition cond,1020const PRegisterWithLaneSize& pd,1021const PRegisterZ& pg,1022const ZRegister& zn,1023IntegerOperand imm) {1024UseScratchRegisterScope temps(this);1025ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());1026Dup(zm, imm);1027SingleEmissionCheckScope guard(this);1028cmp(cond, pd, pg, zn, zm);1029}10301031void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,1032const PRegister& pg,1033const PRegisterWithLaneSize& pn) {1034VIXL_ASSERT(allow_macro_instructions_);1035VIXL_ASSERT(pd.IsLaneSizeB());1036VIXL_ASSERT(pn.IsLaneSizeB());1037if (pd.Is(pn)) {1038SingleEmissionCheckScope guard(this);1039pfirst(pd, pg, pn);1040} else {1041UseScratchRegisterScope temps(this);1042PRegister temp_pg = pg;1043if (pd.Aliases(pg)) {1044temp_pg = temps.AcquireP();1045Mov(temp_pg.VnB(), pg.VnB());1046}1047Mov(pd, pn);1048SingleEmissionCheckScope guard(this);1049pfirst(pd, temp_pg, pd);1050}1051}10521053void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,1054const PRegister& pg,1055const PRegisterWithLaneSize& pn) {1056VIXL_ASSERT(allow_macro_instructions_);1057VIXL_ASSERT(AreSameFormat(pd, pn));1058if (pd.Is(pn)) {1059SingleEmissionCheckScope guard(this);1060pnext(pd, pg, pn);1061} else {1062UseScratchRegisterScope temps(this);1063PRegister temp_pg = pg;1064if (pd.Aliases(pg)) {1065temp_pg = temps.AcquireP();1066Mov(temp_pg.VnB(), pg.VnB());1067}1068Mov(pd.VnB(), pn.VnB());1069SingleEmissionCheckScope guard(this);1070pnext(pd, temp_pg, pd);1071}1072}10731074void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,1075SVEPredicateConstraint pattern,1076FlagsUpdate s) {1077VIXL_ASSERT(allow_macro_instructions_);1078switch (s) {1079case LeaveFlags:1080Ptrue(pd, pattern);1081return;1082case SetFlags:1083Ptrues(pd, 
pattern);1084return;1085}1086VIXL_UNREACHABLE();1087}10881089void MacroAssembler::Sub(const ZRegister& zd,1090IntegerOperand imm,1091const ZRegister& zm) {1092VIXL_ASSERT(allow_macro_instructions_);10931094int imm8;1095int shift = -1;1096if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||1097imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {1098MovprfxHelperScope guard(this, zd, zm);1099subr(zd, zd, imm8, shift);1100} else {1101UseScratchRegisterScope temps(this);1102ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());1103Dup(scratch, imm);11041105SingleEmissionCheckScope guard(this);1106sub(zd, scratch, zm);1107}1108}11091110void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,1111const PRegisterZ& pg,1112const SVEMemOperand& addr,1113SVELoadBroadcastFn fn,1114int divisor) {1115VIXL_ASSERT(addr.IsScalarPlusImmediate());1116int64_t imm = addr.GetImmediateOffset();1117if ((imm % divisor == 0) && IsUint6(imm / divisor)) {1118SingleEmissionCheckScope guard(this);1119(this->*fn)(zt, pg, addr);1120} else {1121UseScratchRegisterScope temps(this);1122Register scratch = temps.AcquireX();1123CalculateSVEAddress(scratch, addr, zt);1124SingleEmissionCheckScope guard(this);1125(this->*fn)(zt, pg, SVEMemOperand(scratch));1126}1127}11281129void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,1130const SVEMemOperand& addr,1131SVELoadStoreFn fn) {1132VIXL_ASSERT(allow_macro_instructions_);1133VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());11341135if (addr.IsPlainScalar() ||1136(addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&1137addr.IsMulVl())) {1138SingleEmissionCheckScope guard(this);1139(this->*fn)(rt, addr);1140return;1141}11421143if (addr.IsEquivalentToScalar()) {1144SingleEmissionCheckScope guard(this);1145(this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));1146return;1147}11481149UseScratchRegisterScope temps(this);1150Register scratch = temps.AcquireX();1151CalculateSVEAddress(scratch, addr, rt);1152SingleEmissionCheckScope guard(this);1153(this->*fn)(rt, SVEMemOperand(scratch));1154}11551156template <typename Tg, typename Tf>1157void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(1158const ZRegister& zt,1159const Tg& pg,1160const SVEMemOperand& addr,1161Tf fn,1162int imm_bits,1163int shift_amount,1164SVEOffsetModifier supported_modifier,1165int vl_divisor_log2) {1166VIXL_ASSERT(allow_macro_instructions_);1167int imm_divisor = 1 << shift_amount;11681169if (addr.IsPlainScalar() ||1170(addr.IsScalarPlusImmediate() &&1171IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&1172((addr.GetImmediateOffset() % imm_divisor) == 0) &&1173(addr.GetOffsetModifier() == supported_modifier))) {1174SingleEmissionCheckScope guard(this);1175(this->*fn)(zt, pg, addr);1176return;1177}11781179if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&1180addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {1181SingleEmissionCheckScope guard(this);1182(this->*fn)(zt, pg, addr);1183return;1184}11851186if (addr.IsEquivalentToScalar()) {1187SingleEmissionCheckScope guard(this);1188(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));1189return;1190}11911192if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&1193(vl_divisor_log2 == -1)) {1194// We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL1195// dependent.1196VIXL_UNIMPLEMENTED();1197}11981199UseScratchRegisterScope temps(this);1200Register scratch = temps.AcquireX();1201CalculateSVEAddress(scratch, addr, 
vl_divisor_log2);1202SingleEmissionCheckScope guard(this);1203(this->*fn)(zt, pg, SVEMemOperand(scratch));1204}12051206template <typename Tg, typename Tf>1207void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,1208const ZRegister& zt,1209const Tg& pg,1210const SVEMemOperand& addr,1211Tf fn) {1212if (addr.IsPlainScalar() ||1213(addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&1214addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||1215(addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&1216addr.IsMulVl())) {1217SingleEmissionCheckScope guard(this);1218(this->*fn)(zt, pg, addr);1219return;1220}12211222if (addr.IsEquivalentToScalar()) {1223SingleEmissionCheckScope guard(this);1224(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));1225return;1226}12271228if (addr.IsVectorPlusImmediate()) {1229uint64_t offset = addr.GetImmediateOffset();1230if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&1231IsUint5(offset >> msize_in_bytes_log2)) {1232SingleEmissionCheckScope guard(this);1233(this->*fn)(zt, pg, addr);1234return;1235}1236}12371238if (addr.IsScalarPlusVector()) {1239VIXL_ASSERT(addr.IsScatterGather());1240SingleEmissionCheckScope guard(this);1241(this->*fn)(zt, pg, addr);1242return;1243}12441245UseScratchRegisterScope temps(this);1246if (addr.IsScatterGather()) {1247// In scatter-gather modes, zt and zn/zm have the same lane size. However,1248// for 32-bit accesses, the result of each lane's address calculation still1249// requires 64 bits; we can't naively use `Adr` for the address calculation1250// because it would truncate each address to 32 bits.12511252if (addr.IsVectorPlusImmediate()) {1253// Synthesise the immediate in an X register, then use a1254// scalar-plus-vector access with the original vector.1255Register scratch = temps.AcquireX();1256Mov(scratch, addr.GetImmediateOffset());1257SingleEmissionCheckScope guard(this);1258SVEOffsetModifier om =1259zt.IsLaneSizeS() ? 
SVE_UXTW : NO_SVE_OFFSET_MODIFIER;1260(this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));1261return;1262}12631264VIXL_UNIMPLEMENTED();1265} else {1266Register scratch = temps.AcquireX();1267// TODO: If we have an immediate offset that is a multiple of1268// msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to1269// save an instruction.1270int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;1271CalculateSVEAddress(scratch, addr, vl_divisor_log2);1272SingleEmissionCheckScope guard(this);1273(this->*fn)(zt, pg, SVEMemOperand(scratch));1274}1275}12761277template <typename Tf>1278void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,1279const ZRegister& zt,1280const PRegisterZ& pg,1281const SVEMemOperand& addr,1282Tf fn) {1283if (addr.IsScatterGather()) {1284// Scatter-gather first-fault loads share encodings with normal loads.1285SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);1286return;1287}12881289// Contiguous first-faulting loads have no scalar-plus-immediate form at all,1290// so we don't do immediate synthesis.12911292// We cannot currently distinguish "[x0]" from "[x0, #0]", and this1293// is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.1294if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&1295addr.IsEquivalentToLSL(msize_in_bytes_log2))) {1296SingleEmissionCheckScope guard(this);1297(this->*fn)(zt, pg, addr);1298return;1299}13001301VIXL_UNIMPLEMENTED();1302}13031304void MacroAssembler::Ld1b(const ZRegister& zt,1305const PRegisterZ& pg,1306const SVEMemOperand& addr) {1307VIXL_ASSERT(allow_macro_instructions_);1308SVELoadStore1Helper(kBRegSizeInBytesLog2,1309zt,1310pg,1311addr,1312static_cast<SVELoad1Fn>(&Assembler::ld1b));1313}13141315void MacroAssembler::Ld1h(const ZRegister& zt,1316const PRegisterZ& pg,1317const SVEMemOperand& addr) {1318VIXL_ASSERT(allow_macro_instructions_);1319SVELoadStore1Helper(kHRegSizeInBytesLog2,1320zt,1321pg,1322addr,1323static_cast<SVELoad1Fn>(&Assembler::ld1h));1324}13251326void MacroAssembler::Ld1w(const ZRegister& zt,1327const PRegisterZ& pg,1328const SVEMemOperand& addr) {1329VIXL_ASSERT(allow_macro_instructions_);1330SVELoadStore1Helper(kWRegSizeInBytesLog2,1331zt,1332pg,1333addr,1334static_cast<SVELoad1Fn>(&Assembler::ld1w));1335}13361337void MacroAssembler::Ld1d(const ZRegister& zt,1338const PRegisterZ& pg,1339const SVEMemOperand& addr) {1340VIXL_ASSERT(allow_macro_instructions_);1341SVELoadStore1Helper(kDRegSizeInBytesLog2,1342zt,1343pg,1344addr,1345static_cast<SVELoad1Fn>(&Assembler::ld1d));1346}13471348void MacroAssembler::Ld1sb(const ZRegister& zt,1349const PRegisterZ& pg,1350const SVEMemOperand& addr) {1351VIXL_ASSERT(allow_macro_instructions_);1352SVELoadStore1Helper(kBRegSizeInBytesLog2,1353zt,1354pg,1355addr,1356static_cast<SVELoad1Fn>(&Assembler::ld1sb));1357}13581359void MacroAssembler::Ld1sh(const ZRegister& zt,1360const PRegisterZ& pg,1361const SVEMemOperand& addr) {1362VIXL_ASSERT(allow_macro_instructions_);1363SVELoadStore1Helper(kHRegSizeInBytesLog2,1364zt,1365pg,1366addr,1367static_cast<SVELoad1Fn>(&Assembler::ld1sh));1368}13691370void MacroAssembler::Ld1sw(const ZRegister& zt,1371const PRegisterZ& pg,1372const SVEMemOperand& addr) {1373VIXL_ASSERT(allow_macro_instructions_);1374SVELoadStore1Helper(kSRegSizeInBytesLog2,1375zt,1376pg,1377addr,1378static_cast<SVELoad1Fn>(&Assembler::ld1sw));1379}13801381void MacroAssembler::St1b(const ZRegister& zt,1382const PRegister& pg,1383const SVEMemOperand& addr) 
{1384VIXL_ASSERT(allow_macro_instructions_);1385SVELoadStore1Helper(kBRegSizeInBytesLog2,1386zt,1387pg,1388addr,1389static_cast<SVEStore1Fn>(&Assembler::st1b));1390}13911392void MacroAssembler::St1h(const ZRegister& zt,1393const PRegister& pg,1394const SVEMemOperand& addr) {1395VIXL_ASSERT(allow_macro_instructions_);1396SVELoadStore1Helper(kHRegSizeInBytesLog2,1397zt,1398pg,1399addr,1400static_cast<SVEStore1Fn>(&Assembler::st1h));1401}14021403void MacroAssembler::St1w(const ZRegister& zt,1404const PRegister& pg,1405const SVEMemOperand& addr) {1406VIXL_ASSERT(allow_macro_instructions_);1407SVELoadStore1Helper(kSRegSizeInBytesLog2,1408zt,1409pg,1410addr,1411static_cast<SVEStore1Fn>(&Assembler::st1w));1412}14131414void MacroAssembler::St1d(const ZRegister& zt,1415const PRegister& pg,1416const SVEMemOperand& addr) {1417VIXL_ASSERT(allow_macro_instructions_);1418SVELoadStore1Helper(kDRegSizeInBytesLog2,1419zt,1420pg,1421addr,1422static_cast<SVEStore1Fn>(&Assembler::st1d));1423}14241425void MacroAssembler::Ldff1b(const ZRegister& zt,1426const PRegisterZ& pg,1427const SVEMemOperand& addr) {1428VIXL_ASSERT(allow_macro_instructions_);1429SVELoadFFHelper(kBRegSizeInBytesLog2,1430zt,1431pg,1432addr,1433static_cast<SVELoad1Fn>(&Assembler::ldff1b));1434}14351436void MacroAssembler::Ldff1h(const ZRegister& zt,1437const PRegisterZ& pg,1438const SVEMemOperand& addr) {1439VIXL_ASSERT(allow_macro_instructions_);1440SVELoadFFHelper(kHRegSizeInBytesLog2,1441zt,1442pg,1443addr,1444static_cast<SVELoad1Fn>(&Assembler::ldff1h));1445}14461447void MacroAssembler::Ldff1w(const ZRegister& zt,1448const PRegisterZ& pg,1449const SVEMemOperand& addr) {1450VIXL_ASSERT(allow_macro_instructions_);1451SVELoadFFHelper(kSRegSizeInBytesLog2,1452zt,1453pg,1454addr,1455static_cast<SVELoad1Fn>(&Assembler::ldff1w));1456}14571458void MacroAssembler::Ldff1d(const ZRegister& zt,1459const PRegisterZ& pg,1460const SVEMemOperand& addr) {1461VIXL_ASSERT(allow_macro_instructions_);1462SVELoadFFHelper(kDRegSizeInBytesLog2,1463zt,1464pg,1465addr,1466static_cast<SVELoad1Fn>(&Assembler::ldff1d));1467}14681469void MacroAssembler::Ldff1sb(const ZRegister& zt,1470const PRegisterZ& pg,1471const SVEMemOperand& addr) {1472VIXL_ASSERT(allow_macro_instructions_);1473SVELoadFFHelper(kBRegSizeInBytesLog2,1474zt,1475pg,1476addr,1477static_cast<SVELoad1Fn>(&Assembler::ldff1sb));1478}14791480void MacroAssembler::Ldff1sh(const ZRegister& zt,1481const PRegisterZ& pg,1482const SVEMemOperand& addr) {1483VIXL_ASSERT(allow_macro_instructions_);1484SVELoadFFHelper(kHRegSizeInBytesLog2,1485zt,1486pg,1487addr,1488static_cast<SVELoad1Fn>(&Assembler::ldff1sh));1489}14901491void MacroAssembler::Ldff1sw(const ZRegister& zt,1492const PRegisterZ& pg,1493const SVEMemOperand& addr) {1494VIXL_ASSERT(allow_macro_instructions_);1495SVELoadFFHelper(kSRegSizeInBytesLog2,1496zt,1497pg,1498addr,1499static_cast<SVELoad1Fn>(&Assembler::ldff1sw));1500}15011502#define VIXL_SVE_LD1R_LIST(V) \1503V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)15041505#define VIXL_DEFINE_MASM_FUNC(SZ, SH) \1506void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \1507const PRegisterZ& pg, \1508const SVEMemOperand& addr) { \1509VIXL_ASSERT(allow_macro_instructions_); \1510SVELoadStoreNTBroadcastQOHelper(zt, \1511pg, \1512addr, \1513&MacroAssembler::ld1r##SZ, \15144, \1515SH, \1516NO_SVE_OFFSET_MODIFIER, \1517-1); \1518}15191520VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)15211522#undef VIXL_DEFINE_MASM_FUNC1523#undef VIXL_SVE_LD1R_LIST15241525void MacroAssembler::Ldnt1b(const ZRegister& 
zt,1526const PRegisterZ& pg,1527const SVEMemOperand& addr) {1528VIXL_ASSERT(allow_macro_instructions_);1529if (addr.IsVectorPlusScalar()) {1530SingleEmissionCheckScope guard(this);1531ldnt1b(zt, pg, addr);1532} else {1533SVELoadStoreNTBroadcastQOHelper(zt,1534pg,1535addr,1536&MacroAssembler::ldnt1b,15374,15380,1539SVE_MUL_VL);1540}1541}15421543void MacroAssembler::Ldnt1d(const ZRegister& zt,1544const PRegisterZ& pg,1545const SVEMemOperand& addr) {1546VIXL_ASSERT(allow_macro_instructions_);1547if (addr.IsVectorPlusScalar()) {1548SingleEmissionCheckScope guard(this);1549ldnt1d(zt, pg, addr);1550} else {1551SVELoadStoreNTBroadcastQOHelper(zt,1552pg,1553addr,1554&MacroAssembler::ldnt1d,15554,15560,1557SVE_MUL_VL);1558}1559}15601561void MacroAssembler::Ldnt1h(const ZRegister& zt,1562const PRegisterZ& pg,1563const SVEMemOperand& addr) {1564VIXL_ASSERT(allow_macro_instructions_);1565if (addr.IsVectorPlusScalar()) {1566SingleEmissionCheckScope guard(this);1567ldnt1h(zt, pg, addr);1568} else {1569SVELoadStoreNTBroadcastQOHelper(zt,1570pg,1571addr,1572&MacroAssembler::ldnt1h,15734,15740,1575SVE_MUL_VL);1576}1577}15781579void MacroAssembler::Ldnt1w(const ZRegister& zt,1580const PRegisterZ& pg,1581const SVEMemOperand& addr) {1582VIXL_ASSERT(allow_macro_instructions_);1583if (addr.IsVectorPlusScalar()) {1584SingleEmissionCheckScope guard(this);1585ldnt1w(zt, pg, addr);1586} else {1587SVELoadStoreNTBroadcastQOHelper(zt,1588pg,1589addr,1590&MacroAssembler::ldnt1w,15914,15920,1593SVE_MUL_VL);1594}1595}15961597void MacroAssembler::Stnt1b(const ZRegister& zt,1598const PRegister& pg,1599const SVEMemOperand& addr) {1600VIXL_ASSERT(allow_macro_instructions_);1601if (addr.IsVectorPlusScalar()) {1602SingleEmissionCheckScope guard(this);1603stnt1b(zt, pg, addr);1604} else {1605SVELoadStoreNTBroadcastQOHelper(zt,1606pg,1607addr,1608&MacroAssembler::stnt1b,16094,16100,1611SVE_MUL_VL);1612}1613}1614void MacroAssembler::Stnt1d(const ZRegister& zt,1615const PRegister& pg,1616const SVEMemOperand& addr) {1617VIXL_ASSERT(allow_macro_instructions_);1618if (addr.IsVectorPlusScalar()) {1619SingleEmissionCheckScope guard(this);1620stnt1d(zt, pg, addr);1621} else {1622SVELoadStoreNTBroadcastQOHelper(zt,1623pg,1624addr,1625&MacroAssembler::stnt1d,16264,16270,1628SVE_MUL_VL);1629}1630}1631void MacroAssembler::Stnt1h(const ZRegister& zt,1632const PRegister& pg,1633const SVEMemOperand& addr) {1634VIXL_ASSERT(allow_macro_instructions_);1635if (addr.IsVectorPlusScalar()) {1636SingleEmissionCheckScope guard(this);1637stnt1h(zt, pg, addr);1638} else {1639SVELoadStoreNTBroadcastQOHelper(zt,1640pg,1641addr,1642&MacroAssembler::stnt1h,16434,16440,1645SVE_MUL_VL);1646}1647}1648void MacroAssembler::Stnt1w(const ZRegister& zt,1649const PRegister& pg,1650const SVEMemOperand& addr) {1651VIXL_ASSERT(allow_macro_instructions_);1652if (addr.IsVectorPlusScalar()) {1653SingleEmissionCheckScope guard(this);1654stnt1w(zt, pg, addr);1655} else {1656SVELoadStoreNTBroadcastQOHelper(zt,1657pg,1658addr,1659&MacroAssembler::stnt1w,16604,16610,1662SVE_MUL_VL);1663}1664}16651666void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,1667const ZRegister& zd,1668const ZRegister& za,1669const ZRegister& zn,1670const ZRegister& zm,1671int index) {1672if (zd.Aliases(za)) {1673// zda = zda + (zn . zm)1674SingleEmissionCheckScope guard(this);1675(this->*fn)(zd, zn, zm, index);16761677} else if (zd.Aliases(zn) || zd.Aliases(zm)) {1678// zdn = za + (zdn . zm[index])1679// zdm = za + (zn . zdm[index])1680// zdnm = za + (zdnm . 
zdnm[index])1681UseScratchRegisterScope temps(this);1682ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);1683{1684MovprfxHelperScope guard(this, scratch, za);1685(this->*fn)(scratch, zn, zm, index);1686}16871688Mov(zd, scratch);1689} else {1690// zd = za + (zn . zm)1691MovprfxHelperScope guard(this, zd, za);1692(this->*fn)(zd, zn, zm, index);1693}1694}16951696void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,1697const ZRegister& zd,1698const ZRegister& za,1699const ZRegister& zn,1700const ZRegister& zm) {1701if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {1702// zd = za . zd . zm1703// zd = za . zn . zd1704// zd = za . zd . zd1705UseScratchRegisterScope temps(this);1706ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);1707{1708MovprfxHelperScope guard(this, scratch, za);1709(this->*fn)(scratch, zn, zm);1710}17111712Mov(zd, scratch);1713} else {1714MovprfxHelperScope guard(this, zd, za);1715(this->*fn)(zd, zn, zm);1716}1717}17181719void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,1720const ZRegister& zd,1721const ZRegister& za,1722const ZRegister& zn,1723const ZRegister& zm) {1724if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {1725// zd = za . zd . zm1726// zd = za . zn . zd1727// zd = za . zd . zd1728UseScratchRegisterScope temps(this);1729ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);1730{1731MovprfxHelperScope guard(this, scratch, za);1732(this->*fn)(scratch, scratch, zn, zm);1733}17341735Mov(zd, scratch);1736} else {1737MovprfxHelperScope guard(this, zd, za);1738(this->*fn)(zd, zd, zn, zm);1739}1740}17411742void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,1743const ZRegister& zd,1744const ZRegister& za,1745const ZRegister& zn,1746const ZRegister& zm,1747int imm) {1748if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {1749// zd = za . zd . zm[i]1750// zd = za . zn . zd[i]1751// zd = za . zd . zd[i]1752UseScratchRegisterScope temps(this);1753ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);1754{1755MovprfxHelperScope guard(this, scratch, za);1756(this->*fn)(scratch, zn, zm, imm);1757}17581759Mov(zd, scratch);1760} else {1761// zd = za . zn . 
zm[i]1762MovprfxHelperScope guard(this, zd, za);1763(this->*fn)(zd, zn, zm, imm);1764}1765}17661767void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,1768const ZRegister& zd,1769const ZRegister& za,1770const ZRegister& zn,1771const ZRegister& zm) {1772if (zn.Aliases(zm)) {1773// If zn == zm, the difference is zero.1774if (!zd.Aliases(za)) {1775Mov(zd, za);1776}1777} else if (zd.Aliases(za)) {1778SingleEmissionCheckScope guard(this);1779(this->*fn)(zd, zn, zm);1780} else if (zd.Aliases(zn)) {1781UseScratchRegisterScope temps(this);1782ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());1783Mov(ztmp, zn);1784MovprfxHelperScope guard(this, zd, za);1785(this->*fn)(zd, ztmp, zm);1786} else if (zd.Aliases(zm)) {1787UseScratchRegisterScope temps(this);1788ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());1789Mov(ztmp, zm);1790MovprfxHelperScope guard(this, zd, za);1791(this->*fn)(zd, zn, ztmp);1792} else {1793MovprfxHelperScope guard(this, zd, za);1794(this->*fn)(zd, zn, zm);1795}1796}17971798#define VIXL_SVE_4REG_LIST(V) \1799V(Saba, saba, AbsoluteDifferenceAccumulate) \1800V(Uaba, uaba, AbsoluteDifferenceAccumulate) \1801V(Sabalb, sabalb, AbsoluteDifferenceAccumulate) \1802V(Sabalt, sabalt, AbsoluteDifferenceAccumulate) \1803V(Uabalb, uabalb, AbsoluteDifferenceAccumulate) \1804V(Uabalt, uabalt, AbsoluteDifferenceAccumulate) \1805V(Sdot, sdot, FourRegDestructiveHelper) \1806V(Udot, udot, FourRegDestructiveHelper) \1807V(Adclb, adclb, FourRegDestructiveHelper) \1808V(Adclt, adclt, FourRegDestructiveHelper) \1809V(Sbclb, sbclb, FourRegDestructiveHelper) \1810V(Sbclt, sbclt, FourRegDestructiveHelper) \1811V(Smlalb, smlalb, FourRegDestructiveHelper) \1812V(Smlalt, smlalt, FourRegDestructiveHelper) \1813V(Smlslb, smlslb, FourRegDestructiveHelper) \1814V(Smlslt, smlslt, FourRegDestructiveHelper) \1815V(Umlalb, umlalb, FourRegDestructiveHelper) \1816V(Umlalt, umlalt, FourRegDestructiveHelper) \1817V(Umlslb, umlslb, FourRegDestructiveHelper) \1818V(Umlslt, umlslt, FourRegDestructiveHelper) \1819V(Bcax, bcax, FourRegDestructiveHelper) \1820V(Bsl, bsl, FourRegDestructiveHelper) \1821V(Bsl1n, bsl1n, FourRegDestructiveHelper) \1822V(Bsl2n, bsl2n, FourRegDestructiveHelper) \1823V(Eor3, eor3, FourRegDestructiveHelper) \1824V(Nbsl, nbsl, FourRegDestructiveHelper) \1825V(Fmlalb, fmlalb, FourRegDestructiveHelper) \1826V(Fmlalt, fmlalt, FourRegDestructiveHelper) \1827V(Fmlslb, fmlslb, FourRegDestructiveHelper) \1828V(Fmlslt, fmlslt, FourRegDestructiveHelper) \1829V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper) \1830V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \1831V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper) \1832V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper) \1833V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \1834V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper) \1835V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper) \1836V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper) \1837V(Fmmla, fmmla, FourRegDestructiveHelper) \1838V(Smmla, smmla, FourRegDestructiveHelper) \1839V(Ummla, ummla, FourRegDestructiveHelper) \1840V(Usmmla, usmmla, FourRegDestructiveHelper) \1841V(Usdot, usdot, FourRegDestructiveHelper)18421843#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \1844void MacroAssembler::MASMFN(const ZRegister& zd, \1845const ZRegister& za, \1846const ZRegister& zn, \1847const ZRegister& zm) { \1848VIXL_ASSERT(allow_macro_instructions_); \1849HELPER(&Assembler::ASMFN, zd, za, zn, zm); \1850}1851VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)1852#undef 
VIXL_DEFINE_MASM_FUNC18531854#define VIXL_SVE_4REG_1IMM_LIST(V) \1855V(Fmla, fmla, FourRegOneImmDestructiveHelper) \1856V(Fmls, fmls, FourRegOneImmDestructiveHelper) \1857V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper) \1858V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper) \1859V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper) \1860V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper) \1861V(Mla, mla, FourRegOneImmDestructiveHelper) \1862V(Mls, mls, FourRegOneImmDestructiveHelper) \1863V(Smlalb, smlalb, FourRegOneImmDestructiveHelper) \1864V(Smlalt, smlalt, FourRegOneImmDestructiveHelper) \1865V(Smlslb, smlslb, FourRegOneImmDestructiveHelper) \1866V(Smlslt, smlslt, FourRegOneImmDestructiveHelper) \1867V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \1868V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \1869V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \1870V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \1871V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \1872V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \1873V(Umlalb, umlalb, FourRegOneImmDestructiveHelper) \1874V(Umlalt, umlalt, FourRegOneImmDestructiveHelper) \1875V(Umlslb, umlslb, FourRegOneImmDestructiveHelper) \1876V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)18771878#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \1879void MacroAssembler::MASMFN(const ZRegister& zd, \1880const ZRegister& za, \1881const ZRegister& zn, \1882const ZRegister& zm, \1883int imm) { \1884VIXL_ASSERT(allow_macro_instructions_); \1885HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm); \1886}1887VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)1888#undef VIXL_DEFINE_MASM_FUNC18891890void MacroAssembler::Sdot(const ZRegister& zd,1891const ZRegister& za,1892const ZRegister& zn,1893const ZRegister& zm,1894int index) {1895VIXL_ASSERT(allow_macro_instructions_);1896SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);1897}18981899void MacroAssembler::Udot(const ZRegister& zd,1900const ZRegister& za,1901const ZRegister& zn,1902const ZRegister& zm,1903int index) {1904VIXL_ASSERT(allow_macro_instructions_);1905SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);1906}19071908void MacroAssembler::Sudot(const ZRegister& zd,1909const ZRegister& za,1910const ZRegister& zn,1911const ZRegister& zm,1912int index) {1913VIXL_ASSERT(allow_macro_instructions_);1914SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);1915}19161917void MacroAssembler::Usdot(const ZRegister& zd,1918const ZRegister& za,1919const ZRegister& zn,1920const ZRegister& zm,1921int index) {1922VIXL_ASSERT(allow_macro_instructions_);1923SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);1924}19251926void MacroAssembler::Cdot(const ZRegister& zd,1927const ZRegister& za,1928const ZRegister& zn,1929const ZRegister& zm,1930int index,1931int rot) {1932// This doesn't handle zm when it's out of the range that can be encoded in1933// instruction. 

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index,
                          int rot) {
  // This doesn't handle zm when it's out of the range that can be encoded in
  // the instruction. The range depends on the element size: z0-z7 for B,
  // z0-z15 for H.
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      cdot(ztmp, zn, zm, index, rot);
    }
    Mov(zd, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, index, rot);
  }
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    VIXL_ASSERT(AreSameLaneSize(zn, zm));
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zd.Aliases(zn) ? zn : zm);
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, rot);
  }
}

void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}
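
// FPMulAddHelper (above) selects between the accumulator forms (fmla, fmls,
// fnmla, fnmls) and the multiplicand forms (fmad, fmsb, fnmad, fnmsb)
// according to which operand zd aliases. For example, when zd aliases zn,
// Fmla emits a single "fmad zd, pg/m, zm, za" rather than movprfx + fmla;
// only the case where zd aliases zm under strict NaN propagation needs a
// scratch register and an extra mov.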
void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

void MacroAssembler::Fcmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      fcmla(ztmp, pg, zn, zm, rot);
    }
    Mov(zd, pg, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, pg, za);
    fcmla(zd, pg, zn, zm, rot);
  }
}
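
// Splice: SVE2 adds a constructive form of SPLICE, but it requires zn and zm
// to be consecutive registers; otherwise the destructive form is used below,
// with movprfx or a scratch register preserving the inputs.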
void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    splice(zd, pg, zn, zm);
  } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
                                          const ZRegister& zd,
                                          const ZRegister& za,
                                          const ZRegister& zn,
                                          int shift) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(za) && zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zn);
    {
      MovprfxHelperScope guard(this, zd, za);
      (this->*fn)(zd, ztmp, shift);
    }
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, shift);
  }
}

void MacroAssembler::Srsra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}

void MacroAssembler::Ssra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}

void MacroAssembler::Ursra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}

void MacroAssembler::Usra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}

void MacroAssembler::ComplexAddition(ZZZImmFn fn,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     const ZRegister& zm,
                                     int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(zn) && zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(ztmp, zm);
    {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*fn)(zd, zd, ztmp, rot);
    }
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    (this->*fn)(zd, zd, zm, rot);
  }
}

void MacroAssembler::Cadd(const ZRegister& zd,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}

void MacroAssembler::Sqcadd(const ZRegister& zd,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            int rot) {
  ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}

}  // namespace aarch64
}  // namespace vixl