1
// Copyright 2019, VIXL authors
2
// All rights reserved.
3
//
4
// Redistribution and use in source and binary forms, with or without
5
// modification, are permitted provided that the following conditions are met:
6
//
7
// * Redistributions of source code must retain the above copyright notice,
8
// this list of conditions and the following disclaimer.
9
// * Redistributions in binary form must reproduce the above copyright notice,
10
// this list of conditions and the following disclaimer in the documentation
11
// and/or other materials provided with the distribution.
12
// * Neither the name of ARM Limited nor the names of its contributors may be
13
// used to endorse or promote products derived from this software without
14
// specific prior written permission.
15
//
16
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27
#include "macro-assembler-aarch64.h"
28
29
namespace vixl {
30
namespace aarch64 {
31
32
void MacroAssembler::AddSubHelper(AddSubHelperOption option,
33
const ZRegister& zd,
34
const ZRegister& zn,
35
IntegerOperand imm) {
36
VIXL_ASSERT(imm.FitsInLane(zd));
37
38
// Simple, encodable cases.
39
if (TrySingleAddSub(option, zd, zn, imm)) return;
40
41
VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
42
bool add_imm = (option == kAddImmediate);
43
44
// Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
45
// instruction. Also interpret the immediate as signed, so we can convert
46
// Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
47
IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
48
if (signed_imm.IsNegative()) {
49
AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
50
IntegerOperand n_imm(signed_imm.GetMagnitude());
51
// IntegerOperand can represent -INT_MIN, so this is always safe.
52
VIXL_ASSERT(n_imm.IsPositiveOrZero());
53
if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
54
}
55
56
// Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
57
UseScratchRegisterScope temps(this);
58
ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
59
Dup(scratch, imm);
60
61
SingleEmissionCheckScope guard(this);
62
if (add_imm) {
63
add(zd, zn, scratch);
64
} else {
65
sub(zd, zn, scratch);
66
}
67
}
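// Editorial usage sketch (not part of the upstream VIXL sources; `masm`, z0
// and z1 are arbitrary example names): how the helper above is reached via
// the public unpredicated Add macro.
//
//   // Encodable as a shifted 8-bit immediate: a single `add` (plus a movprfx
//   // if zd and zn differ).
//   masm.Add(z0.VnS(), z1.VnS(), 42);
//   // Not encodable in one instruction: falls back to the dup-into-scratch
//   // plus vector add/sub path implemented above.
//   masm.Add(z0.VnS(), z1.VnS(), 0x12345);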
68
69
bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
70
const ZRegister& zd,
71
const ZRegister& zn,
72
IntegerOperand imm) {
73
VIXL_ASSERT(imm.FitsInLane(zd));
74
75
int imm8;
76
int shift = -1;
77
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
78
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
79
MovprfxHelperScope guard(this, zd, zn);
80
switch (option) {
81
case kAddImmediate:
82
add(zd, zd, imm8, shift);
83
return true;
84
case kSubImmediate:
85
sub(zd, zd, imm8, shift);
86
return true;
87
}
88
}
89
return false;
90
}
91
92
void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
93
SVEArithPredicatedFn reg_macro,
94
const ZRegister& zd,
95
const ZRegister& zn,
96
IntegerOperand imm,
97
bool is_signed) {
98
if (is_signed) {
99
// E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
100
if (imm.IsInt8()) {
101
MovprfxHelperScope guard(this, zd, zn);
102
(this->*imm_fn)(zd, zd, imm.AsInt8());
103
return;
104
}
105
} else {
106
// E.g. UMIN_z_zi, UMAX_z_zi
107
if (imm.IsUint8()) {
108
MovprfxHelperScope guard(this, zd, zn);
109
(this->*imm_fn)(zd, zd, imm.AsUint8());
110
return;
111
}
112
}
113
114
UseScratchRegisterScope temps(this);
115
PRegister pg = temps.AcquireGoverningP();
116
Ptrue(pg.WithSameLaneSizeAs(zd));
117
118
// Try to re-use zd if we can, so we can avoid a movprfx.
119
ZRegister scratch =
120
zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
121
: zd;
122
Dup(scratch, imm);
123
124
// The vector-form macro for commutative operations will swap the arguments to
125
// avoid movprfx, if necessary.
126
(this->*reg_macro)(zd, pg.Merging(), zn, scratch);
127
}
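// Editorial usage sketch (illustrative only; register choices are arbitrary):
// the immediate paths taken by macros that route through IntWideImmHelper,
// such as Mul below.
//
//   // Fits in a signed 8-bit immediate: movprfx (if needed) + `mul ..., #3`.
//   masm.Mul(z0.VnH(), z1.VnH(), 3);
//   // Too wide for the immediate form: Ptrue + Dup into a scratch Z register,
//   // then the predicated vector-form macro.
//   masm.Mul(z0.VnH(), z1.VnH(), 300);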
128
129
void MacroAssembler::Mul(const ZRegister& zd,
130
const ZRegister& zn,
131
IntegerOperand imm) {
132
VIXL_ASSERT(allow_macro_instructions_);
133
IntArithImmFn imm_fn = &Assembler::mul;
134
SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
135
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
136
}
137
138
void MacroAssembler::Smin(const ZRegister& zd,
139
const ZRegister& zn,
140
IntegerOperand imm) {
141
VIXL_ASSERT(allow_macro_instructions_);
142
VIXL_ASSERT(imm.FitsInSignedLane(zd));
143
IntArithImmFn imm_fn = &Assembler::smin;
144
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
145
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
146
}
147
148
void MacroAssembler::Smax(const ZRegister& zd,
149
const ZRegister& zn,
150
IntegerOperand imm) {
151
VIXL_ASSERT(allow_macro_instructions_);
152
VIXL_ASSERT(imm.FitsInSignedLane(zd));
153
IntArithImmFn imm_fn = &Assembler::smax;
154
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
155
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
156
}
157
158
void MacroAssembler::Umax(const ZRegister& zd,
159
const ZRegister& zn,
160
IntegerOperand imm) {
161
VIXL_ASSERT(allow_macro_instructions_);
162
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
163
IntArithImmFn imm_fn = &Assembler::umax;
164
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
165
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
166
}
167
168
void MacroAssembler::Umin(const ZRegister& zd,
169
const ZRegister& zn,
170
IntegerOperand imm) {
171
VIXL_ASSERT(allow_macro_instructions_);
172
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
173
IntArithImmFn imm_fn = &Assembler::umin;
174
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
175
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
176
}
177
178
void MacroAssembler::Addpl(const Register& xd,
179
const Register& xn,
180
int64_t multiplier) {
181
VIXL_ASSERT(allow_macro_instructions_);
182
183
// This macro relies on `Rdvl` to handle some out-of-range cases. Check that
184
// `VL * multiplier` cannot overflow, for any possible value of VL.
185
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
186
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
187
188
if (xd.IsZero()) return;
189
if (xn.IsZero() && xd.IsSP()) {
190
// TODO: This operation doesn't make much sense, but we could support it
191
// with a scratch register if necessary.
192
VIXL_UNIMPLEMENTED();
193
}
194
195
// Handling xzr requires an extra move, so defer it until later; first try
196
// to use `rdvl` instead (via `Addvl`).
197
if (IsInt6(multiplier) && !xn.IsZero()) {
198
SingleEmissionCheckScope guard(this);
199
addpl(xd, xn, static_cast<int>(multiplier));
200
return;
201
}
202
203
// If `multiplier` is a multiple of 8, we can use `Addvl` instead.
204
if ((multiplier % kZRegBitsPerPRegBit) == 0) {
205
Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
206
return;
207
}
208
209
if (IsInt6(multiplier)) {
210
VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`.
211
// There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
212
// materialise a zero.
213
MacroEmissionCheckScope guard(this);
214
movz(xd, 0);
215
addpl(xd, xd, static_cast<int>(multiplier));
216
return;
217
}
218
219
// TODO: Some probable cases result in rather long sequences. For example,
220
// `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
221
// outside the encodable range. We should look for ways to cover such cases
222
// without drastically increasing the complexity of this logic.
223
224
// For other cases, calculate xn + (PL * multiplier) using discrete
225
// instructions. This requires two scratch registers in the general case, so
226
// try to re-use the destination as a scratch register.
227
UseScratchRegisterScope temps(this);
228
temps.Include(xd);
229
temps.Exclude(xn);
230
231
Register scratch = temps.AcquireX();
232
// There is no `rdpl`, so we have to calculate PL from VL. We can't
233
// scale the multiplier because (we already know) it isn't a multiple of 8.
234
Rdvl(scratch, multiplier);
235
236
MacroEmissionCheckScope guard(this);
237
if (xn.IsZero()) {
238
asr(xd, scratch, kZRegBitsPerPRegBitLog2);
239
} else if (xd.IsSP() || xn.IsSP()) {
240
// TODO: MacroAssembler::Add should be able to handle this.
241
asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
242
add(xd, xn, scratch);
243
} else {
244
add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
245
}
246
}
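// Editorial usage sketch (illustrative only): how the multiplier determines
// the sequence emitted by Addpl above.
//
//   masm.Addpl(x2, x1, 3);    // Encodable: a single `addpl`.
//   masm.Addpl(x2, x1, 40);   // Multiple of 8: delegated to Addvl(x2, x1, 5).
//   masm.Addpl(x2, x1, 33);   // Neither: Rdvl into a scratch register, then
//                             // an add with an ASR-shifted operand (see the
//                             // TODO above about sequence length).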
247
248
void MacroAssembler::Addvl(const Register& xd,
249
const Register& xn,
250
int64_t multiplier) {
251
VIXL_ASSERT(allow_macro_instructions_);
252
VIXL_ASSERT(xd.IsX());
253
VIXL_ASSERT(xn.IsX());
254
255
// Check that `VL * multiplier` cannot overflow, for any possible value of VL.
256
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
257
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
258
259
if (xd.IsZero()) return;
260
if (xn.IsZero() && xd.IsSP()) {
261
// TODO: This operation doesn't make much sense, but we could support it
262
// with a scratch register if necessary. `rdvl` cannot write into `sp`.
263
VIXL_UNIMPLEMENTED();
264
}
265
266
if (IsInt6(multiplier)) {
267
SingleEmissionCheckScope guard(this);
268
if (xn.IsZero()) {
269
rdvl(xd, static_cast<int>(multiplier));
270
} else {
271
addvl(xd, xn, static_cast<int>(multiplier));
272
}
273
return;
274
}
275
276
// TODO: Some probable cases result in rather long sequences. For example,
277
// `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
278
// outside the encodable range. We should look for ways to cover such cases
279
// without drastically increasing the complexity of this logic.
280
281
// For other cases, calculate xn + (VL * multiplier) using discrete
282
// instructions. This requires two scratch registers in the general case, so
283
// we try to re-use the destination as a scratch register.
284
UseScratchRegisterScope temps(this);
285
temps.Include(xd);
286
temps.Exclude(xn);
287
288
Register a = temps.AcquireX();
289
Mov(a, multiplier);
290
291
MacroEmissionCheckScope guard(this);
292
Register b = temps.AcquireX();
293
rdvl(b, 1);
294
if (xn.IsZero()) {
295
mul(xd, a, b);
296
} else if (xd.IsSP() || xn.IsSP()) {
297
mul(a, a, b);
298
add(xd, xn, a);
299
} else {
300
madd(xd, a, b, xn);
301
}
302
}
303
304
void MacroAssembler::CalculateSVEAddress(const Register& xd,
305
const SVEMemOperand& addr,
306
int vl_divisor_log2) {
307
VIXL_ASSERT(allow_macro_instructions_);
308
VIXL_ASSERT(!addr.IsScatterGather());
309
VIXL_ASSERT(xd.IsX());
310
311
// The lower bound is where a whole Z register is accessed.
312
VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
313
// The upper bound is for P register accesses, and for instructions like
314
// "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
315
VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
316
317
SVEOffsetModifier mod = addr.GetOffsetModifier();
318
Register base = addr.GetScalarBase();
319
320
if (addr.IsEquivalentToScalar()) {
321
// For example:
322
// [x0]
323
// [x0, #0]
324
// [x0, xzr, LSL 2]
325
Mov(xd, base);
326
} else if (addr.IsScalarPlusImmediate()) {
327
// For example:
328
// [x0, #42]
329
// [x0, #42, MUL VL]
330
int64_t offset = addr.GetImmediateOffset();
331
VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar.
332
if (addr.IsMulVl()) {
333
int vl_divisor = 1 << vl_divisor_log2;
334
// For all possible values of vl_divisor, we can simply use `Addpl`. This
335
// will select `addvl` if necessary.
336
VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
337
Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
338
} else {
339
// IsScalarPlusImmediate() ensures that no other modifiers can occur.
340
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
341
Add(xd, base, offset);
342
}
343
} else if (addr.IsScalarPlusScalar()) {
344
// For example:
345
// [x0, x1]
346
// [x0, x1, LSL #4]
347
Register offset = addr.GetScalarOffset();
348
VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar.
349
if (mod == SVE_LSL) {
350
Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
351
} else {
352
// IsScalarPlusScalar() ensures that no other modifiers can occur.
353
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
354
Add(xd, base, offset);
355
}
356
} else {
357
// All other forms are scatter-gather addresses, which cannot be evaluated
358
// into an X register.
359
VIXL_UNREACHABLE();
360
}
361
}
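// Editorial note (illustrative only): for a scalar-plus-immediate MUL VL
// operand, the offset is rescaled from VL units to PL units before calling
// Addpl. For example, with vl_divisor_log2 == 0 (a whole-Z access),
// [x1, #3, MUL VL] becomes Addpl(xd, x1, 3 * 8), which Addpl in turn emits as
// a single `addvl`. With vl_divisor_log2 == 3 (one byte per D lane, as in
// "st1b { z0.d }"), the same operand becomes Addpl(xd, x1, 3 * 1).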
362
363
void MacroAssembler::Cpy(const ZRegister& zd,
364
const PRegister& pg,
365
IntegerOperand imm) {
366
VIXL_ASSERT(allow_macro_instructions_);
367
VIXL_ASSERT(imm.FitsInLane(zd));
368
int imm8;
369
int shift;
370
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
371
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
372
SingleEmissionCheckScope guard(this);
373
cpy(zd, pg, imm8, shift);
374
return;
375
}
376
377
// The fallbacks rely on `cpy` variants that only support merging predication.
378
// If zeroing predication was requested, zero the destination first.
379
if (pg.IsZeroing()) {
380
SingleEmissionCheckScope guard(this);
381
dup(zd, 0);
382
}
383
PRegisterM pg_m = pg.Merging();
384
385
// Try to encode the immediate using fcpy.
386
VIXL_ASSERT(imm.FitsInLane(zd));
387
if (zd.GetLaneSizeInBits() >= kHRegSize) {
388
double fp_imm = 0.0;
389
switch (zd.GetLaneSizeInBits()) {
390
case kHRegSize:
391
fp_imm =
392
FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
393
break;
394
case kSRegSize:
395
fp_imm = RawbitsToFloat(imm.AsUint32());
396
break;
397
case kDRegSize:
398
fp_imm = RawbitsToDouble(imm.AsUint64());
399
break;
400
default:
401
VIXL_UNREACHABLE();
402
break;
403
}
404
// IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
405
// we can use IsImmFP64 for all lane sizes.
406
if (IsImmFP64(fp_imm)) {
407
SingleEmissionCheckScope guard(this);
408
fcpy(zd, pg_m, fp_imm);
409
return;
410
}
411
}
412
413
// Fall back to using a scratch register.
414
UseScratchRegisterScope temps(this);
415
Register scratch = temps.AcquireRegisterToHoldLane(zd);
416
Mov(scratch, imm);
417
418
SingleEmissionCheckScope guard(this);
419
cpy(zd, pg_m, scratch);
420
}
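// Editorial usage sketch (illustrative only; p1 and z0 are arbitrary
// examples): the fallback paths of the Cpy macro above.
//
//   // 0x3ff0000000000000 is the bit pattern of 1.0, which is FP-encodable,
//   // so this becomes dup #0 (to honour zeroing) followed by `fcpy`.
//   masm.Cpy(z0.VnD(), p1.Zeroing(), INT64_C(0x3ff0000000000000));
//   // An arbitrary 64-bit pattern: materialised in a scalar scratch register
//   // with Mov, then copied in with `cpy`.
//   masm.Cpy(z0.VnD(), p1.Merging(), INT64_C(0x123456789abc));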
421
422
// TODO: We implement Fcpy (amongst other things) for all FP types because it
423
// allows us to preserve user-specified NaNs. We should come up with some
424
// FPImmediate type to abstract this, and avoid all the duplication below (and
425
// elsewhere).
426
427
void MacroAssembler::Fcpy(const ZRegister& zd,
428
const PRegisterM& pg,
429
double imm) {
430
VIXL_ASSERT(allow_macro_instructions_);
431
VIXL_ASSERT(pg.IsMerging());
432
433
if (IsImmFP64(imm)) {
434
SingleEmissionCheckScope guard(this);
435
fcpy(zd, pg, imm);
436
return;
437
}
438
439
// As a fall-back, cast the immediate to the required lane size, and try to
440
// encode the bit pattern using `Cpy`.
441
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
442
}
443
444
void MacroAssembler::Fcpy(const ZRegister& zd,
445
const PRegisterM& pg,
446
float imm) {
447
VIXL_ASSERT(allow_macro_instructions_);
448
VIXL_ASSERT(pg.IsMerging());
449
450
if (IsImmFP32(imm)) {
451
SingleEmissionCheckScope guard(this);
452
fcpy(zd, pg, imm);
453
return;
454
}
455
456
// As a fall-back, cast the immediate to the required lane size, and try to
457
// encode the bit pattern using `Cpy`.
458
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
459
}
460
461
void MacroAssembler::Fcpy(const ZRegister& zd,
462
const PRegisterM& pg,
463
Float16 imm) {
464
VIXL_ASSERT(allow_macro_instructions_);
465
VIXL_ASSERT(pg.IsMerging());
466
467
if (IsImmFP16(imm)) {
468
SingleEmissionCheckScope guard(this);
469
fcpy(zd, pg, imm);
470
return;
471
}
472
473
// As a fall-back, cast the immediate to the required lane size, and try to
474
// encode the bit pattern using `Cpy`.
475
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
476
}
477
478
void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
479
VIXL_ASSERT(allow_macro_instructions_);
480
VIXL_ASSERT(imm.FitsInLane(zd));
481
unsigned lane_size = zd.GetLaneSizeInBits();
482
int imm8;
483
int shift;
484
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
485
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
486
SingleEmissionCheckScope guard(this);
487
dup(zd, imm8, shift);
488
} else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
489
SingleEmissionCheckScope guard(this);
490
dupm(zd, imm.AsUintN(lane_size));
491
} else {
492
UseScratchRegisterScope temps(this);
493
Register scratch = temps.AcquireRegisterToHoldLane(zd);
494
Mov(scratch, imm);
495
496
SingleEmissionCheckScope guard(this);
497
dup(zd, scratch);
498
}
499
}
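// Editorial usage sketch (illustrative only): the three encoding tiers tried
// by the Dup macro above.
//
//   masm.Dup(z0.VnS(), 0x7f00);      // Shifted 8-bit immediate: `dup`.
//   masm.Dup(z0.VnS(), 0x00ff00ff);  // Bitmask immediate: `dupm`.
//   masm.Dup(z0.VnS(), 0x12345678);  // Neither: Mov to a scalar scratch
//                                    // register, then the scalar-form `dup`.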
500
501
void MacroAssembler::NoncommutativeArithmeticHelper(
502
const ZRegister& zd,
503
const PRegisterM& pg,
504
const ZRegister& zn,
505
const ZRegister& zm,
506
SVEArithPredicatedFn fn,
507
SVEArithPredicatedFn rev_fn) {
508
if (zd.Aliases(zn)) {
509
// E.g. zd = zd / zm
510
SingleEmissionCheckScope guard(this);
511
(this->*fn)(zd, pg, zn, zm);
512
} else if (zd.Aliases(zm)) {
513
// E.g. zd = zn / zd
514
SingleEmissionCheckScope guard(this);
515
(this->*rev_fn)(zd, pg, zm, zn);
516
} else {
517
// E.g. zd = zn / zm
518
MovprfxHelperScope guard(this, zd, pg, zn);
519
(this->*fn)(zd, pg, zd, zm);
520
}
521
}
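// Editorial usage sketch (illustrative only): how the predicated macros that
// route through this helper (for example Fsub, defined further below) pick
// between the plain and reversed instruction forms.
//
//   masm.Fsub(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS());  // fsub  (zd==zn)
//   masm.Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS());  // fsubr (zd==zm)
//   masm.Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS());  // movprfx + fsub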
522
523
void MacroAssembler::FPCommutativeArithmeticHelper(
524
const ZRegister& zd,
525
const PRegisterM& pg,
526
const ZRegister& zn,
527
const ZRegister& zm,
528
SVEArithPredicatedFn fn,
529
FPMacroNaNPropagationOption nan_option) {
530
ResolveFPNaNPropagationOption(&nan_option);
531
532
if (zd.Aliases(zn)) {
533
SingleEmissionCheckScope guard(this);
534
(this->*fn)(zd, pg, zd, zm);
535
} else if (zd.Aliases(zm)) {
536
switch (nan_option) {
537
case FastNaNPropagation: {
538
// Swap the arguments.
539
SingleEmissionCheckScope guard(this);
540
(this->*fn)(zd, pg, zd, zn);
541
return;
542
}
543
case StrictNaNPropagation: {
544
UseScratchRegisterScope temps(this);
545
// Use a scratch register to keep the argument order exactly as
546
// specified.
547
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
548
{
549
MovprfxHelperScope guard(this, scratch, pg, zn);
550
(this->*fn)(scratch, pg, scratch, zm);
551
}
552
Mov(zd, scratch);
553
return;
554
}
555
case NoFPMacroNaNPropagationSelected:
556
VIXL_UNREACHABLE();
557
return;
558
}
559
} else {
560
MovprfxHelperScope guard(this, zd, pg, zn);
561
(this->*fn)(zd, pg, zd, zm);
562
}
563
}
564
565
// Instructions of the form "inst zda, zn, zm, #num", where they are
566
// non-commutative and no reversed form is provided.
567
#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
568
V(Cmla, cmla) \
569
V(Sqrdcmlah, sqrdcmlah)
570
571
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
572
void MacroAssembler::MASMFN(const ZRegister& zd, \
573
const ZRegister& za, \
574
const ZRegister& zn, \
575
const ZRegister& zm, \
576
int imm) { \
577
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
578
UseScratchRegisterScope temps(this); \
579
VIXL_ASSERT(AreSameLaneSize(zn, zm)); \
580
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \
581
Mov(ztmp, zd.Aliases(zn) ? zn : zm); \
582
MovprfxHelperScope guard(this, zd, za); \
583
ASMFN(zd, \
584
(zd.Aliases(zn) ? ztmp : zn), \
585
(zd.Aliases(zm) ? ztmp : zm), \
586
imm); \
587
} else { \
588
MovprfxHelperScope guard(this, zd, za); \
589
ASMFN(zd, zn, zm, imm); \
590
} \
591
}
592
VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
593
#undef VIXL_DEFINE_MASM_FUNC
594
595
// Instructions of the form "inst zda, zn, zm, #num, #num", where they are
596
// non-commutative and no reversed form is provided.
597
#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
598
V(Cmla, cmla) \
599
V(Sqrdcmlah, sqrdcmlah)
600
601
// This doesn't handle zm when it's out of the range that can be encoded in
602
// the instruction. The range depends on element size: z0-z7 for H, z0-z15 for S.
603
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
604
void MacroAssembler::MASMFN(const ZRegister& zd, \
605
const ZRegister& za, \
606
const ZRegister& zn, \
607
const ZRegister& zm, \
608
int index, \
609
int rot) { \
610
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
611
UseScratchRegisterScope temps(this); \
612
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \
613
{ \
614
MovprfxHelperScope guard(this, ztmp, za); \
615
ASMFN(ztmp, zn, zm, index, rot); \
616
} \
617
Mov(zd, ztmp); \
618
} else { \
619
MovprfxHelperScope guard(this, zd, za); \
620
ASMFN(zd, zn, zm, index, rot); \
621
} \
622
}
623
VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
624
#undef VIXL_DEFINE_MASM_FUNC
625
626
// Instructions of the form "inst zda, pg, zda, zn", where they are
627
// non-commutative and no reversed form is provided.
628
#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
629
V(Addp, addp) \
630
V(Bic, bic) \
631
V(Faddp, faddp) \
632
V(Fmaxnmp, fmaxnmp) \
633
V(Fminnmp, fminnmp) \
634
V(Fmaxp, fmaxp) \
635
V(Fminp, fminp) \
636
V(Fscale, fscale) \
637
V(Smaxp, smaxp) \
638
V(Sminp, sminp) \
639
V(Suqadd, suqadd) \
640
V(Umaxp, umaxp) \
641
V(Uminp, uminp) \
642
V(Usqadd, usqadd)
643
644
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
645
void MacroAssembler::MASMFN(const ZRegister& zd, \
646
const PRegisterM& pg, \
647
const ZRegister& zn, \
648
const ZRegister& zm) { \
649
VIXL_ASSERT(allow_macro_instructions_); \
650
if (zd.Aliases(zm) && !zd.Aliases(zn)) { \
651
UseScratchRegisterScope temps(this); \
652
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
653
Mov(scratch, zm); \
654
MovprfxHelperScope guard(this, zd, pg, zn); \
655
ASMFN(zd, pg, zd, scratch); \
656
} else { \
657
MovprfxHelperScope guard(this, zd, pg, zn); \
658
ASMFN(zd, pg, zd, zm); \
659
} \
660
}
661
VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
662
#undef VIXL_DEFINE_MASM_FUNC
663
664
// Instructions of the form "inst zda, pg, zda, zn", where they are
665
// non-commutative and a reversed form is provided.
666
#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
667
V(Asr, asr) \
668
V(Fdiv, fdiv) \
669
V(Fsub, fsub) \
670
V(Lsl, lsl) \
671
V(Lsr, lsr) \
672
V(Sdiv, sdiv) \
673
V(Shsub, shsub) \
674
V(Sqrshl, sqrshl) \
675
V(Sqshl, sqshl) \
676
V(Sqsub, sqsub) \
677
V(Srshl, srshl) \
678
V(Sub, sub) \
679
V(Udiv, udiv) \
680
V(Uhsub, uhsub) \
681
V(Uqrshl, uqrshl) \
682
V(Uqshl, uqshl) \
683
V(Uqsub, uqsub) \
684
V(Urshl, urshl)
685
686
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
687
void MacroAssembler::MASMFN(const ZRegister& zd, \
688
const PRegisterM& pg, \
689
const ZRegister& zn, \
690
const ZRegister& zm) { \
691
VIXL_ASSERT(allow_macro_instructions_); \
692
NoncommutativeArithmeticHelper(zd, \
693
pg, \
694
zn, \
695
zm, \
696
static_cast<SVEArithPredicatedFn>( \
697
&Assembler::ASMFN), \
698
static_cast<SVEArithPredicatedFn>( \
699
&Assembler::ASMFN##r)); \
700
}
701
VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
702
#undef VIXL_DEFINE_MASM_FUNC
703
704
void MacroAssembler::Fadd(const ZRegister& zd,
705
const PRegisterM& pg,
706
const ZRegister& zn,
707
const ZRegister& zm,
708
FPMacroNaNPropagationOption nan_option) {
709
VIXL_ASSERT(allow_macro_instructions_);
710
FPCommutativeArithmeticHelper(zd,
711
pg,
712
zn,
713
zm,
714
static_cast<SVEArithPredicatedFn>(
715
&Assembler::fadd),
716
nan_option);
717
}
718
719
void MacroAssembler::Fabd(const ZRegister& zd,
720
const PRegisterM& pg,
721
const ZRegister& zn,
722
const ZRegister& zm,
723
FPMacroNaNPropagationOption nan_option) {
724
VIXL_ASSERT(allow_macro_instructions_);
725
FPCommutativeArithmeticHelper(zd,
726
pg,
727
zn,
728
zm,
729
static_cast<SVEArithPredicatedFn>(
730
&Assembler::fabd),
731
nan_option);
732
}
733
734
void MacroAssembler::Fmul(const ZRegister& zd,
735
const PRegisterM& pg,
736
const ZRegister& zn,
737
const ZRegister& zm,
738
FPMacroNaNPropagationOption nan_option) {
739
VIXL_ASSERT(allow_macro_instructions_);
740
FPCommutativeArithmeticHelper(zd,
741
pg,
742
zn,
743
zm,
744
static_cast<SVEArithPredicatedFn>(
745
&Assembler::fmul),
746
nan_option);
747
}
748
749
void MacroAssembler::Fmulx(const ZRegister& zd,
750
const PRegisterM& pg,
751
const ZRegister& zn,
752
const ZRegister& zm,
753
FPMacroNaNPropagationOption nan_option) {
754
VIXL_ASSERT(allow_macro_instructions_);
755
FPCommutativeArithmeticHelper(zd,
756
pg,
757
zn,
758
zm,
759
static_cast<SVEArithPredicatedFn>(
760
&Assembler::fmulx),
761
nan_option);
762
}
763
764
void MacroAssembler::Fmax(const ZRegister& zd,
765
const PRegisterM& pg,
766
const ZRegister& zn,
767
const ZRegister& zm,
768
FPMacroNaNPropagationOption nan_option) {
769
VIXL_ASSERT(allow_macro_instructions_);
770
FPCommutativeArithmeticHelper(zd,
771
pg,
772
zn,
773
zm,
774
static_cast<SVEArithPredicatedFn>(
775
&Assembler::fmax),
776
nan_option);
777
}
778
779
void MacroAssembler::Fmin(const ZRegister& zd,
780
const PRegisterM& pg,
781
const ZRegister& zn,
782
const ZRegister& zm,
783
FPMacroNaNPropagationOption nan_option) {
784
VIXL_ASSERT(allow_macro_instructions_);
785
FPCommutativeArithmeticHelper(zd,
786
pg,
787
zn,
788
zm,
789
static_cast<SVEArithPredicatedFn>(
790
&Assembler::fmin),
791
nan_option);
792
}
793
794
void MacroAssembler::Fmaxnm(const ZRegister& zd,
795
const PRegisterM& pg,
796
const ZRegister& zn,
797
const ZRegister& zm,
798
FPMacroNaNPropagationOption nan_option) {
799
VIXL_ASSERT(allow_macro_instructions_);
800
FPCommutativeArithmeticHelper(zd,
801
pg,
802
zn,
803
zm,
804
static_cast<SVEArithPredicatedFn>(
805
&Assembler::fmaxnm),
806
nan_option);
807
}
808
809
void MacroAssembler::Fminnm(const ZRegister& zd,
810
const PRegisterM& pg,
811
const ZRegister& zn,
812
const ZRegister& zm,
813
FPMacroNaNPropagationOption nan_option) {
814
VIXL_ASSERT(allow_macro_instructions_);
815
FPCommutativeArithmeticHelper(zd,
816
pg,
817
zn,
818
zm,
819
static_cast<SVEArithPredicatedFn>(
820
&Assembler::fminnm),
821
nan_option);
822
}
823
824
void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
825
VIXL_ASSERT(allow_macro_instructions_);
826
827
switch (zd.GetLaneSizeInBits()) {
828
case kHRegSize:
829
Fdup(zd, Float16(imm));
830
break;
831
case kSRegSize:
832
Fdup(zd, static_cast<float>(imm));
833
break;
834
case kDRegSize:
835
uint64_t bits = DoubleToRawbits(imm);
836
if (IsImmFP64(bits)) {
837
SingleEmissionCheckScope guard(this);
838
fdup(zd, imm);
839
} else {
840
Dup(zd, bits);
841
}
842
break;
843
}
844
}
845
846
void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
847
VIXL_ASSERT(allow_macro_instructions_);
848
849
switch (zd.GetLaneSizeInBits()) {
850
case kHRegSize:
851
Fdup(zd, Float16(imm));
852
break;
853
case kSRegSize:
854
if (IsImmFP32(imm)) {
855
SingleEmissionCheckScope guard(this);
856
fdup(zd, imm);
857
} else {
858
Dup(zd, FloatToRawbits(imm));
859
}
860
break;
861
case kDRegSize:
862
Fdup(zd, static_cast<double>(imm));
863
break;
864
}
865
}
866
867
void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
868
VIXL_ASSERT(allow_macro_instructions_);
869
870
switch (zd.GetLaneSizeInBits()) {
871
case kHRegSize:
872
if (IsImmFP16(imm)) {
873
SingleEmissionCheckScope guard(this);
874
fdup(zd, imm);
875
} else {
876
Dup(zd, Float16ToRawbits(imm));
877
}
878
break;
879
case kSRegSize:
880
Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
881
break;
882
case kDRegSize:
883
Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
884
break;
885
}
886
}
887
888
void MacroAssembler::Index(const ZRegister& zd,
889
const Operand& start,
890
const Operand& step) {
891
class IndexOperand : public Operand {
892
public:
893
static IndexOperand Prepare(MacroAssembler* masm,
894
UseScratchRegisterScope* temps,
895
const Operand& op,
896
const ZRegister& zd_inner) {
897
// Look for encodable immediates.
898
int imm;
899
if (op.IsImmediate()) {
900
if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
901
return IndexOperand(imm);
902
}
903
Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
904
masm->Mov(scratch, op);
905
return IndexOperand(scratch);
906
} else {
907
// Plain registers can be encoded directly.
908
VIXL_ASSERT(op.IsPlainRegister());
909
return IndexOperand(op.GetRegister());
910
}
911
}
912
913
int GetImm5() const {
914
int64_t imm = GetImmediate();
915
VIXL_ASSERT(IsInt5(imm));
916
return static_cast<int>(imm);
917
}
918
919
private:
920
explicit IndexOperand(const Register& reg) : Operand(reg) {}
921
explicit IndexOperand(int64_t imm) : Operand(imm) {}
922
};
923
924
UseScratchRegisterScope temps(this);
925
IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
926
IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
927
928
SingleEmissionCheckScope guard(this);
929
if (start_enc.IsImmediate()) {
930
if (step_enc.IsImmediate()) {
931
index(zd, start_enc.GetImm5(), step_enc.GetImm5());
932
} else {
933
index(zd, start_enc.GetImm5(), step_enc.GetRegister());
934
}
935
} else {
936
if (step_enc.IsImmediate()) {
937
index(zd, start_enc.GetRegister(), step_enc.GetImm5());
938
} else {
939
index(zd, start_enc.GetRegister(), step_enc.GetRegister());
940
}
941
}
942
}
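// Editorial usage sketch (illustrative only): operands that fit in a signed
// 5-bit immediate are encoded directly; anything else is moved into a scratch
// register first.
//
//   masm.Index(z0.VnB(), 0, 1);    // index z0.b, #0, #1
//   masm.Index(z0.VnS(), x1, 42);  // 42 is not a valid #imm5, so it is moved
//                                  // into a scratch register and the
//                                  // register-register form is used.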
943
944
void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
945
VIXL_ASSERT(allow_macro_instructions_);
946
VIXL_ASSERT(imm.FitsInLane(zdn));
947
948
if (imm.IsZero()) {
949
SingleEmissionCheckScope guard(this);
950
insr(zdn, xzr);
951
return;
952
}
953
954
UseScratchRegisterScope temps(this);
955
Register scratch = temps.AcquireRegisterToHoldLane(zdn);
956
957
// TODO: There are many cases where we could optimise immediates, such as by
958
// detecting repeating patterns or FP immediates. We should optimise and
959
// abstract this for use in other SVE mov-immediate-like macros.
960
Mov(scratch, imm);
961
962
SingleEmissionCheckScope guard(this);
963
insr(zdn, scratch);
964
}
965
966
void MacroAssembler::Mla(const ZRegister& zd,
967
const PRegisterM& pg,
968
const ZRegister& za,
969
const ZRegister& zn,
970
const ZRegister& zm) {
971
VIXL_ASSERT(allow_macro_instructions_);
972
if (zd.Aliases(za)) {
973
// zda = zda + (zn * zm)
974
SingleEmissionCheckScope guard(this);
975
mla(zd, pg, zn, zm);
976
} else if (zd.Aliases(zn)) {
977
// zdn = za + (zdn * zm)
978
SingleEmissionCheckScope guard(this);
979
mad(zd, pg, zm, za);
980
} else if (zd.Aliases(zm)) {
981
// Multiplication is commutative, so we can swap zn and zm.
982
// zdm = za + (zdm * zn)
983
SingleEmissionCheckScope guard(this);
984
mad(zd, pg, zn, za);
985
} else {
986
// zd = za + (zn * zm)
987
ExactAssemblyScope guard(this, 2 * kInstructionSize);
988
movprfx(zd, pg, za);
989
mla(zd, pg, zn, zm);
990
}
991
}
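// Editorial usage sketch (illustrative only): how Mla chooses between `mla`,
// `mad` and a movprfx sequence depending on register aliasing.
//
//   masm.Mla(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS(), z2.VnS());  // mla
//   masm.Mla(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS(), z2.VnS());  // mad
//   masm.Mla(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS(), z3.VnS());  // movprfx
//                                                                    // + mla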
992
993
void MacroAssembler::Mls(const ZRegister& zd,
994
const PRegisterM& pg,
995
const ZRegister& za,
996
const ZRegister& zn,
997
const ZRegister& zm) {
998
VIXL_ASSERT(allow_macro_instructions_);
999
if (zd.Aliases(za)) {
1000
// zda = zda - (zn * zm)
1001
SingleEmissionCheckScope guard(this);
1002
mls(zd, pg, zn, zm);
1003
} else if (zd.Aliases(zn)) {
1004
// zdn = za - (zdn * zm)
1005
SingleEmissionCheckScope guard(this);
1006
msb(zd, pg, zm, za);
1007
} else if (zd.Aliases(zm)) {
1008
// Multiplication is commutative, so we can swap zn and zm.
1009
// zdm = za - (zdm * zn)
1010
SingleEmissionCheckScope guard(this);
1011
msb(zd, pg, zn, za);
1012
} else {
1013
// zd = za - (zn * zm)
1014
ExactAssemblyScope guard(this, 2 * kInstructionSize);
1015
movprfx(zd, pg, za);
1016
mls(zd, pg, zn, zm);
1017
}
1018
}
1019
1020
void MacroAssembler::CompareHelper(Condition cond,
1021
const PRegisterWithLaneSize& pd,
1022
const PRegisterZ& pg,
1023
const ZRegister& zn,
1024
IntegerOperand imm) {
1025
UseScratchRegisterScope temps(this);
1026
ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1027
Dup(zm, imm);
1028
SingleEmissionCheckScope guard(this);
1029
cmp(cond, pd, pg, zn, zm);
1030
}
1031
1032
void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
1033
const PRegister& pg,
1034
const PRegisterWithLaneSize& pn) {
1035
VIXL_ASSERT(allow_macro_instructions_);
1036
VIXL_ASSERT(pd.IsLaneSizeB());
1037
VIXL_ASSERT(pn.IsLaneSizeB());
1038
if (pd.Is(pn)) {
1039
SingleEmissionCheckScope guard(this);
1040
pfirst(pd, pg, pn);
1041
} else {
1042
UseScratchRegisterScope temps(this);
1043
PRegister temp_pg = pg;
1044
if (pd.Aliases(pg)) {
1045
temp_pg = temps.AcquireP();
1046
Mov(temp_pg.VnB(), pg.VnB());
1047
}
1048
Mov(pd, pn);
1049
SingleEmissionCheckScope guard(this);
1050
pfirst(pd, temp_pg, pd);
1051
}
1052
}
1053
1054
void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
1055
const PRegister& pg,
1056
const PRegisterWithLaneSize& pn) {
1057
VIXL_ASSERT(allow_macro_instructions_);
1058
VIXL_ASSERT(AreSameFormat(pd, pn));
1059
if (pd.Is(pn)) {
1060
SingleEmissionCheckScope guard(this);
1061
pnext(pd, pg, pn);
1062
} else {
1063
UseScratchRegisterScope temps(this);
1064
PRegister temp_pg = pg;
1065
if (pd.Aliases(pg)) {
1066
temp_pg = temps.AcquireP();
1067
Mov(temp_pg.VnB(), pg.VnB());
1068
}
1069
Mov(pd.VnB(), pn.VnB());
1070
SingleEmissionCheckScope guard(this);
1071
pnext(pd, temp_pg, pd);
1072
}
1073
}
1074
1075
void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
1076
SVEPredicateConstraint pattern,
1077
FlagsUpdate s) {
1078
VIXL_ASSERT(allow_macro_instructions_);
1079
switch (s) {
1080
case LeaveFlags:
1081
Ptrue(pd, pattern);
1082
return;
1083
case SetFlags:
1084
Ptrues(pd, pattern);
1085
return;
1086
}
1087
VIXL_UNREACHABLE();
1088
}
1089
1090
void MacroAssembler::Sub(const ZRegister& zd,
1091
IntegerOperand imm,
1092
const ZRegister& zm) {
1093
VIXL_ASSERT(allow_macro_instructions_);
1094
1095
int imm8;
1096
int shift = -1;
1097
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
1098
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
1099
MovprfxHelperScope guard(this, zd, zm);
1100
subr(zd, zd, imm8, shift);
1101
} else {
1102
UseScratchRegisterScope temps(this);
1103
ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
1104
Dup(scratch, imm);
1105
1106
SingleEmissionCheckScope guard(this);
1107
sub(zd, scratch, zm);
1108
}
1109
}
1110
1111
void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
1112
const PRegisterZ& pg,
1113
const SVEMemOperand& addr,
1114
SVELoadBroadcastFn fn,
1115
int divisor) {
1116
VIXL_ASSERT(addr.IsScalarPlusImmediate());
1117
int64_t imm = addr.GetImmediateOffset();
1118
if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
1119
SingleEmissionCheckScope guard(this);
1120
(this->*fn)(zt, pg, addr);
1121
} else {
1122
UseScratchRegisterScope temps(this);
1123
Register scratch = temps.AcquireX();
1124
CalculateSVEAddress(scratch, addr, zt);
1125
SingleEmissionCheckScope guard(this);
1126
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1127
}
1128
}
1129
1130
void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
1131
const SVEMemOperand& addr,
1132
SVELoadStoreFn fn) {
1133
VIXL_ASSERT(allow_macro_instructions_);
1134
VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
1135
1136
if (addr.IsPlainScalar() ||
1137
(addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
1138
addr.IsMulVl())) {
1139
SingleEmissionCheckScope guard(this);
1140
(this->*fn)(rt, addr);
1141
return;
1142
}
1143
1144
if (addr.IsEquivalentToScalar()) {
1145
SingleEmissionCheckScope guard(this);
1146
(this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
1147
return;
1148
}
1149
1150
UseScratchRegisterScope temps(this);
1151
Register scratch = temps.AcquireX();
1152
CalculateSVEAddress(scratch, addr, rt);
1153
SingleEmissionCheckScope guard(this);
1154
(this->*fn)(rt, SVEMemOperand(scratch));
1155
}
1156
1157
template <typename Tg, typename Tf>
1158
void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
1159
const ZRegister& zt,
1160
const Tg& pg,
1161
const SVEMemOperand& addr,
1162
Tf fn,
1163
int imm_bits,
1164
int shift_amount,
1165
SVEOffsetModifier supported_modifier,
1166
int vl_divisor_log2) {
1167
VIXL_ASSERT(allow_macro_instructions_);
1168
int imm_divisor = 1 << shift_amount;
1169
1170
if (addr.IsPlainScalar() ||
1171
(addr.IsScalarPlusImmediate() &&
1172
IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
1173
((addr.GetImmediateOffset() % imm_divisor) == 0) &&
1174
(addr.GetOffsetModifier() == supported_modifier))) {
1175
SingleEmissionCheckScope guard(this);
1176
(this->*fn)(zt, pg, addr);
1177
return;
1178
}
1179
1180
if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1181
addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
1182
SingleEmissionCheckScope guard(this);
1183
(this->*fn)(zt, pg, addr);
1184
return;
1185
}
1186
1187
if (addr.IsEquivalentToScalar()) {
1188
SingleEmissionCheckScope guard(this);
1189
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1190
return;
1191
}
1192
1193
if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
1194
(vl_divisor_log2 == -1)) {
1195
// We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
1196
// dependent.
1197
VIXL_UNIMPLEMENTED();
1198
}
1199
1200
UseScratchRegisterScope temps(this);
1201
Register scratch = temps.AcquireX();
1202
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1203
SingleEmissionCheckScope guard(this);
1204
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1205
}
1206
1207
template <typename Tg, typename Tf>
1208
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
1209
const ZRegister& zt,
1210
const Tg& pg,
1211
const SVEMemOperand& addr,
1212
Tf fn) {
1213
if (addr.IsPlainScalar() ||
1214
(addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1215
addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
1216
(addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
1217
addr.IsMulVl())) {
1218
SingleEmissionCheckScope guard(this);
1219
(this->*fn)(zt, pg, addr);
1220
return;
1221
}
1222
1223
if (addr.IsEquivalentToScalar()) {
1224
SingleEmissionCheckScope guard(this);
1225
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1226
return;
1227
}
1228
1229
if (addr.IsVectorPlusImmediate()) {
1230
uint64_t offset = addr.GetImmediateOffset();
1231
if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
1232
IsUint5(offset >> msize_in_bytes_log2)) {
1233
SingleEmissionCheckScope guard(this);
1234
(this->*fn)(zt, pg, addr);
1235
return;
1236
}
1237
}
1238
1239
if (addr.IsScalarPlusVector()) {
1240
VIXL_ASSERT(addr.IsScatterGather());
1241
SingleEmissionCheckScope guard(this);
1242
(this->*fn)(zt, pg, addr);
1243
return;
1244
}
1245
1246
UseScratchRegisterScope temps(this);
1247
if (addr.IsScatterGather()) {
1248
// In scatter-gather modes, zt and zn/zm have the same lane size. However,
1249
// for 32-bit accesses, the result of each lane's address calculation still
1250
// requires 64 bits; we can't naively use `Adr` for the address calculation
1251
// because it would truncate each address to 32 bits.
1252
1253
if (addr.IsVectorPlusImmediate()) {
1254
// Synthesise the immediate in an X register, then use a
1255
// scalar-plus-vector access with the original vector.
1256
Register scratch = temps.AcquireX();
1257
Mov(scratch, addr.GetImmediateOffset());
1258
SingleEmissionCheckScope guard(this);
1259
SVEOffsetModifier om =
1260
zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
1261
(this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
1262
return;
1263
}
1264
1265
VIXL_UNIMPLEMENTED();
1266
} else {
1267
Register scratch = temps.AcquireX();
1268
// TODO: If we have an immediate offset that is a multiple of
1269
// msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
1270
// save an instruction.
1271
int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
1272
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1273
SingleEmissionCheckScope guard(this);
1274
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1275
}
1276
}
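// Editorial usage sketch (illustrative only): the wrappers defined below (for
// example Ld1w) hand their address to this helper, which either emits the
// access directly or synthesises the address in a scratch register.
//
//   // IsInt4(7) holds, so this is a single ld1w with a MUL VL offset.
//   masm.Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
//   // The offset is out of range: CalculateSVEAddress builds the address in
//   // a scratch X register and the load uses that register instead.
//   masm.Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 100, SVE_MUL_VL));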
1277
1278
template <typename Tf>
1279
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
1280
const ZRegister& zt,
1281
const PRegisterZ& pg,
1282
const SVEMemOperand& addr,
1283
Tf fn) {
1284
if (addr.IsScatterGather()) {
1285
// Scatter-gather first-fault loads share encodings with normal loads.
1286
SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
1287
return;
1288
}
1289
1290
// Contiguous first-faulting loads have no scalar-plus-immediate form at all,
1291
// so we don't do immediate synthesis.
1292
1293
// We cannot currently distinguish "[x0]" from "[x0, #0]", and this
1294
// is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
1295
if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
1296
addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
1297
SingleEmissionCheckScope guard(this);
1298
(this->*fn)(zt, pg, addr);
1299
return;
1300
}
1301
1302
VIXL_UNIMPLEMENTED();
1303
}
1304
1305
void MacroAssembler::Ld1b(const ZRegister& zt,
1306
const PRegisterZ& pg,
1307
const SVEMemOperand& addr) {
1308
VIXL_ASSERT(allow_macro_instructions_);
1309
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1310
zt,
1311
pg,
1312
addr,
1313
static_cast<SVELoad1Fn>(&Assembler::ld1b));
1314
}
1315
1316
void MacroAssembler::Ld1h(const ZRegister& zt,
1317
const PRegisterZ& pg,
1318
const SVEMemOperand& addr) {
1319
VIXL_ASSERT(allow_macro_instructions_);
1320
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1321
zt,
1322
pg,
1323
addr,
1324
static_cast<SVELoad1Fn>(&Assembler::ld1h));
1325
}
1326
1327
void MacroAssembler::Ld1w(const ZRegister& zt,
1328
const PRegisterZ& pg,
1329
const SVEMemOperand& addr) {
1330
VIXL_ASSERT(allow_macro_instructions_);
1331
SVELoadStore1Helper(kWRegSizeInBytesLog2,
1332
zt,
1333
pg,
1334
addr,
1335
static_cast<SVELoad1Fn>(&Assembler::ld1w));
1336
}
1337
1338
void MacroAssembler::Ld1d(const ZRegister& zt,
1339
const PRegisterZ& pg,
1340
const SVEMemOperand& addr) {
1341
VIXL_ASSERT(allow_macro_instructions_);
1342
SVELoadStore1Helper(kDRegSizeInBytesLog2,
1343
zt,
1344
pg,
1345
addr,
1346
static_cast<SVELoad1Fn>(&Assembler::ld1d));
1347
}
1348
1349
void MacroAssembler::Ld1sb(const ZRegister& zt,
1350
const PRegisterZ& pg,
1351
const SVEMemOperand& addr) {
1352
VIXL_ASSERT(allow_macro_instructions_);
1353
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1354
zt,
1355
pg,
1356
addr,
1357
static_cast<SVELoad1Fn>(&Assembler::ld1sb));
1358
}
1359
1360
void MacroAssembler::Ld1sh(const ZRegister& zt,
1361
const PRegisterZ& pg,
1362
const SVEMemOperand& addr) {
1363
VIXL_ASSERT(allow_macro_instructions_);
1364
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1365
zt,
1366
pg,
1367
addr,
1368
static_cast<SVELoad1Fn>(&Assembler::ld1sh));
1369
}
1370
1371
void MacroAssembler::Ld1sw(const ZRegister& zt,
1372
const PRegisterZ& pg,
1373
const SVEMemOperand& addr) {
1374
VIXL_ASSERT(allow_macro_instructions_);
1375
SVELoadStore1Helper(kSRegSizeInBytesLog2,
1376
zt,
1377
pg,
1378
addr,
1379
static_cast<SVELoad1Fn>(&Assembler::ld1sw));
1380
}
1381
1382
void MacroAssembler::St1b(const ZRegister& zt,
1383
const PRegister& pg,
1384
const SVEMemOperand& addr) {
1385
VIXL_ASSERT(allow_macro_instructions_);
1386
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1387
zt,
1388
pg,
1389
addr,
1390
static_cast<SVEStore1Fn>(&Assembler::st1b));
1391
}
1392
1393
void MacroAssembler::St1h(const ZRegister& zt,
1394
const PRegister& pg,
1395
const SVEMemOperand& addr) {
1396
VIXL_ASSERT(allow_macro_instructions_);
1397
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1398
zt,
1399
pg,
1400
addr,
1401
static_cast<SVEStore1Fn>(&Assembler::st1h));
1402
}
1403
1404
void MacroAssembler::St1w(const ZRegister& zt,
1405
const PRegister& pg,
1406
const SVEMemOperand& addr) {
1407
VIXL_ASSERT(allow_macro_instructions_);
1408
SVELoadStore1Helper(kSRegSizeInBytesLog2,
1409
zt,
1410
pg,
1411
addr,
1412
static_cast<SVEStore1Fn>(&Assembler::st1w));
1413
}
1414
1415
void MacroAssembler::St1d(const ZRegister& zt,
1416
const PRegister& pg,
1417
const SVEMemOperand& addr) {
1418
VIXL_ASSERT(allow_macro_instructions_);
1419
SVELoadStore1Helper(kDRegSizeInBytesLog2,
1420
zt,
1421
pg,
1422
addr,
1423
static_cast<SVEStore1Fn>(&Assembler::st1d));
1424
}
1425
1426
void MacroAssembler::Ldff1b(const ZRegister& zt,
1427
const PRegisterZ& pg,
1428
const SVEMemOperand& addr) {
1429
VIXL_ASSERT(allow_macro_instructions_);
1430
SVELoadFFHelper(kBRegSizeInBytesLog2,
1431
zt,
1432
pg,
1433
addr,
1434
static_cast<SVELoad1Fn>(&Assembler::ldff1b));
1435
}
1436
1437
void MacroAssembler::Ldff1h(const ZRegister& zt,
1438
const PRegisterZ& pg,
1439
const SVEMemOperand& addr) {
1440
VIXL_ASSERT(allow_macro_instructions_);
1441
SVELoadFFHelper(kHRegSizeInBytesLog2,
1442
zt,
1443
pg,
1444
addr,
1445
static_cast<SVELoad1Fn>(&Assembler::ldff1h));
1446
}
1447
1448
void MacroAssembler::Ldff1w(const ZRegister& zt,
1449
const PRegisterZ& pg,
1450
const SVEMemOperand& addr) {
1451
VIXL_ASSERT(allow_macro_instructions_);
1452
SVELoadFFHelper(kSRegSizeInBytesLog2,
1453
zt,
1454
pg,
1455
addr,
1456
static_cast<SVELoad1Fn>(&Assembler::ldff1w));
1457
}
1458
1459
void MacroAssembler::Ldff1d(const ZRegister& zt,
1460
const PRegisterZ& pg,
1461
const SVEMemOperand& addr) {
1462
VIXL_ASSERT(allow_macro_instructions_);
1463
SVELoadFFHelper(kDRegSizeInBytesLog2,
1464
zt,
1465
pg,
1466
addr,
1467
static_cast<SVELoad1Fn>(&Assembler::ldff1d));
1468
}
1469
1470
void MacroAssembler::Ldff1sb(const ZRegister& zt,
1471
const PRegisterZ& pg,
1472
const SVEMemOperand& addr) {
1473
VIXL_ASSERT(allow_macro_instructions_);
1474
SVELoadFFHelper(kBRegSizeInBytesLog2,
1475
zt,
1476
pg,
1477
addr,
1478
static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
1479
}
1480
1481
void MacroAssembler::Ldff1sh(const ZRegister& zt,
1482
const PRegisterZ& pg,
1483
const SVEMemOperand& addr) {
1484
VIXL_ASSERT(allow_macro_instructions_);
1485
SVELoadFFHelper(kHRegSizeInBytesLog2,
1486
zt,
1487
pg,
1488
addr,
1489
static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
1490
}
1491
1492
void MacroAssembler::Ldff1sw(const ZRegister& zt,
1493
const PRegisterZ& pg,
1494
const SVEMemOperand& addr) {
1495
VIXL_ASSERT(allow_macro_instructions_);
1496
SVELoadFFHelper(kSRegSizeInBytesLog2,
1497
zt,
1498
pg,
1499
addr,
1500
static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
1501
}
1502
1503
#define VIXL_SVE_LD1R_LIST(V) \
1504
V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
1505
1506
#define VIXL_DEFINE_MASM_FUNC(SZ, SH) \
1507
void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \
1508
const PRegisterZ& pg, \
1509
const SVEMemOperand& addr) { \
1510
VIXL_ASSERT(allow_macro_instructions_); \
1511
SVELoadStoreNTBroadcastQOHelper(zt, \
1512
pg, \
1513
addr, \
1514
&MacroAssembler::ld1r##SZ, \
1515
4, \
1516
SH, \
1517
NO_SVE_OFFSET_MODIFIER, \
1518
-1); \
1519
}
1520
1521
VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
1522
1523
#undef VIXL_DEFINE_MASM_FUNC
1524
#undef VIXL_SVE_LD1R_LIST
1525
1526
void MacroAssembler::Ldnt1b(const ZRegister& zt,
1527
const PRegisterZ& pg,
1528
const SVEMemOperand& addr) {
1529
VIXL_ASSERT(allow_macro_instructions_);
1530
if (addr.IsVectorPlusScalar()) {
1531
SingleEmissionCheckScope guard(this);
1532
ldnt1b(zt, pg, addr);
1533
} else {
1534
SVELoadStoreNTBroadcastQOHelper(zt,
1535
pg,
1536
addr,
1537
&MacroAssembler::ldnt1b,
1538
4,
1539
0,
1540
SVE_MUL_VL);
1541
}
1542
}
1543
1544
void MacroAssembler::Ldnt1d(const ZRegister& zt,
1545
const PRegisterZ& pg,
1546
const SVEMemOperand& addr) {
1547
VIXL_ASSERT(allow_macro_instructions_);
1548
if (addr.IsVectorPlusScalar()) {
1549
SingleEmissionCheckScope guard(this);
1550
ldnt1d(zt, pg, addr);
1551
} else {
1552
SVELoadStoreNTBroadcastQOHelper(zt,
1553
pg,
1554
addr,
1555
&MacroAssembler::ldnt1d,
1556
4,
1557
0,
1558
SVE_MUL_VL);
1559
}
1560
}
1561
1562
void MacroAssembler::Ldnt1h(const ZRegister& zt,
1563
const PRegisterZ& pg,
1564
const SVEMemOperand& addr) {
1565
VIXL_ASSERT(allow_macro_instructions_);
1566
if (addr.IsVectorPlusScalar()) {
1567
SingleEmissionCheckScope guard(this);
1568
ldnt1h(zt, pg, addr);
1569
} else {
1570
SVELoadStoreNTBroadcastQOHelper(zt,
1571
pg,
1572
addr,
1573
&MacroAssembler::ldnt1h,
1574
4,
1575
0,
1576
SVE_MUL_VL);
1577
}
1578
}
1579
1580
void MacroAssembler::Ldnt1w(const ZRegister& zt,
1581
const PRegisterZ& pg,
1582
const SVEMemOperand& addr) {
1583
VIXL_ASSERT(allow_macro_instructions_);
1584
if (addr.IsVectorPlusScalar()) {
1585
SingleEmissionCheckScope guard(this);
1586
ldnt1w(zt, pg, addr);
1587
} else {
1588
SVELoadStoreNTBroadcastQOHelper(zt,
1589
pg,
1590
addr,
1591
&MacroAssembler::ldnt1w,
1592
4,
1593
0,
1594
SVE_MUL_VL);
1595
}
1596
}
1597
1598
void MacroAssembler::Stnt1b(const ZRegister& zt,
1599
const PRegister& pg,
1600
const SVEMemOperand& addr) {
1601
VIXL_ASSERT(allow_macro_instructions_);
1602
if (addr.IsVectorPlusScalar()) {
1603
SingleEmissionCheckScope guard(this);
1604
stnt1b(zt, pg, addr);
1605
} else {
1606
SVELoadStoreNTBroadcastQOHelper(zt,
1607
pg,
1608
addr,
1609
&MacroAssembler::stnt1b,
1610
4,
1611
0,
1612
SVE_MUL_VL);
1613
}
1614
}
1615
void MacroAssembler::Stnt1d(const ZRegister& zt,
1616
const PRegister& pg,
1617
const SVEMemOperand& addr) {
1618
VIXL_ASSERT(allow_macro_instructions_);
1619
if (addr.IsVectorPlusScalar()) {
1620
SingleEmissionCheckScope guard(this);
1621
stnt1d(zt, pg, addr);
1622
} else {
1623
SVELoadStoreNTBroadcastQOHelper(zt,
1624
pg,
1625
addr,
1626
&MacroAssembler::stnt1d,
1627
4,
1628
0,
1629
SVE_MUL_VL);
1630
}
1631
}
1632
void MacroAssembler::Stnt1h(const ZRegister& zt,
1633
const PRegister& pg,
1634
const SVEMemOperand& addr) {
1635
VIXL_ASSERT(allow_macro_instructions_);
1636
if (addr.IsVectorPlusScalar()) {
1637
SingleEmissionCheckScope guard(this);
1638
stnt1h(zt, pg, addr);
1639
} else {
1640
SVELoadStoreNTBroadcastQOHelper(zt,
1641
pg,
1642
addr,
1643
&MacroAssembler::stnt1h,
1644
4,
1645
0,
1646
SVE_MUL_VL);
1647
}
1648
}
1649
void MacroAssembler::Stnt1w(const ZRegister& zt,
1650
const PRegister& pg,
1651
const SVEMemOperand& addr) {
1652
VIXL_ASSERT(allow_macro_instructions_);
1653
if (addr.IsVectorPlusScalar()) {
1654
SingleEmissionCheckScope guard(this);
1655
stnt1w(zt, pg, addr);
1656
} else {
1657
SVELoadStoreNTBroadcastQOHelper(zt,
1658
pg,
1659
addr,
1660
&MacroAssembler::stnt1w,
1661
4,
1662
0,
1663
SVE_MUL_VL);
1664
}
1665
}
1666
1667
void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
1668
const ZRegister& zd,
1669
const ZRegister& za,
1670
const ZRegister& zn,
1671
const ZRegister& zm,
1672
int index) {
1673
if (zd.Aliases(za)) {
1674
// zda = zda + (zn . zm)
1675
SingleEmissionCheckScope guard(this);
1676
(this->*fn)(zd, zn, zm, index);
1677
1678
} else if (zd.Aliases(zn) || zd.Aliases(zm)) {
1679
// zdn = za + (zdn . zm[index])
1680
// zdm = za + (zn . zdm[index])
1681
// zdnm = za + (zdnm . zdnm[index])
1682
UseScratchRegisterScope temps(this);
1683
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1684
{
1685
MovprfxHelperScope guard(this, scratch, za);
1686
(this->*fn)(scratch, zn, zm, index);
1687
}
1688
1689
Mov(zd, scratch);
1690
} else {
1691
// zd = za + (zn . zm)
1692
MovprfxHelperScope guard(this, zd, za);
1693
(this->*fn)(zd, zn, zm, index);
1694
}
1695
}
1696
1697
void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
1698
const ZRegister& zd,
1699
const ZRegister& za,
1700
const ZRegister& zn,
1701
const ZRegister& zm) {
1702
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1703
// zd = za . zd . zm
1704
// zd = za . zn . zd
1705
// zd = za . zd . zd
1706
UseScratchRegisterScope temps(this);
1707
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1708
{
1709
MovprfxHelperScope guard(this, scratch, za);
1710
(this->*fn)(scratch, zn, zm);
1711
}
1712
1713
Mov(zd, scratch);
1714
} else {
1715
MovprfxHelperScope guard(this, zd, za);
1716
(this->*fn)(zd, zn, zm);
1717
}
1718
}
1719
1720
void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
1721
const ZRegister& zd,
1722
const ZRegister& za,
1723
const ZRegister& zn,
1724
const ZRegister& zm) {
1725
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1726
// zd = za . zd . zm
1727
// zd = za . zn . zd
1728
// zd = za . zd . zd
1729
UseScratchRegisterScope temps(this);
1730
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1731
{
1732
MovprfxHelperScope guard(this, scratch, za);
1733
(this->*fn)(scratch, scratch, zn, zm);
1734
}
1735
1736
Mov(zd, scratch);
1737
} else {
1738
MovprfxHelperScope guard(this, zd, za);
1739
(this->*fn)(zd, zd, zn, zm);
1740
}
1741
}
1742
1743
void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
1744
const ZRegister& zd,
1745
const ZRegister& za,
1746
const ZRegister& zn,
1747
const ZRegister& zm,
1748
int imm) {
1749
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1750
// zd = za . zd . zm[i]
1751
// zd = za . zn . zd[i]
1752
// zd = za . zd . zd[i]
1753
UseScratchRegisterScope temps(this);
1754
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1755
{
1756
MovprfxHelperScope guard(this, scratch, za);
1757
(this->*fn)(scratch, zn, zm, imm);
1758
}
1759
1760
Mov(zd, scratch);
1761
} else {
1762
// zd = za . zn . zm[i]
1763
MovprfxHelperScope guard(this, zd, za);
1764
(this->*fn)(zd, zn, zm, imm);
1765
}
1766
}
1767
1768
void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
1769
const ZRegister& zd,
1770
const ZRegister& za,
1771
const ZRegister& zn,
1772
const ZRegister& zm) {
1773
if (zn.Aliases(zm)) {
1774
// If zn == zm, the difference is zero.
1775
if (!zd.Aliases(za)) {
1776
Mov(zd, za);
1777
}
1778
} else if (zd.Aliases(za)) {
1779
SingleEmissionCheckScope guard(this);
1780
(this->*fn)(zd, zn, zm);
1781
} else if (zd.Aliases(zn)) {
1782
UseScratchRegisterScope temps(this);
1783
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1784
Mov(ztmp, zn);
1785
MovprfxHelperScope guard(this, zd, za);
1786
(this->*fn)(zd, ztmp, zm);
1787
} else if (zd.Aliases(zm)) {
1788
UseScratchRegisterScope temps(this);
1789
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1790
Mov(ztmp, zm);
1791
MovprfxHelperScope guard(this, zd, za);
1792
(this->*fn)(zd, zn, ztmp);
1793
} else {
1794
MovprfxHelperScope guard(this, zd, za);
1795
(this->*fn)(zd, zn, zm);
1796
}
1797
}
1798
1799
#define VIXL_SVE_4REG_LIST(V)                       \
  V(Saba, saba, AbsoluteDifferenceAccumulate)       \
  V(Uaba, uaba, AbsoluteDifferenceAccumulate)       \
  V(Sabalb, sabalb, AbsoluteDifferenceAccumulate)   \
  V(Sabalt, sabalt, AbsoluteDifferenceAccumulate)   \
  V(Uabalb, uabalb, AbsoluteDifferenceAccumulate)   \
  V(Uabalt, uabalt, AbsoluteDifferenceAccumulate)   \
  V(Sdot, sdot, FourRegDestructiveHelper)           \
  V(Udot, udot, FourRegDestructiveHelper)           \
  V(Adclb, adclb, FourRegDestructiveHelper)         \
  V(Adclt, adclt, FourRegDestructiveHelper)         \
  V(Sbclb, sbclb, FourRegDestructiveHelper)         \
  V(Sbclt, sbclt, FourRegDestructiveHelper)         \
  V(Smlalb, smlalb, FourRegDestructiveHelper)       \
  V(Smlalt, smlalt, FourRegDestructiveHelper)       \
  V(Smlslb, smlslb, FourRegDestructiveHelper)       \
  V(Smlslt, smlslt, FourRegDestructiveHelper)       \
  V(Umlalb, umlalb, FourRegDestructiveHelper)       \
  V(Umlalt, umlalt, FourRegDestructiveHelper)       \
  V(Umlslb, umlslb, FourRegDestructiveHelper)       \
  V(Umlslt, umlslt, FourRegDestructiveHelper)       \
  V(Bcax, bcax, FourRegDestructiveHelper)           \
  V(Bsl, bsl, FourRegDestructiveHelper)             \
  V(Bsl1n, bsl1n, FourRegDestructiveHelper)         \
  V(Bsl2n, bsl2n, FourRegDestructiveHelper)         \
  V(Eor3, eor3, FourRegDestructiveHelper)           \
  V(Nbsl, nbsl, FourRegDestructiveHelper)           \
  V(Fmlalb, fmlalb, FourRegDestructiveHelper)       \
  V(Fmlalt, fmlalt, FourRegDestructiveHelper)       \
  V(Fmlslb, fmlslb, FourRegDestructiveHelper)       \
  V(Fmlslt, fmlslt, FourRegDestructiveHelper)       \
  V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper)   \
  V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper)   \
  V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper)   \
  V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper)   \
  V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper)   \
  V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper)   \
  V(Fmmla, fmmla, FourRegDestructiveHelper)         \
  V(Smmla, smmla, FourRegDestructiveHelper)         \
  V(Ummla, ummla, FourRegDestructiveHelper)         \
  V(Usmmla, usmmla, FourRegDestructiveHelper)       \
  V(Usdot, usdot, FourRegDestructiveHelper)

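// Each entry in the list above is expanded by the VIXL_DEFINE_MASM_FUNC macro
// below into a thin MacroAssembler wrapper. As a rough sketch, the Saba entry
// becomes:
//
//   void MacroAssembler::Saba(const ZRegister& zd,
//                             const ZRegister& za,
//                             const ZRegister& zn,
//                             const ZRegister& zm) {
//     VIXL_ASSERT(allow_macro_instructions_);
//     AbsoluteDifferenceAccumulate(&Assembler::saba, zd, za, zn, zm);
//   }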
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER)  \
  void MacroAssembler::MASMFN(const ZRegister& zd,    \
                              const ZRegister& za,    \
                              const ZRegister& zn,    \
                              const ZRegister& zm) {  \
    VIXL_ASSERT(allow_macro_instructions_);           \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm);        \
  }
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

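// Indexed (by-element) multiply-accumulate forms. The trailing imm argument
// is the lane index into zm, as in FourRegOneImmDestructiveHelper above.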
#define VIXL_SVE_4REG_1IMM_LIST(V)                      \
  V(Fmla, fmla, FourRegOneImmDestructiveHelper)         \
  V(Fmls, fmls, FourRegOneImmDestructiveHelper)         \
  V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper)     \
  V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper)     \
  V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper)     \
  V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper)     \
  V(Mla, mla, FourRegOneImmDestructiveHelper)           \
  V(Mls, mls, FourRegOneImmDestructiveHelper)           \
  V(Smlalb, smlalb, FourRegOneImmDestructiveHelper)     \
  V(Smlalt, smlalt, FourRegOneImmDestructiveHelper)     \
  V(Smlslb, smlslb, FourRegOneImmDestructiveHelper)     \
  V(Smlslt, smlslt, FourRegOneImmDestructiveHelper)     \
  V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
  V(Umlalb, umlalb, FourRegOneImmDestructiveHelper)     \
  V(Umlalt, umlalt, FourRegOneImmDestructiveHelper)     \
  V(Umlslb, umlslb, FourRegOneImmDestructiveHelper)     \
  V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER)   \
  void MacroAssembler::MASMFN(const ZRegister& zd,     \
                              const ZRegister& za,     \
                              const ZRegister& zn,     \
                              const ZRegister& zm,     \
                              int imm) {               \
    VIXL_ASSERT(allow_macro_instructions_);            \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm);    \
  }
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

void MacroAssembler::Sudot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}

void MacroAssembler::Usdot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index,
                          int rot) {
  // This doesn't handle zm when it's out of the range that can be encoded in
  // the instruction. The range depends on the element size: z0-z7 for B,
  // z0-z15 for H.
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      cdot(ztmp, zn, zm, index, rot);
    }
    Mov(zd, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, index, rot);
  }
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    VIXL_ASSERT(AreSameLaneSize(zn, zm));
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zd.Aliases(zn) ? zn : zm);
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, rot);
  }
}

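// Predicated FP multiply-accumulate helper. fn_zda is the form that is
// destructive in the accumulator (e.g. fmla, fmls, fnmla, fnmls); fn_zdn is
// the form that is destructive in the first multiplicand (e.g. fmad, fmsb,
// fnmad, fnmsb). The aliasing cases below pick whichever form can be emitted
// with at most a movprfx, using a scratch register when strict NaN
// propagation requires the operand order to be preserved.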
void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

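// FTMAD is destructive in its first source operand, so zn is moved into zd
// (via movprfx) before the instruction. If zd aliases zm but not zn, zm is
// copied to a scratch register first so that the movprfx does not clobber it.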
void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

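// Predicated FCMLA. If zd aliases one of the multiplicands but not the
// accumulator, the result is computed into a scratch register and then merged
// back into zd under pg, so that inactive lanes of zd are left unchanged,
// matching the behaviour of the non-aliasing path.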
void MacroAssembler::Fcmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      fcmla(ztmp, pg, zn, zm, rot);
    }
    Mov(zd, pg, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, pg, za);
    fcmla(zd, pg, zn, zm, rot);
  }
}

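// SVE2 provides a constructive SPLICE form that takes a pair of consecutive
// registers {zn, zm}; it is used when the CPU supports SVE2, zn and zm are
// consecutive, and zd does not alias zn. Otherwise the destructive SVE form
// is used, with a scratch register when zd aliases zm but not zn.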
void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    splice(zd, pg, zn, zm);
  } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

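// Shift-right-and-accumulate: zd = za + (zn >> shift), where fn selects the
// signed/unsigned and rounding variants (used below for Ssra, Usra, Srsra and
// Ursra). The instructions are destructive in the accumulator, so zn is
// copied to a scratch register when it aliases zd but za does not.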
void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
                                          const ZRegister& zd,
                                          const ZRegister& za,
                                          const ZRegister& zn,
                                          int shift) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(za) && zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zn);
    {
      MovprfxHelperScope guard(this, zd, za);
      (this->*fn)(zd, ztmp, shift);
    }
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, shift);
  }
}

void MacroAssembler::Srsra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}

void MacroAssembler::Ssra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}

void MacroAssembler::Ursra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}

void MacroAssembler::Usra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}

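// Complex addition: zd = zn + rotate(zm), where rot selects a rotation of 90
// or 270 degrees (used below for Cadd and Sqcadd). The instructions are
// destructive in the first source, so zm is copied to a scratch register when
// it aliases zd but zn does not.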
void MacroAssembler::ComplexAddition(ZZZImmFn fn,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     const ZRegister& zm,
                                     int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(zn) && zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(ztmp, zm);
    {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*fn)(zd, zd, ztmp, rot);
    }
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    (this->*fn)(zd, zd, zm, rot);
  }
}

void MacroAssembler::Cadd(const ZRegister& zd,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}

void MacroAssembler::Sqcadd(const ZRegister& zd,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            int rot) {
  ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}

}  // namespace aarch64
}  // namespace vixl