1
// Copyright 2019, VIXL authors
2
// All rights reserved.
3
//
4
// Redistribution and use in source and binary forms, with or without
5
// modification, are permitted provided that the following conditions are met:
6
//
7
// * Redistributions of source code must retain the above copyright notice,
8
// this list of conditions and the following disclaimer.
9
// * Redistributions in binary form must reproduce the above copyright notice,
10
// this list of conditions and the following disclaimer in the documentation
11
// and/or other materials provided with the distribution.
12
// * Neither the name of ARM Limited nor the names of its contributors may be
13
// used to endorse or promote products derived from this software without
14
// specific prior written permission.
15
//
16
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27
#include "macro-assembler-aarch64.h"
28
29
namespace vixl {
30
namespace aarch64 {
31
32
void MacroAssembler::AddSubHelper(AddSubHelperOption option,
33
const ZRegister& zd,
34
const ZRegister& zn,
35
IntegerOperand imm) {
36
VIXL_ASSERT(imm.FitsInLane(zd));
37
38
// Simple, encodable cases.
39
if (TrySingleAddSub(option, zd, zn, imm)) return;
40
41
VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
42
bool add_imm = (option == kAddImmediate);
43
44
// Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
45
// instruction. Also interpret the immediate as signed, so we can convert
46
// Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
47
IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
48
if (signed_imm.IsNegative()) {
49
AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
50
IntegerOperand n_imm(signed_imm.GetMagnitude());
51
// IntegerOperand can represent -INT_MIN, so this is always safe.
52
VIXL_ASSERT(n_imm.IsPositiveOrZero());
53
if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
54
}
55
56
// Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
57
UseScratchRegisterScope temps(this);
58
ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
59
Dup(scratch, imm);
60
61
SingleEmissionCheckScope guard(this);
62
if (add_imm) {
63
add(zd, zn, scratch);
64
} else {
65
sub(zd, zn, scratch);
66
}
67
}
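// Editorial usage sketch (not part of the upstream VIXL sources; `masm`, z0
// and z1 are arbitrary example names): how the helper above is reached via
// the public unpredicated Add macro.
//
//   // Encodable as a shifted 8-bit immediate: a single `add` (plus a movprfx
//   // if zd and zn differ).
//   masm.Add(z0.VnS(), z1.VnS(), 42);
//   // Not encodable in one instruction: falls back to the dup-into-scratch
//   // plus vector add/sub path implemented above.
//   masm.Add(z0.VnS(), z1.VnS(), 0x12345);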
68
69
bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
70
const ZRegister& zd,
71
const ZRegister& zn,
72
IntegerOperand imm) {
73
VIXL_ASSERT(imm.FitsInLane(zd));
74
75
int imm8;
76
int shift = -1;
77
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
78
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
79
MovprfxHelperScope guard(this, zd, zn);
80
switch (option) {
81
case kAddImmediate:
82
add(zd, zd, imm8, shift);
83
return true;
84
case kSubImmediate:
85
sub(zd, zd, imm8, shift);
86
return true;
87
}
88
}
89
return false;
90
}
91
92
void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
93
SVEArithPredicatedFn reg_macro,
94
const ZRegister& zd,
95
const ZRegister& zn,
96
IntegerOperand imm,
97
bool is_signed) {
98
if (is_signed) {
99
// E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
100
if (imm.IsInt8()) {
101
MovprfxHelperScope guard(this, zd, zn);
102
(this->*imm_fn)(zd, zd, imm.AsInt8());
103
return;
104
}
105
} else {
106
// E.g. UMIN_z_zi, UMAX_z_zi
107
if (imm.IsUint8()) {
108
MovprfxHelperScope guard(this, zd, zn);
109
(this->*imm_fn)(zd, zd, imm.AsUint8());
110
return;
111
}
112
}
113
114
UseScratchRegisterScope temps(this);
115
PRegister pg = temps.AcquireGoverningP();
116
Ptrue(pg.WithSameLaneSizeAs(zd));
117
118
// Try to re-use zd if we can, so we can avoid a movprfx.
119
ZRegister scratch =
120
zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
121
: zd;
122
Dup(scratch, imm);
123
124
// The vector-form macro for commutative operations will swap the arguments to
125
// avoid movprfx, if necessary.
126
(this->*reg_macro)(zd, pg.Merging(), zn, scratch);
127
}
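// Editorial usage sketch (illustrative only; register choices are arbitrary):
// the immediate paths taken by macros that route through IntWideImmHelper,
// such as Mul below.
//
//   // Fits in a signed 8-bit immediate: movprfx (if needed) + `mul ..., #3`.
//   masm.Mul(z0.VnH(), z1.VnH(), 3);
//   // Too wide for the immediate form: Ptrue + Dup into a scratch Z register,
//   // then the predicated vector-form macro.
//   masm.Mul(z0.VnH(), z1.VnH(), 300);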
128
129
void MacroAssembler::Mul(const ZRegister& zd,
130
const ZRegister& zn,
131
IntegerOperand imm) {
132
VIXL_ASSERT(allow_macro_instructions_);
133
IntArithImmFn imm_fn = &Assembler::mul;
134
SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
135
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
136
}
137
138
void MacroAssembler::Smin(const ZRegister& zd,
139
const ZRegister& zn,
140
IntegerOperand imm) {
141
VIXL_ASSERT(allow_macro_instructions_);
142
VIXL_ASSERT(imm.FitsInSignedLane(zd));
143
IntArithImmFn imm_fn = &Assembler::smin;
144
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
145
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
146
}
147
148
void MacroAssembler::Smax(const ZRegister& zd,
149
const ZRegister& zn,
150
IntegerOperand imm) {
151
VIXL_ASSERT(allow_macro_instructions_);
152
VIXL_ASSERT(imm.FitsInSignedLane(zd));
153
IntArithImmFn imm_fn = &Assembler::smax;
154
SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
155
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
156
}
157
158
void MacroAssembler::Umax(const ZRegister& zd,
159
const ZRegister& zn,
160
IntegerOperand imm) {
161
VIXL_ASSERT(allow_macro_instructions_);
162
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
163
IntArithImmFn imm_fn = &Assembler::umax;
164
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
165
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
166
}
167
168
void MacroAssembler::Umin(const ZRegister& zd,
169
const ZRegister& zn,
170
IntegerOperand imm) {
171
VIXL_ASSERT(allow_macro_instructions_);
172
VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
173
IntArithImmFn imm_fn = &Assembler::umin;
174
SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
175
IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
176
}
177
178
void MacroAssembler::Addpl(const Register& xd,
179
const Register& xn,
180
int64_t multiplier) {
181
VIXL_ASSERT(allow_macro_instructions_);
182
183
// This macro relies on `Rdvl` to handle some out-of-range cases. Check that
184
// `VL * multiplier` cannot overflow, for any possible value of VL.
185
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
186
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
187
188
if (xd.IsZero()) return;
189
if (xn.IsZero() && xd.IsSP()) {
190
// TODO: This operation doesn't make much sense, but we could support it
191
// with a scratch register if necessary.
192
VIXL_UNIMPLEMENTED();
193
}
194
195
// Handling xzr requires an extra move, so defer it until later; first try
196
// to use `rdvl` instead (via `Addvl`).
197
if (IsInt6(multiplier) && !xn.IsZero()) {
198
SingleEmissionCheckScope guard(this);
199
addpl(xd, xn, static_cast<int>(multiplier));
200
return;
201
}
202
203
// If `multiplier` is a multiple of 8, we can use `Addvl` instead.
204
if ((multiplier % kZRegBitsPerPRegBit) == 0) {
205
Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
206
return;
207
}
208
209
if (IsInt6(multiplier)) {
210
VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`.
211
// There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
212
// materialise a zero.
213
MacroEmissionCheckScope guard(this);
214
movz(xd, 0);
215
addpl(xd, xd, static_cast<int>(multiplier));
216
return;
217
}
218
219
// TODO: Some probable cases result in rather long sequences. For example,
220
// `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
221
// outside the encodable range. We should look for ways to cover such cases
222
// without drastically increasing the complexity of this logic.
223
224
// For other cases, calculate xn + (PL * multiplier) using discrete
225
// instructions. This requires two scratch registers in the general case, so
226
// try to re-use the destination as a scratch register.
227
UseScratchRegisterScope temps(this);
228
temps.Include(xd);
229
temps.Exclude(xn);
230
231
Register scratch = temps.AcquireX();
232
// There is no `rdpl`, so we have to calculate PL from VL. We can't
233
// scale the multiplier because (we already know) it isn't a multiple of 8.
234
Rdvl(scratch, multiplier);
235
236
MacroEmissionCheckScope guard(this);
237
if (xn.IsZero()) {
238
asr(xd, scratch, kZRegBitsPerPRegBitLog2);
239
} else if (xd.IsSP() || xn.IsSP()) {
240
// TODO: MacroAssembler::Add should be able to handle this.
241
asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
242
add(xd, xn, scratch);
243
} else {
244
add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
245
}
246
}
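// Editorial usage sketch (illustrative only): how the multiplier determines
// the sequence emitted by Addpl above.
//
//   masm.Addpl(x2, x1, 3);    // Encodable: a single `addpl`.
//   masm.Addpl(x2, x1, 40);   // Multiple of 8: delegated to Addvl(x2, x1, 5).
//   masm.Addpl(x2, x1, 33);   // Neither: Rdvl into a scratch register, then
//                             // an add with an ASR-shifted operand (see the
//                             // TODO above about sequence length).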
247
248
void MacroAssembler::Addvl(const Register& xd,
249
const Register& xn,
250
int64_t multiplier) {
251
VIXL_ASSERT(allow_macro_instructions_);
252
VIXL_ASSERT(xd.IsX());
253
VIXL_ASSERT(xn.IsX());
254
255
// Check that `VL * multiplier` cannot overflow, for any possible value of VL.
256
VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
257
VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
258
259
if (xd.IsZero()) return;
260
if (xn.IsZero() && xd.IsSP()) {
261
// TODO: This operation doesn't make much sense, but we could support it
262
// with a scratch register if necessary. `rdvl` cannot write into `sp`.
263
VIXL_UNIMPLEMENTED();
264
}
265
266
if (IsInt6(multiplier)) {
267
SingleEmissionCheckScope guard(this);
268
if (xn.IsZero()) {
269
rdvl(xd, static_cast<int>(multiplier));
270
} else {
271
addvl(xd, xn, static_cast<int>(multiplier));
272
}
273
return;
274
}
275
276
// TODO: Some probable cases result in rather long sequences. For example,
277
// `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
278
// outside the encodable range. We should look for ways to cover such cases
279
// without drastically increasing the complexity of this logic.
280
281
// For other cases, calculate xn + (VL * multiplier) using discrete
282
// instructions. This requires two scratch registers in the general case, so
283
// we try to re-use the destination as a scratch register.
284
UseScratchRegisterScope temps(this);
285
temps.Include(xd);
286
temps.Exclude(xn);
287
288
Register a = temps.AcquireX();
289
Mov(a, multiplier);
290
291
MacroEmissionCheckScope guard(this);
292
Register b = temps.AcquireX();
293
rdvl(b, 1);
294
if (xn.IsZero()) {
295
mul(xd, a, b);
296
} else if (xd.IsSP() || xn.IsSP()) {
297
mul(a, a, b);
298
add(xd, xn, a);
299
} else {
300
madd(xd, a, b, xn);
301
}
302
}
303
304
void MacroAssembler::CalculateSVEAddress(const Register& xd,
305
const SVEMemOperand& addr,
306
int vl_divisor_log2) {
307
VIXL_ASSERT(allow_macro_instructions_);
308
VIXL_ASSERT(!addr.IsScatterGather());
309
VIXL_ASSERT(xd.IsX());
310
311
// The lower bound is where a whole Z register is accessed.
312
VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
313
// The upper bound is for P register accesses, and for instructions like
314
// "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
315
VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
316
317
SVEOffsetModifier mod = addr.GetOffsetModifier();
318
Register base = addr.GetScalarBase();
319
320
if (addr.IsEquivalentToScalar()) {
321
// For example:
322
// [x0]
323
// [x0, #0]
324
// [x0, xzr, LSL 2]
325
Mov(xd, base);
326
} else if (addr.IsScalarPlusImmediate()) {
327
// For example:
328
// [x0, #42]
329
// [x0, #42, MUL VL]
330
int64_t offset = addr.GetImmediateOffset();
331
VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar.
332
if (addr.IsMulVl()) {
333
int vl_divisor = 1 << vl_divisor_log2;
334
// For all possible values of vl_divisor, we can simply use `Addpl`. This
335
// will select `addvl` if necessary.
336
VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
337
Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
338
} else {
339
// IsScalarPlusImmediate() ensures that no other modifiers can occur.
340
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
341
Add(xd, base, offset);
342
}
343
} else if (addr.IsScalarPlusScalar()) {
344
// For example:
345
// [x0, x1]
346
// [x0, x1, LSL #4]
347
Register offset = addr.GetScalarOffset();
348
VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar.
349
if (mod == SVE_LSL) {
350
Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
351
} else {
352
// IsScalarPlusScalar() ensures that no other modifiers can occur.
353
VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
354
Add(xd, base, offset);
355
}
356
} else {
357
// All other forms are scatter-gather addresses, which cannot be evaluated
358
// into an X register.
359
VIXL_UNREACHABLE();
360
}
361
}
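// Editorial note (illustrative only): for a scalar-plus-immediate MUL VL
// operand, the offset is rescaled from VL units to PL units before calling
// Addpl. For example, with vl_divisor_log2 == 0 (a whole-Z access),
// [x1, #3, MUL VL] becomes Addpl(xd, x1, 3 * 8), which Addpl in turn emits as
// a single `addvl`. With vl_divisor_log2 == 3 (one byte per D lane, as in
// "st1b { z0.d }"), the same operand becomes Addpl(xd, x1, 3 * 1).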
362
363
void MacroAssembler::Cpy(const ZRegister& zd,
364
const PRegister& pg,
365
IntegerOperand imm) {
366
VIXL_ASSERT(allow_macro_instructions_);
367
VIXL_ASSERT(imm.FitsInLane(zd));
368
int imm8;
369
int shift;
370
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
371
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
372
SingleEmissionCheckScope guard(this);
373
cpy(zd, pg, imm8, shift);
374
return;
375
}
376
377
// The fallbacks rely on `cpy` variants that only support merging predication.
378
// If zeroing predication was requested, zero the destination first.
379
if (pg.IsZeroing()) {
380
SingleEmissionCheckScope guard(this);
381
dup(zd, 0);
382
}
383
PRegisterM pg_m = pg.Merging();
384
385
// Try to encode the immediate using fcpy.
386
VIXL_ASSERT(imm.FitsInLane(zd));
387
if (zd.GetLaneSizeInBits() >= kHRegSize) {
388
double fp_imm = 0.0;
389
switch (zd.GetLaneSizeInBits()) {
390
case kHRegSize:
391
fp_imm =
392
FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
393
break;
394
case kSRegSize:
395
fp_imm = RawbitsToFloat(imm.AsUint32());
396
break;
397
case kDRegSize:
398
fp_imm = RawbitsToDouble(imm.AsUint64());
399
break;
400
default:
401
VIXL_UNREACHABLE();
402
break;
403
}
404
// IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
405
// we can use IsImmFP64 for all lane sizes.
406
if (IsImmFP64(fp_imm)) {
407
SingleEmissionCheckScope guard(this);
408
fcpy(zd, pg_m, fp_imm);
409
return;
410
}
411
}
412
413
// Fall back to using a scratch register.
414
UseScratchRegisterScope temps(this);
415
Register scratch = temps.AcquireRegisterToHoldLane(zd);
416
Mov(scratch, imm);
417
418
SingleEmissionCheckScope guard(this);
419
cpy(zd, pg_m, scratch);
420
}
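// Editorial usage sketch (illustrative only; p1 and z0 are arbitrary
// examples): the fallback paths of the Cpy macro above.
//
//   // 0x3ff0000000000000 is the bit pattern of 1.0, which is FP-encodable,
//   // so this becomes dup #0 (to honour zeroing) followed by `fcpy`.
//   masm.Cpy(z0.VnD(), p1.Zeroing(), INT64_C(0x3ff0000000000000));
//   // An arbitrary 64-bit pattern: materialised in a scalar scratch register
//   // with Mov, then copied in with `cpy`.
//   masm.Cpy(z0.VnD(), p1.Merging(), INT64_C(0x123456789abc));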
421
422
// TODO: We implement Fcpy (amongst other things) for all FP types because it
423
// allows us to preserve user-specified NaNs. We should come up with some
424
// FPImmediate type to abstract this, and avoid all the duplication below (and
425
// elsewhere).
426
427
void MacroAssembler::Fcpy(const ZRegister& zd,
428
const PRegisterM& pg,
429
double imm) {
430
VIXL_ASSERT(allow_macro_instructions_);
431
VIXL_ASSERT(pg.IsMerging());
432
433
if (IsImmFP64(imm)) {
434
SingleEmissionCheckScope guard(this);
435
fcpy(zd, pg, imm);
436
return;
437
}
438
439
// As a fall-back, cast the immediate to the required lane size, and try to
440
// encode the bit pattern using `Cpy`.
441
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
442
}
443
444
void MacroAssembler::Fcpy(const ZRegister& zd,
445
const PRegisterM& pg,
446
float imm) {
447
VIXL_ASSERT(allow_macro_instructions_);
448
VIXL_ASSERT(pg.IsMerging());
449
450
if (IsImmFP32(imm)) {
451
SingleEmissionCheckScope guard(this);
452
fcpy(zd, pg, imm);
453
return;
454
}
455
456
// As a fall-back, cast the immediate to the required lane size, and try to
457
// encode the bit pattern using `Cpy`.
458
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
459
}
460
461
void MacroAssembler::Fcpy(const ZRegister& zd,
462
const PRegisterM& pg,
463
Float16 imm) {
464
VIXL_ASSERT(allow_macro_instructions_);
465
VIXL_ASSERT(pg.IsMerging());
466
467
if (IsImmFP16(imm)) {
468
SingleEmissionCheckScope guard(this);
469
fcpy(zd, pg, imm);
470
return;
471
}
472
473
// As a fall-back, cast the immediate to the required lane size, and try to
474
// encode the bit pattern using `Cpy`.
475
Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
476
}
477
478
void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
479
VIXL_ASSERT(allow_macro_instructions_);
480
VIXL_ASSERT(imm.FitsInLane(zd));
481
unsigned lane_size = zd.GetLaneSizeInBits();
482
int imm8;
483
int shift;
484
if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
485
imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
486
SingleEmissionCheckScope guard(this);
487
dup(zd, imm8, shift);
488
} else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
489
SingleEmissionCheckScope guard(this);
490
dupm(zd, imm.AsUintN(lane_size));
491
} else {
492
UseScratchRegisterScope temps(this);
493
Register scratch = temps.AcquireRegisterToHoldLane(zd);
494
Mov(scratch, imm);
495
496
SingleEmissionCheckScope guard(this);
497
dup(zd, scratch);
498
}
499
}
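// Editorial usage sketch (illustrative only): the three encoding tiers tried
// by the Dup macro above.
//
//   masm.Dup(z0.VnS(), 0x7f00);      // Shifted 8-bit immediate: `dup`.
//   masm.Dup(z0.VnS(), 0x00ff00ff);  // Bitmask immediate: `dupm`.
//   masm.Dup(z0.VnS(), 0x12345678);  // Neither: Mov to a scalar scratch
//                                    // register, then the scalar-form `dup`.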
500
501
void MacroAssembler::NoncommutativeArithmeticHelper(
502
const ZRegister& zd,
503
const PRegisterM& pg,
504
const ZRegister& zn,
505
const ZRegister& zm,
506
SVEArithPredicatedFn fn,
507
SVEArithPredicatedFn rev_fn) {
508
if (zd.Aliases(zn)) {
509
// E.g. zd = zd / zm
510
SingleEmissionCheckScope guard(this);
511
(this->*fn)(zd, pg, zn, zm);
512
} else if (zd.Aliases(zm)) {
513
// E.g. zd = zn / zd
514
SingleEmissionCheckScope guard(this);
515
(this->*rev_fn)(zd, pg, zm, zn);
516
} else {
517
// E.g. zd = zn / zm
518
MovprfxHelperScope guard(this, zd, pg, zn);
519
(this->*fn)(zd, pg, zd, zm);
520
}
521
}
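// Editorial usage sketch (illustrative only): how the predicated macros that
// route through this helper (for example Fsub, defined further below) pick
// between the plain and reversed instruction forms.
//
//   masm.Fsub(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS());  // fsub  (zd==zn)
//   masm.Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS());  // fsubr (zd==zm)
//   masm.Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS());  // movprfx + fsub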
522
523
void MacroAssembler::FPCommutativeArithmeticHelper(
524
const ZRegister& zd,
525
const PRegisterM& pg,
526
const ZRegister& zn,
527
const ZRegister& zm,
528
SVEArithPredicatedFn fn,
529
FPMacroNaNPropagationOption nan_option) {
530
ResolveFPNaNPropagationOption(&nan_option);
531
532
if (zd.Aliases(zn)) {
533
SingleEmissionCheckScope guard(this);
534
(this->*fn)(zd, pg, zd, zm);
535
} else if (zd.Aliases(zm)) {
536
switch (nan_option) {
537
case FastNaNPropagation: {
538
// Swap the arguments.
539
SingleEmissionCheckScope guard(this);
540
(this->*fn)(zd, pg, zd, zn);
541
return;
542
}
543
case StrictNaNPropagation: {
544
UseScratchRegisterScope temps(this);
545
// Use a scratch register to keep the argument order exactly as
546
// specified.
547
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
548
{
549
MovprfxHelperScope guard(this, scratch, pg, zn);
550
(this->*fn)(scratch, pg, scratch, zm);
551
}
552
Mov(zd, scratch);
553
return;
554
}
555
case NoFPMacroNaNPropagationSelected:
556
VIXL_UNREACHABLE();
557
return;
558
}
559
} else {
560
MovprfxHelperScope guard(this, zd, pg, zn);
561
(this->*fn)(zd, pg, zd, zm);
562
}
563
}
564
565
// Instructions of the form "inst zda, zn, zm, #num", where they are
566
// non-commutative and no reversed form is provided.
567
#define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
568
V(Cmla, cmla) \
569
V(Sqrdcmlah, sqrdcmlah)
570
571
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
572
void MacroAssembler::MASMFN(const ZRegister& zd, \
573
const ZRegister& za, \
574
const ZRegister& zn, \
575
const ZRegister& zm, \
576
int imm) { \
577
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
578
UseScratchRegisterScope temps(this); \
579
VIXL_ASSERT(AreSameLaneSize(zn, zm)); \
580
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \
581
Mov(ztmp, zd.Aliases(zn) ? zn : zm); \
582
MovprfxHelperScope guard(this, zd, za); \
583
ASMFN(zd, \
584
(zd.Aliases(zn) ? ztmp : zn), \
585
(zd.Aliases(zm) ? ztmp : zm), \
586
imm); \
587
} else { \
588
MovprfxHelperScope guard(this, zd, za); \
589
ASMFN(zd, zn, zm, imm); \
590
} \
591
}
592
VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
593
#undef VIXL_DEFINE_MASM_FUNC
594
595
// Instructions of the form "inst zda, zn, zm, #num, #num", where they are
596
// non-commutative and no reversed form is provided.
597
#define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
598
V(Cmla, cmla) \
599
V(Sqrdcmlah, sqrdcmlah)
600
601
// This doesn't handle zm when it's out of the range that can be encoded in
602
// the instruction. The range depends on element size: z0-z7 for H, z0-z15 for S.
603
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
604
void MacroAssembler::MASMFN(const ZRegister& zd, \
605
const ZRegister& za, \
606
const ZRegister& zn, \
607
const ZRegister& zm, \
608
int index, \
609
int rot) { \
610
if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
611
UseScratchRegisterScope temps(this); \
612
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \
613
{ \
614
MovprfxHelperScope guard(this, ztmp, za); \
615
ASMFN(ztmp, zn, zm, index, rot); \
616
} \
617
Mov(zd, ztmp); \
618
} else { \
619
MovprfxHelperScope guard(this, zd, za); \
620
ASMFN(zd, zn, zm, index, rot); \
621
} \
622
}
623
VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
624
#undef VIXL_DEFINE_MASM_FUNC
625
626
// Instructions of the form "inst zda, pg, zda, zn", where they are
627
// non-commutative and no reversed form is provided.
628
#define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
629
V(Addp, addp) \
630
V(Bic, bic) \
631
V(Faddp, faddp) \
632
V(Fmaxnmp, fmaxnmp) \
633
V(Fminnmp, fminnmp) \
634
V(Fmaxp, fmaxp) \
635
V(Fminp, fminp) \
636
V(Fscale, fscale) \
637
V(Smaxp, smaxp) \
638
V(Sminp, sminp) \
639
V(Suqadd, suqadd) \
640
V(Umaxp, umaxp) \
641
V(Uminp, uminp) \
642
V(Usqadd, usqadd)
643
644
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
645
void MacroAssembler::MASMFN(const ZRegister& zd, \
646
const PRegisterM& pg, \
647
const ZRegister& zn, \
648
const ZRegister& zm) { \
649
VIXL_ASSERT(allow_macro_instructions_); \
650
if (zd.Aliases(zm) && !zd.Aliases(zn)) { \
651
UseScratchRegisterScope temps(this); \
652
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
653
Mov(scratch, zm); \
654
MovprfxHelperScope guard(this, zd, pg, zn); \
655
ASMFN(zd, pg, zd, scratch); \
656
} else { \
657
MovprfxHelperScope guard(this, zd, pg, zn); \
658
ASMFN(zd, pg, zd, zm); \
659
} \
660
}
661
VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
662
#undef VIXL_DEFINE_MASM_FUNC
663
664
// Instructions of the form "inst zda, pg, zda, zn", where they are
665
// non-commutative and a reversed form is provided.
666
#define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
667
V(Asr, asr) \
668
V(Fdiv, fdiv) \
669
V(Fsub, fsub) \
670
V(Lsl, lsl) \
671
V(Lsr, lsr) \
672
V(Sdiv, sdiv) \
673
V(Shsub, shsub) \
674
V(Sqrshl, sqrshl) \
675
V(Sqshl, sqshl) \
676
V(Sqsub, sqsub) \
677
V(Srshl, srshl) \
678
V(Sub, sub) \
679
V(Udiv, udiv) \
680
V(Uhsub, uhsub) \
681
V(Uqrshl, uqrshl) \
682
V(Uqshl, uqshl) \
683
V(Uqsub, uqsub) \
684
V(Urshl, urshl)
685
686
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
687
void MacroAssembler::MASMFN(const ZRegister& zd, \
688
const PRegisterM& pg, \
689
const ZRegister& zn, \
690
const ZRegister& zm) { \
691
VIXL_ASSERT(allow_macro_instructions_); \
692
NoncommutativeArithmeticHelper(zd, \
693
pg, \
694
zn, \
695
zm, \
696
static_cast<SVEArithPredicatedFn>( \
697
&Assembler::ASMFN), \
698
static_cast<SVEArithPredicatedFn>( \
699
&Assembler::ASMFN##r)); \
700
}
701
VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
702
#undef VIXL_DEFINE_MASM_FUNC
703
704
void MacroAssembler::Fadd(const ZRegister& zd,
705
const PRegisterM& pg,
706
const ZRegister& zn,
707
const ZRegister& zm,
708
FPMacroNaNPropagationOption nan_option) {
709
VIXL_ASSERT(allow_macro_instructions_);
710
FPCommutativeArithmeticHelper(zd,
711
pg,
712
zn,
713
zm,
714
static_cast<SVEArithPredicatedFn>(
715
&Assembler::fadd),
716
nan_option);
717
}
718
719
void MacroAssembler::Fabd(const ZRegister& zd,
720
const PRegisterM& pg,
721
const ZRegister& zn,
722
const ZRegister& zm,
723
FPMacroNaNPropagationOption nan_option) {
724
VIXL_ASSERT(allow_macro_instructions_);
725
FPCommutativeArithmeticHelper(zd,
726
pg,
727
zn,
728
zm,
729
static_cast<SVEArithPredicatedFn>(
730
&Assembler::fabd),
731
nan_option);
732
}
733
734
void MacroAssembler::Fmul(const ZRegister& zd,
735
const PRegisterM& pg,
736
const ZRegister& zn,
737
const ZRegister& zm,
738
FPMacroNaNPropagationOption nan_option) {
739
VIXL_ASSERT(allow_macro_instructions_);
740
FPCommutativeArithmeticHelper(zd,
741
pg,
742
zn,
743
zm,
744
static_cast<SVEArithPredicatedFn>(
745
&Assembler::fmul),
746
nan_option);
747
}
748
749
void MacroAssembler::Fmulx(const ZRegister& zd,
750
const PRegisterM& pg,
751
const ZRegister& zn,
752
const ZRegister& zm,
753
FPMacroNaNPropagationOption nan_option) {
754
VIXL_ASSERT(allow_macro_instructions_);
755
FPCommutativeArithmeticHelper(zd,
756
pg,
757
zn,
758
zm,
759
static_cast<SVEArithPredicatedFn>(
760
&Assembler::fmulx),
761
nan_option);
762
}
763
764
void MacroAssembler::Fmax(const ZRegister& zd,
765
const PRegisterM& pg,
766
const ZRegister& zn,
767
const ZRegister& zm,
768
FPMacroNaNPropagationOption nan_option) {
769
VIXL_ASSERT(allow_macro_instructions_);
770
FPCommutativeArithmeticHelper(zd,
771
pg,
772
zn,
773
zm,
774
static_cast<SVEArithPredicatedFn>(
775
&Assembler::fmax),
776
nan_option);
777
}
778
779
void MacroAssembler::Fmin(const ZRegister& zd,
780
const PRegisterM& pg,
781
const ZRegister& zn,
782
const ZRegister& zm,
783
FPMacroNaNPropagationOption nan_option) {
784
VIXL_ASSERT(allow_macro_instructions_);
785
FPCommutativeArithmeticHelper(zd,
786
pg,
787
zn,
788
zm,
789
static_cast<SVEArithPredicatedFn>(
790
&Assembler::fmin),
791
nan_option);
792
}
793
794
void MacroAssembler::Fmaxnm(const ZRegister& zd,
795
const PRegisterM& pg,
796
const ZRegister& zn,
797
const ZRegister& zm,
798
FPMacroNaNPropagationOption nan_option) {
799
VIXL_ASSERT(allow_macro_instructions_);
800
FPCommutativeArithmeticHelper(zd,
801
pg,
802
zn,
803
zm,
804
static_cast<SVEArithPredicatedFn>(
805
&Assembler::fmaxnm),
806
nan_option);
807
}
808
809
void MacroAssembler::Fminnm(const ZRegister& zd,
810
const PRegisterM& pg,
811
const ZRegister& zn,
812
const ZRegister& zm,
813
FPMacroNaNPropagationOption nan_option) {
814
VIXL_ASSERT(allow_macro_instructions_);
815
FPCommutativeArithmeticHelper(zd,
816
pg,
817
zn,
818
zm,
819
static_cast<SVEArithPredicatedFn>(
820
&Assembler::fminnm),
821
nan_option);
822
}
823
824
void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
825
VIXL_ASSERT(allow_macro_instructions_);
826
827
switch (zd.GetLaneSizeInBits()) {
828
case kHRegSize:
829
Fdup(zd, Float16(imm));
830
break;
831
case kSRegSize:
832
Fdup(zd, static_cast<float>(imm));
833
break;
834
case kDRegSize:
835
uint64_t bits = DoubleToRawbits(imm);
836
if (IsImmFP64(bits)) {
837
SingleEmissionCheckScope guard(this);
838
fdup(zd, imm);
839
} else {
840
Dup(zd, bits);
841
}
842
break;
843
}
844
}
845
846
void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
847
VIXL_ASSERT(allow_macro_instructions_);
848
849
switch (zd.GetLaneSizeInBits()) {
850
case kHRegSize:
851
Fdup(zd, Float16(imm));
852
break;
853
case kSRegSize:
854
if (IsImmFP32(imm)) {
855
SingleEmissionCheckScope guard(this);
856
fdup(zd, imm);
857
} else {
858
Dup(zd, FloatToRawbits(imm));
859
}
860
break;
861
case kDRegSize:
862
Fdup(zd, static_cast<double>(imm));
863
break;
864
}
865
}
866
867
void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
868
VIXL_ASSERT(allow_macro_instructions_);
869
870
switch (zd.GetLaneSizeInBits()) {
871
case kHRegSize:
872
if (IsImmFP16(imm)) {
873
SingleEmissionCheckScope guard(this);
874
fdup(zd, imm);
875
} else {
876
Dup(zd, Float16ToRawbits(imm));
877
}
878
break;
879
case kSRegSize:
880
Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
881
break;
882
case kDRegSize:
883
Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
884
break;
885
}
886
}
887
888
void MacroAssembler::Index(const ZRegister& zd,
889
const Operand& start,
890
const Operand& step) {
891
class IndexOperand : public Operand {
892
public:
893
static IndexOperand Prepare(MacroAssembler* masm,
894
UseScratchRegisterScope* temps,
895
const Operand& op,
896
const ZRegister& zd_inner) {
897
// Look for encodable immediates.
898
int imm;
899
if (op.IsImmediate()) {
900
if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
901
return IndexOperand(imm);
902
}
903
Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
904
masm->Mov(scratch, op);
905
return IndexOperand(scratch);
906
} else {
907
// Plain registers can be encoded directly.
908
VIXL_ASSERT(op.IsPlainRegister());
909
return IndexOperand(op.GetRegister());
910
}
911
}
912
913
int GetImm5() const {
914
int64_t imm = GetImmediate();
915
VIXL_ASSERT(IsInt5(imm));
916
return static_cast<int>(imm);
917
}
918
919
private:
920
explicit IndexOperand(const Register& reg) : Operand(reg) {}
921
explicit IndexOperand(int64_t imm) : Operand(imm) {}
922
};
923
924
UseScratchRegisterScope temps(this);
925
IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
926
IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
927
928
SingleEmissionCheckScope guard(this);
929
if (start_enc.IsImmediate()) {
930
if (step_enc.IsImmediate()) {
931
index(zd, start_enc.GetImm5(), step_enc.GetImm5());
932
} else {
933
index(zd, start_enc.GetImm5(), step_enc.GetRegister());
934
}
935
} else {
936
if (step_enc.IsImmediate()) {
937
index(zd, start_enc.GetRegister(), step_enc.GetImm5());
938
} else {
939
index(zd, start_enc.GetRegister(), step_enc.GetRegister());
940
}
941
}
942
}
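// Editorial usage sketch (illustrative only): operands that fit in a signed
// 5-bit immediate are encoded directly; anything else is moved into a scratch
// register first.
//
//   masm.Index(z0.VnB(), 0, 1);    // index z0.b, #0, #1
//   masm.Index(z0.VnS(), x1, 42);  // 42 is not a valid #imm5, so it is moved
//                                  // into a scratch register and the
//                                  // register-register form is used.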
943
944
void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
945
VIXL_ASSERT(allow_macro_instructions_);
946
VIXL_ASSERT(imm.FitsInLane(zdn));
947
948
if (imm.IsZero()) {
949
SingleEmissionCheckScope guard(this);
950
insr(zdn, xzr);
951
return;
952
}
953
954
UseScratchRegisterScope temps(this);
955
Register scratch = temps.AcquireRegisterToHoldLane(zdn);
956
957
// TODO: There are many cases where we could optimise immediates, such as by
958
// detecting repeating patterns or FP immediates. We should optimise and
959
// abstract this for use in other SVE mov-immediate-like macros.
960
Mov(scratch, imm);
961
962
SingleEmissionCheckScope guard(this);
963
insr(zdn, scratch);
964
}
965
966
void MacroAssembler::Mla(const ZRegister& zd,
967
const PRegisterM& pg,
968
const ZRegister& za,
969
const ZRegister& zn,
970
const ZRegister& zm) {
971
VIXL_ASSERT(allow_macro_instructions_);
972
if (zd.Aliases(za)) {
973
// zda = zda + (zn * zm)
974
SingleEmissionCheckScope guard(this);
975
mla(zd, pg, zn, zm);
976
} else if (zd.Aliases(zn)) {
977
// zdn = za + (zdn * zm)
978
SingleEmissionCheckScope guard(this);
979
mad(zd, pg, zm, za);
980
} else if (zd.Aliases(zm)) {
981
// Multiplication is commutative, so we can swap zn and zm.
982
// zdm = za + (zdm * zn)
983
SingleEmissionCheckScope guard(this);
984
mad(zd, pg, zn, za);
985
} else {
986
// zd = za + (zn * zm)
987
ExactAssemblyScope guard(this, 2 * kInstructionSize);
988
movprfx(zd, pg, za);
989
mla(zd, pg, zn, zm);
990
}
991
}
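// Editorial usage sketch (illustrative only): how Mla chooses between `mla`,
// `mad` and a movprfx sequence depending on register aliasing.
//
//   masm.Mla(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS(), z2.VnS());  // mla
//   masm.Mla(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS(), z2.VnS());  // mad
//   masm.Mla(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS(), z3.VnS());  // movprfx
//                                                                    // + mla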
992
993
void MacroAssembler::Mls(const ZRegister& zd,
994
const PRegisterM& pg,
995
const ZRegister& za,
996
const ZRegister& zn,
997
const ZRegister& zm) {
998
VIXL_ASSERT(allow_macro_instructions_);
999
if (zd.Aliases(za)) {
1000
// zda = zda - (zn * zm)
1001
SingleEmissionCheckScope guard(this);
1002
mls(zd, pg, zn, zm);
1003
} else if (zd.Aliases(zn)) {
1004
// zdn = za - (zdn * zm)
1005
SingleEmissionCheckScope guard(this);
1006
msb(zd, pg, zm, za);
1007
} else if (zd.Aliases(zm)) {
1008
// Multiplication is commutative, so we can swap zn and zm.
1009
// zdm = za - (zdm * zn)
1010
SingleEmissionCheckScope guard(this);
1011
msb(zd, pg, zn, za);
1012
} else {
1013
// zd = za - (zn * zm)
1014
ExactAssemblyScope guard(this, 2 * kInstructionSize);
1015
movprfx(zd, pg, za);
1016
mls(zd, pg, zn, zm);
1017
}
1018
}
1019
1020
void MacroAssembler::CompareHelper(Condition cond,
1021
const PRegisterWithLaneSize& pd,
1022
const PRegisterZ& pg,
1023
const ZRegister& zn,
1024
IntegerOperand imm) {
1025
UseScratchRegisterScope temps(this);
1026
ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1027
Dup(zm, imm);
1028
SingleEmissionCheckScope guard(this);
1029
cmp(cond, pd, pg, zn, zm);
1030
}
1031
1032
void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
1033
const PRegister& pg,
1034
const PRegisterWithLaneSize& pn) {
1035
VIXL_ASSERT(allow_macro_instructions_);
1036
VIXL_ASSERT(pd.IsLaneSizeB());
1037
VIXL_ASSERT(pn.IsLaneSizeB());
1038
if (pd.Is(pn)) {
1039
SingleEmissionCheckScope guard(this);
1040
pfirst(pd, pg, pn);
1041
} else {
1042
UseScratchRegisterScope temps(this);
1043
PRegister temp_pg = pg;
1044
if (pd.Aliases(pg)) {
1045
temp_pg = temps.AcquireP();
1046
Mov(temp_pg.VnB(), pg.VnB());
1047
}
1048
Mov(pd, pn);
1049
SingleEmissionCheckScope guard(this);
1050
pfirst(pd, temp_pg, pd);
1051
}
1052
}
1053
1054
void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
1055
const PRegister& pg,
1056
const PRegisterWithLaneSize& pn) {
1057
VIXL_ASSERT(allow_macro_instructions_);
1058
VIXL_ASSERT(AreSameFormat(pd, pn));
1059
if (pd.Is(pn)) {
1060
SingleEmissionCheckScope guard(this);
1061
pnext(pd, pg, pn);
1062
} else {
1063
UseScratchRegisterScope temps(this);
1064
PRegister temp_pg = pg;
1065
if (pd.Aliases(pg)) {
1066
temp_pg = temps.AcquireP();
1067
Mov(temp_pg.VnB(), pg.VnB());
1068
}
1069
Mov(pd.VnB(), pn.VnB());
1070
SingleEmissionCheckScope guard(this);
1071
pnext(pd, temp_pg, pd);
1072
}
1073
}
1074
1075
void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
1076
SVEPredicateConstraint pattern,
1077
FlagsUpdate s) {
1078
VIXL_ASSERT(allow_macro_instructions_);
1079
switch (s) {
1080
case LeaveFlags:
1081
Ptrue(pd, pattern);
1082
return;
1083
case SetFlags:
1084
Ptrues(pd, pattern);
1085
return;
1086
}
1087
VIXL_UNREACHABLE();
1088
}
1089
1090
void MacroAssembler::Sub(const ZRegister& zd,
1091
IntegerOperand imm,
1092
const ZRegister& zm) {
1093
VIXL_ASSERT(allow_macro_instructions_);
1094
1095
int imm8;
1096
int shift = -1;
1097
if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
1098
imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
1099
MovprfxHelperScope guard(this, zd, zm);
1100
subr(zd, zd, imm8, shift);
1101
} else {
1102
UseScratchRegisterScope temps(this);
1103
ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
1104
Dup(scratch, imm);
1105
1106
SingleEmissionCheckScope guard(this);
1107
sub(zd, scratch, zm);
1108
}
1109
}
1110
1111
void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
1112
const PRegisterZ& pg,
1113
const SVEMemOperand& addr,
1114
SVELoadBroadcastFn fn,
1115
int divisor) {
1116
VIXL_ASSERT(addr.IsScalarPlusImmediate());
1117
int64_t imm = addr.GetImmediateOffset();
1118
if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
1119
SingleEmissionCheckScope guard(this);
1120
(this->*fn)(zt, pg, addr);
1121
} else {
1122
UseScratchRegisterScope temps(this);
1123
Register scratch = temps.AcquireX();
1124
CalculateSVEAddress(scratch, addr, zt);
1125
SingleEmissionCheckScope guard(this);
1126
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1127
}
1128
}
1129
1130
void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
1131
const SVEMemOperand& addr,
1132
SVELoadStoreFn fn) {
1133
VIXL_ASSERT(allow_macro_instructions_);
1134
VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
1135
1136
if (addr.IsPlainScalar() ||
1137
(addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
1138
addr.IsMulVl())) {
1139
SingleEmissionCheckScope guard(this);
1140
(this->*fn)(rt, addr);
1141
return;
1142
}
1143
1144
if (addr.IsEquivalentToScalar()) {
1145
SingleEmissionCheckScope guard(this);
1146
(this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
1147
return;
1148
}
1149
1150
UseScratchRegisterScope temps(this);
1151
Register scratch = temps.AcquireX();
1152
CalculateSVEAddress(scratch, addr, rt);
1153
SingleEmissionCheckScope guard(this);
1154
(this->*fn)(rt, SVEMemOperand(scratch));
1155
}
1156
1157
template <typename Tg, typename Tf>
1158
void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
1159
const ZRegister& zt,
1160
const Tg& pg,
1161
const SVEMemOperand& addr,
1162
Tf fn,
1163
int imm_bits,
1164
int shift_amount,
1165
SVEOffsetModifier supported_modifier,
1166
int vl_divisor_log2) {
1167
VIXL_ASSERT(allow_macro_instructions_);
1168
int imm_divisor = 1 << shift_amount;
1169
1170
if (addr.IsPlainScalar() ||
1171
(addr.IsScalarPlusImmediate() &&
1172
IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
1173
((addr.GetImmediateOffset() % imm_divisor) == 0) &&
1174
(addr.GetOffsetModifier() == supported_modifier))) {
1175
SingleEmissionCheckScope guard(this);
1176
(this->*fn)(zt, pg, addr);
1177
return;
1178
}
1179
1180
if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1181
addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
1182
SingleEmissionCheckScope guard(this);
1183
(this->*fn)(zt, pg, addr);
1184
return;
1185
}
1186
1187
if (addr.IsEquivalentToScalar()) {
1188
SingleEmissionCheckScope guard(this);
1189
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1190
return;
1191
}
1192
1193
if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
1194
(vl_divisor_log2 == -1)) {
1195
// We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
1196
// dependent.
1197
VIXL_UNIMPLEMENTED();
1198
}
1199
1200
UseScratchRegisterScope temps(this);
1201
Register scratch = temps.AcquireX();
1202
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1203
SingleEmissionCheckScope guard(this);
1204
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1205
}
1206
1207
template <typename Tg, typename Tf>
1208
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
1209
const ZRegister& zt,
1210
const Tg& pg,
1211
const SVEMemOperand& addr,
1212
Tf fn) {
1213
if (addr.IsPlainScalar() ||
1214
(addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1215
addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
1216
(addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
1217
addr.IsMulVl())) {
1218
SingleEmissionCheckScope guard(this);
1219
(this->*fn)(zt, pg, addr);
1220
return;
1221
}
1222
1223
if (addr.IsEquivalentToScalar()) {
1224
SingleEmissionCheckScope guard(this);
1225
(this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1226
return;
1227
}
1228
1229
if (addr.IsVectorPlusImmediate()) {
1230
uint64_t offset = addr.GetImmediateOffset();
1231
if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
1232
IsUint5(offset >> msize_in_bytes_log2)) {
1233
SingleEmissionCheckScope guard(this);
1234
(this->*fn)(zt, pg, addr);
1235
return;
1236
}
1237
}
1238
1239
if (addr.IsScalarPlusVector()) {
1240
VIXL_ASSERT(addr.IsScatterGather());
1241
SingleEmissionCheckScope guard(this);
1242
(this->*fn)(zt, pg, addr);
1243
return;
1244
}
1245
1246
UseScratchRegisterScope temps(this);
1247
if (addr.IsScatterGather()) {
1248
// In scatter-gather modes, zt and zn/zm have the same lane size. However,
1249
// for 32-bit accesses, the result of each lane's address calculation still
1250
// requires 64 bits; we can't naively use `Adr` for the address calculation
1251
// because it would truncate each address to 32 bits.
1252
1253
if (addr.IsVectorPlusImmediate()) {
1254
// Synthesise the immediate in an X register, then use a
1255
// scalar-plus-vector access with the original vector.
1256
Register scratch = temps.AcquireX();
1257
Mov(scratch, addr.GetImmediateOffset());
1258
SingleEmissionCheckScope guard(this);
1259
SVEOffsetModifier om =
1260
zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
1261
(this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
1262
return;
1263
}
1264
1265
VIXL_UNIMPLEMENTED();
1266
} else {
1267
Register scratch = temps.AcquireX();
1268
// TODO: If we have an immediate offset that is a multiple of
1269
// msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
1270
// save an instruction.
1271
int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
1272
CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1273
SingleEmissionCheckScope guard(this);
1274
(this->*fn)(zt, pg, SVEMemOperand(scratch));
1275
}
1276
}
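// Editorial usage sketch (illustrative only): the wrappers defined below (for
// example Ld1w) hand their address to this helper, which either emits the
// access directly or synthesises the address in a scratch register.
//
//   // IsInt4(7) holds, so this is a single ld1w with a MUL VL offset.
//   masm.Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
//   // The offset is out of range: CalculateSVEAddress builds the address in
//   // a scratch X register and the load uses that register instead.
//   masm.Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 100, SVE_MUL_VL));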
1277
1278
template <typename Tf>
1279
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
1280
const ZRegister& zt,
1281
const PRegisterZ& pg,
1282
const SVEMemOperand& addr,
1283
Tf fn) {
1284
if (addr.IsScatterGather()) {
1285
// Scatter-gather first-fault loads share encodings with normal loads.
1286
SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
1287
return;
1288
}
1289
1290
// Contiguous first-faulting loads have no scalar-plus-immediate form at all,
1291
// so we don't do immediate synthesis.
1292
1293
// We cannot currently distinguish "[x0]" from "[x0, #0]", and this
1294
// is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
1295
if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
1296
addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
1297
SingleEmissionCheckScope guard(this);
1298
(this->*fn)(zt, pg, addr);
1299
return;
1300
}
1301
1302
VIXL_UNIMPLEMENTED();
1303
}
1304
1305
void MacroAssembler::Ld1b(const ZRegister& zt,
1306
const PRegisterZ& pg,
1307
const SVEMemOperand& addr) {
1308
VIXL_ASSERT(allow_macro_instructions_);
1309
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1310
zt,
1311
pg,
1312
addr,
1313
static_cast<SVELoad1Fn>(&Assembler::ld1b));
1314
}
1315
1316
void MacroAssembler::Ld1h(const ZRegister& zt,
1317
const PRegisterZ& pg,
1318
const SVEMemOperand& addr) {
1319
VIXL_ASSERT(allow_macro_instructions_);
1320
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1321
zt,
1322
pg,
1323
addr,
1324
static_cast<SVELoad1Fn>(&Assembler::ld1h));
1325
}
1326
1327
void MacroAssembler::Ld1w(const ZRegister& zt,
1328
const PRegisterZ& pg,
1329
const SVEMemOperand& addr) {
1330
VIXL_ASSERT(allow_macro_instructions_);
1331
SVELoadStore1Helper(kWRegSizeInBytesLog2,
1332
zt,
1333
pg,
1334
addr,
1335
static_cast<SVELoad1Fn>(&Assembler::ld1w));
1336
}
1337
1338
void MacroAssembler::Ld1d(const ZRegister& zt,
1339
const PRegisterZ& pg,
1340
const SVEMemOperand& addr) {
1341
VIXL_ASSERT(allow_macro_instructions_);
1342
SVELoadStore1Helper(kDRegSizeInBytesLog2,
1343
zt,
1344
pg,
1345
addr,
1346
static_cast<SVELoad1Fn>(&Assembler::ld1d));
1347
}
1348
1349
void MacroAssembler::Ld1sb(const ZRegister& zt,
1350
const PRegisterZ& pg,
1351
const SVEMemOperand& addr) {
1352
VIXL_ASSERT(allow_macro_instructions_);
1353
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1354
zt,
1355
pg,
1356
addr,
1357
static_cast<SVELoad1Fn>(&Assembler::ld1sb));
1358
}
1359
1360
void MacroAssembler::Ld1sh(const ZRegister& zt,
1361
const PRegisterZ& pg,
1362
const SVEMemOperand& addr) {
1363
VIXL_ASSERT(allow_macro_instructions_);
1364
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1365
zt,
1366
pg,
1367
addr,
1368
static_cast<SVELoad1Fn>(&Assembler::ld1sh));
1369
}
1370
1371
void MacroAssembler::Ld1sw(const ZRegister& zt,
1372
const PRegisterZ& pg,
1373
const SVEMemOperand& addr) {
1374
VIXL_ASSERT(allow_macro_instructions_);
1375
SVELoadStore1Helper(kSRegSizeInBytesLog2,
1376
zt,
1377
pg,
1378
addr,
1379
static_cast<SVELoad1Fn>(&Assembler::ld1sw));
1380
}
1381
1382
void MacroAssembler::St1b(const ZRegister& zt,
1383
const PRegister& pg,
1384
const SVEMemOperand& addr) {
1385
VIXL_ASSERT(allow_macro_instructions_);
1386
SVELoadStore1Helper(kBRegSizeInBytesLog2,
1387
zt,
1388
pg,
1389
addr,
1390
static_cast<SVEStore1Fn>(&Assembler::st1b));
1391
}
1392
1393
void MacroAssembler::St1h(const ZRegister& zt,
1394
const PRegister& pg,
1395
const SVEMemOperand& addr) {
1396
VIXL_ASSERT(allow_macro_instructions_);
1397
SVELoadStore1Helper(kHRegSizeInBytesLog2,
1398
zt,
1399
pg,
1400
addr,
1401
static_cast<SVEStore1Fn>(&Assembler::st1h));
1402
}
1403
1404
void MacroAssembler::St1w(const ZRegister& zt,
1405
const PRegister& pg,
1406
const SVEMemOperand& addr) {
1407
VIXL_ASSERT(allow_macro_instructions_);
1408
SVELoadStore1Helper(kSRegSizeInBytesLog2,
1409
zt,
1410
pg,
1411
addr,
1412
static_cast<SVEStore1Fn>(&Assembler::st1w));
1413
}
1414
1415
void MacroAssembler::St1d(const ZRegister& zt,
1416
const PRegister& pg,
1417
const SVEMemOperand& addr) {
1418
VIXL_ASSERT(allow_macro_instructions_);
1419
SVELoadStore1Helper(kDRegSizeInBytesLog2,
1420
zt,
1421
pg,
1422
addr,
1423
static_cast<SVEStore1Fn>(&Assembler::st1d));
1424
}
1425
1426
void MacroAssembler::Ldff1b(const ZRegister& zt,
1427
const PRegisterZ& pg,
1428
const SVEMemOperand& addr) {
1429
VIXL_ASSERT(allow_macro_instructions_);
1430
SVELoadFFHelper(kBRegSizeInBytesLog2,
1431
zt,
1432
pg,
1433
addr,
1434
static_cast<SVELoad1Fn>(&Assembler::ldff1b));
1435
}
1436
1437
void MacroAssembler::Ldff1h(const ZRegister& zt,
1438
const PRegisterZ& pg,
1439
const SVEMemOperand& addr) {
1440
VIXL_ASSERT(allow_macro_instructions_);
1441
SVELoadFFHelper(kHRegSizeInBytesLog2,
1442
zt,
1443
pg,
1444
addr,
1445
static_cast<SVELoad1Fn>(&Assembler::ldff1h));
1446
}
1447
1448
void MacroAssembler::Ldff1w(const ZRegister& zt,
1449
const PRegisterZ& pg,
1450
const SVEMemOperand& addr) {
1451
VIXL_ASSERT(allow_macro_instructions_);
1452
SVELoadFFHelper(kSRegSizeInBytesLog2,
1453
zt,
1454
pg,
1455
addr,
1456
static_cast<SVELoad1Fn>(&Assembler::ldff1w));
1457
}
1458
1459
void MacroAssembler::Ldff1d(const ZRegister& zt,
1460
const PRegisterZ& pg,
1461
const SVEMemOperand& addr) {
1462
VIXL_ASSERT(allow_macro_instructions_);
1463
SVELoadFFHelper(kDRegSizeInBytesLog2,
1464
zt,
1465
pg,
1466
addr,
1467
static_cast<SVELoad1Fn>(&Assembler::ldff1d));
1468
}
1469
1470
void MacroAssembler::Ldff1sb(const ZRegister& zt,
1471
const PRegisterZ& pg,
1472
const SVEMemOperand& addr) {
1473
VIXL_ASSERT(allow_macro_instructions_);
1474
SVELoadFFHelper(kBRegSizeInBytesLog2,
1475
zt,
1476
pg,
1477
addr,
1478
static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
1479
}
1480
1481
void MacroAssembler::Ldff1sh(const ZRegister& zt,
1482
const PRegisterZ& pg,
1483
const SVEMemOperand& addr) {
1484
VIXL_ASSERT(allow_macro_instructions_);
1485
SVELoadFFHelper(kHRegSizeInBytesLog2,
1486
zt,
1487
pg,
1488
addr,
1489
static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
1490
}
1491
1492
void MacroAssembler::Ldff1sw(const ZRegister& zt,
1493
const PRegisterZ& pg,
1494
const SVEMemOperand& addr) {
1495
VIXL_ASSERT(allow_macro_instructions_);
1496
SVELoadFFHelper(kSRegSizeInBytesLog2,
1497
zt,
1498
pg,
1499
addr,
1500
static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
1501
}
1502
1503
#define VIXL_SVE_LD1R_LIST(V) \
1504
V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
1505
1506
#define VIXL_DEFINE_MASM_FUNC(SZ, SH) \
1507
void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \
1508
const PRegisterZ& pg, \
1509
const SVEMemOperand& addr) { \
1510
VIXL_ASSERT(allow_macro_instructions_); \
1511
SVELoadStoreNTBroadcastQOHelper(zt, \
1512
pg, \
1513
addr, \
1514
&MacroAssembler::ld1r##SZ, \
1515
4, \
1516
SH, \
1517
NO_SVE_OFFSET_MODIFIER, \
1518
-1); \
1519
}
1520
1521
VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
1522
1523
#undef VIXL_DEFINE_MASM_FUNC
1524
#undef VIXL_SVE_LD1R_LIST
1525
1526
void MacroAssembler::Ldnt1b(const ZRegister& zt,
1527
const PRegisterZ& pg,
1528
const SVEMemOperand& addr) {
1529
VIXL_ASSERT(allow_macro_instructions_);
1530
if (addr.IsVectorPlusScalar()) {
1531
SingleEmissionCheckScope guard(this);
1532
ldnt1b(zt, pg, addr);
1533
} else {
1534
SVELoadStoreNTBroadcastQOHelper(zt,
1535
pg,
1536
addr,
1537
&MacroAssembler::ldnt1b,
1538
4,
1539
0,
1540
SVE_MUL_VL);
1541
}
1542
}
1543
1544
void MacroAssembler::Ldnt1d(const ZRegister& zt,
1545
const PRegisterZ& pg,
1546
const SVEMemOperand& addr) {
1547
VIXL_ASSERT(allow_macro_instructions_);
1548
if (addr.IsVectorPlusScalar()) {
1549
SingleEmissionCheckScope guard(this);
1550
ldnt1d(zt, pg, addr);
1551
} else {
1552
SVELoadStoreNTBroadcastQOHelper(zt,
1553
pg,
1554
addr,
1555
&MacroAssembler::ldnt1d,
1556
4,
1557
0,
1558
SVE_MUL_VL);
1559
}
1560
}
1561
1562
void MacroAssembler::Ldnt1h(const ZRegister& zt,
1563
const PRegisterZ& pg,
1564
const SVEMemOperand& addr) {
1565
VIXL_ASSERT(allow_macro_instructions_);
1566
if (addr.IsVectorPlusScalar()) {
1567
SingleEmissionCheckScope guard(this);
1568
ldnt1h(zt, pg, addr);
1569
} else {
1570
SVELoadStoreNTBroadcastQOHelper(zt,
1571
pg,
1572
addr,
1573
&MacroAssembler::ldnt1h,
1574
4,
1575
0,
1576
SVE_MUL_VL);
1577
}
1578
}
1579
1580
void MacroAssembler::Ldnt1w(const ZRegister& zt,
1581
const PRegisterZ& pg,
1582
const SVEMemOperand& addr) {
1583
VIXL_ASSERT(allow_macro_instructions_);
1584
if (addr.IsVectorPlusScalar()) {
1585
SingleEmissionCheckScope guard(this);
1586
ldnt1w(zt, pg, addr);
1587
} else {
1588
SVELoadStoreNTBroadcastQOHelper(zt,
1589
pg,
1590
addr,
1591
&MacroAssembler::ldnt1w,
1592
4,
1593
0,
1594
SVE_MUL_VL);
1595
}
1596
}
1597
1598
void MacroAssembler::Stnt1b(const ZRegister& zt,
1599
const PRegister& pg,
1600
const SVEMemOperand& addr) {
1601
VIXL_ASSERT(allow_macro_instructions_);
1602
if (addr.IsVectorPlusScalar()) {
1603
SingleEmissionCheckScope guard(this);
1604
stnt1b(zt, pg, addr);
1605
} else {
1606
SVELoadStoreNTBroadcastQOHelper(zt,
1607
pg,
1608
addr,
1609
&MacroAssembler::stnt1b,
1610
4,
1611
0,
1612
SVE_MUL_VL);
1613
}
1614
}
1615
void MacroAssembler::Stnt1d(const ZRegister& zt,
1616
const PRegister& pg,
1617
const SVEMemOperand& addr) {
1618
VIXL_ASSERT(allow_macro_instructions_);
1619
if (addr.IsVectorPlusScalar()) {
1620
SingleEmissionCheckScope guard(this);
1621
stnt1d(zt, pg, addr);
1622
} else {
1623
SVELoadStoreNTBroadcastQOHelper(zt,
1624
pg,
1625
addr,
1626
&MacroAssembler::stnt1d,
1627
4,
1628
0,
1629
SVE_MUL_VL);
1630
}
1631
}
1632
void MacroAssembler::Stnt1h(const ZRegister& zt,
1633
const PRegister& pg,
1634
const SVEMemOperand& addr) {
1635
VIXL_ASSERT(allow_macro_instructions_);
1636
if (addr.IsVectorPlusScalar()) {
1637
SingleEmissionCheckScope guard(this);
1638
stnt1h(zt, pg, addr);
1639
} else {
1640
SVELoadStoreNTBroadcastQOHelper(zt,
1641
pg,
1642
addr,
1643
&MacroAssembler::stnt1h,
1644
4,
1645
0,
1646
SVE_MUL_VL);
1647
}
1648
}
1649
void MacroAssembler::Stnt1w(const ZRegister& zt,
1650
const PRegister& pg,
1651
const SVEMemOperand& addr) {
1652
VIXL_ASSERT(allow_macro_instructions_);
1653
if (addr.IsVectorPlusScalar()) {
1654
SingleEmissionCheckScope guard(this);
1655
stnt1w(zt, pg, addr);
1656
} else {
1657
SVELoadStoreNTBroadcastQOHelper(zt,
1658
pg,
1659
addr,
1660
&MacroAssembler::stnt1w,
1661
4,
1662
0,
1663
SVE_MUL_VL);
1664
}
1665
}
1666
1667
void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
1668
const ZRegister& zd,
1669
const ZRegister& za,
1670
const ZRegister& zn,
1671
const ZRegister& zm,
1672
int index) {
1673
if (zd.Aliases(za)) {
1674
// zda = zda + (zn . zm)
1675
SingleEmissionCheckScope guard(this);
1676
(this->*fn)(zd, zn, zm, index);
1677
1678
} else if (zd.Aliases(zn) || zd.Aliases(zm)) {
1679
// zdn = za + (zdn . zm[index])
1680
// zdm = za + (zn . zdm[index])
1681
// zdnm = za + (zdnm . zdnm[index])
1682
UseScratchRegisterScope temps(this);
1683
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1684
{
1685
MovprfxHelperScope guard(this, scratch, za);
1686
(this->*fn)(scratch, zn, zm, index);
1687
}
1688
1689
Mov(zd, scratch);
1690
} else {
1691
// zd = za + (zn . zm)
1692
MovprfxHelperScope guard(this, zd, za);
1693
(this->*fn)(zd, zn, zm, index);
1694
}
1695
}
1696
1697
void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
1698
const ZRegister& zd,
1699
const ZRegister& za,
1700
const ZRegister& zn,
1701
const ZRegister& zm) {
1702
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1703
// zd = za . zd . zm
1704
// zd = za . zn . zd
1705
// zd = za . zd . zd
1706
UseScratchRegisterScope temps(this);
1707
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1708
{
1709
MovprfxHelperScope guard(this, scratch, za);
1710
(this->*fn)(scratch, zn, zm);
1711
}
1712
1713
Mov(zd, scratch);
1714
} else {
1715
MovprfxHelperScope guard(this, zd, za);
1716
(this->*fn)(zd, zn, zm);
1717
}
1718
}
1719
1720
void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
1721
const ZRegister& zd,
1722
const ZRegister& za,
1723
const ZRegister& zn,
1724
const ZRegister& zm) {
1725
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1726
// zd = za . zd . zm
1727
// zd = za . zn . zd
1728
// zd = za . zd . zd
1729
UseScratchRegisterScope temps(this);
1730
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1731
{
1732
MovprfxHelperScope guard(this, scratch, za);
1733
(this->*fn)(scratch, scratch, zn, zm);
1734
}
1735
1736
Mov(zd, scratch);
1737
} else {
1738
MovprfxHelperScope guard(this, zd, za);
1739
(this->*fn)(zd, zd, zn, zm);
1740
}
1741
}
1742
1743
void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
1744
const ZRegister& zd,
1745
const ZRegister& za,
1746
const ZRegister& zn,
1747
const ZRegister& zm,
1748
int imm) {
1749
if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
1750
// zd = za . zd . zm[i]
1751
// zd = za . zn . zd[i]
1752
// zd = za . zd . zd[i]
1753
UseScratchRegisterScope temps(this);
1754
ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1755
{
1756
MovprfxHelperScope guard(this, scratch, za);
1757
(this->*fn)(scratch, zn, zm, imm);
1758
}
1759
1760
Mov(zd, scratch);
1761
} else {
1762
// zd = za . zn . zm[i]
1763
MovprfxHelperScope guard(this, zd, za);
1764
(this->*fn)(zd, zn, zm, imm);
1765
}
1766
}
1767
1768
void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
1769
const ZRegister& zd,
1770
const ZRegister& za,
1771
const ZRegister& zn,
1772
const ZRegister& zm) {
1773
if (zn.Aliases(zm)) {
1774
// If zn == zm, the difference is zero.
1775
if (!zd.Aliases(za)) {
1776
Mov(zd, za);
1777
}
1778
} else if (zd.Aliases(za)) {
1779
SingleEmissionCheckScope guard(this);
1780
(this->*fn)(zd, zn, zm);
1781
} else if (zd.Aliases(zn)) {
1782
UseScratchRegisterScope temps(this);
1783
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1784
Mov(ztmp, zn);
1785
MovprfxHelperScope guard(this, zd, za);
1786
(this->*fn)(zd, ztmp, zm);
1787
} else if (zd.Aliases(zm)) {
1788
UseScratchRegisterScope temps(this);
1789
ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1790
Mov(ztmp, zm);
1791
MovprfxHelperScope guard(this, zd, za);
1792
(this->*fn)(zd, zn, ztmp);
1793
} else {
1794
MovprfxHelperScope guard(this, zd, za);
1795
(this->*fn)(zd, zn, zm);
1796
}
1797
}
1798
1799
#define VIXL_SVE_4REG_LIST(V)                       \
  V(Saba, saba, AbsoluteDifferenceAccumulate)       \
  V(Uaba, uaba, AbsoluteDifferenceAccumulate)       \
  V(Sabalb, sabalb, AbsoluteDifferenceAccumulate)   \
  V(Sabalt, sabalt, AbsoluteDifferenceAccumulate)   \
  V(Uabalb, uabalb, AbsoluteDifferenceAccumulate)   \
  V(Uabalt, uabalt, AbsoluteDifferenceAccumulate)   \
  V(Sdot, sdot, FourRegDestructiveHelper)           \
  V(Udot, udot, FourRegDestructiveHelper)           \
  V(Adclb, adclb, FourRegDestructiveHelper)         \
  V(Adclt, adclt, FourRegDestructiveHelper)         \
  V(Sbclb, sbclb, FourRegDestructiveHelper)         \
  V(Sbclt, sbclt, FourRegDestructiveHelper)         \
  V(Smlalb, smlalb, FourRegDestructiveHelper)       \
  V(Smlalt, smlalt, FourRegDestructiveHelper)       \
  V(Smlslb, smlslb, FourRegDestructiveHelper)       \
  V(Smlslt, smlslt, FourRegDestructiveHelper)       \
  V(Umlalb, umlalb, FourRegDestructiveHelper)       \
  V(Umlalt, umlalt, FourRegDestructiveHelper)       \
  V(Umlslb, umlslb, FourRegDestructiveHelper)       \
  V(Umlslt, umlslt, FourRegDestructiveHelper)       \
  V(Bcax, bcax, FourRegDestructiveHelper)           \
  V(Bsl, bsl, FourRegDestructiveHelper)             \
  V(Bsl1n, bsl1n, FourRegDestructiveHelper)         \
  V(Bsl2n, bsl2n, FourRegDestructiveHelper)         \
  V(Eor3, eor3, FourRegDestructiveHelper)           \
  V(Nbsl, nbsl, FourRegDestructiveHelper)           \
  V(Fmlalb, fmlalb, FourRegDestructiveHelper)       \
  V(Fmlalt, fmlalt, FourRegDestructiveHelper)       \
  V(Fmlslb, fmlslb, FourRegDestructiveHelper)       \
  V(Fmlslt, fmlslt, FourRegDestructiveHelper)       \
  V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper)   \
  V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper)   \
  V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper)   \
  V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper)   \
  V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper)   \
  V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper)   \
  V(Fmmla, fmmla, FourRegDestructiveHelper)         \
  V(Smmla, smmla, FourRegDestructiveHelper)         \
  V(Ummla, ummla, FourRegDestructiveHelper)         \
  V(Usmmla, usmmla, FourRegDestructiveHelper)       \
  V(Usdot, usdot, FourRegDestructiveHelper)

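// Each entry in the list above is expanded by the VIXL_DEFINE_MASM_FUNC macro
// below into a thin MacroAssembler wrapper. As a rough sketch, the Saba entry
// becomes:
//
//   void MacroAssembler::Saba(const ZRegister& zd,
//                             const ZRegister& za,
//                             const ZRegister& zn,
//                             const ZRegister& zm) {
//     VIXL_ASSERT(allow_macro_instructions_);
//     AbsoluteDifferenceAccumulate(&Assembler::saba, zd, za, zn, zm);
//   }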
#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER)  \
  void MacroAssembler::MASMFN(const ZRegister& zd,    \
                              const ZRegister& za,    \
                              const ZRegister& zn,    \
                              const ZRegister& zm) {  \
    VIXL_ASSERT(allow_macro_instructions_);           \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm);        \
  }
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

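// Indexed (by-element) multiply-accumulate forms. The trailing imm argument
// is the lane index into zm, as in FourRegOneImmDestructiveHelper above.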
#define VIXL_SVE_4REG_1IMM_LIST(V)                      \
  V(Fmla, fmla, FourRegOneImmDestructiveHelper)         \
  V(Fmls, fmls, FourRegOneImmDestructiveHelper)         \
  V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper)     \
  V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper)     \
  V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper)     \
  V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper)     \
  V(Mla, mla, FourRegOneImmDestructiveHelper)           \
  V(Mls, mls, FourRegOneImmDestructiveHelper)           \
  V(Smlalb, smlalb, FourRegOneImmDestructiveHelper)     \
  V(Smlalt, smlalt, FourRegOneImmDestructiveHelper)     \
  V(Smlslb, smlslb, FourRegOneImmDestructiveHelper)     \
  V(Smlslt, smlslt, FourRegOneImmDestructiveHelper)     \
  V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
  V(Umlalb, umlalb, FourRegOneImmDestructiveHelper)     \
  V(Umlalt, umlalt, FourRegOneImmDestructiveHelper)     \
  V(Umlslb, umlslb, FourRegOneImmDestructiveHelper)     \
  V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER)   \
  void MacroAssembler::MASMFN(const ZRegister& zd,     \
                              const ZRegister& za,     \
                              const ZRegister& zn,     \
                              const ZRegister& zm,     \
                              int imm) {               \
    VIXL_ASSERT(allow_macro_instructions_);            \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm);    \
  }
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

void MacroAssembler::Sudot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}

void MacroAssembler::Usdot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index,
                          int rot) {
  // This doesn't handle zm when it's out of the range that can be encoded in
  // the instruction. The range depends on the element size: z0-z7 for B,
  // z0-z15 for H.
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      cdot(ztmp, zn, zm, index, rot);
    }
    Mov(zd, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, index, rot);
  }
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    VIXL_ASSERT(AreSameLaneSize(zn, zm));
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zd.Aliases(zn) ? zn : zm);
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, rot);
  }
}

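// Predicated FP multiply-accumulate helper. fn_zda is the form that is
// destructive in the accumulator (e.g. fmla, fmls, fnmla, fnmls); fn_zdn is
// the form that is destructive in the first multiplicand (e.g. fmad, fmsb,
// fnmad, fnmsb). The aliasing cases below pick whichever form can be emitted
// with at most a movprfx, using a scratch register when strict NaN
// propagation requires the operand order to be preserved.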
void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

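// FTMAD is destructive in its first source operand, so zn is moved into zd
// (via movprfx) before the instruction. If zd aliases zm but not zn, zm is
// copied to a scratch register first so that the movprfx does not clobber it.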
void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

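// Predicated FCMLA. If zd aliases one of the multiplicands but not the
// accumulator, the result is computed into a scratch register and then merged
// back into zd under pg, so that inactive lanes of zd are left unchanged,
// matching the behaviour of the non-aliasing path.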
void MacroAssembler::Fcmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      fcmla(ztmp, pg, zn, zm, rot);
    }
    Mov(zd, pg, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, pg, za);
    fcmla(zd, pg, zn, zm, rot);
  }
}

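// SVE2 provides a constructive SPLICE form that takes a pair of consecutive
// registers {zn, zm}; it is used when the CPU supports SVE2, zn and zm are
// consecutive, and zd does not alias zn. Otherwise the destructive SVE form
// is used, with a scratch register when zd aliases zm but not zn.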
void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    splice(zd, pg, zn, zm);
  } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

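// Shift-right-and-accumulate: zd = za + (zn >> shift), where fn selects the
// signed/unsigned and rounding variants (used below for Ssra, Usra, Srsra and
// Ursra). The instructions are destructive in the accumulator, so zn is
// copied to a scratch register when it aliases zd but za does not.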
void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
                                          const ZRegister& zd,
                                          const ZRegister& za,
                                          const ZRegister& zn,
                                          int shift) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(za) && zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zn);
    {
      MovprfxHelperScope guard(this, zd, za);
      (this->*fn)(zd, ztmp, shift);
    }
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, shift);
  }
}

void MacroAssembler::Srsra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}

void MacroAssembler::Ssra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}

void MacroAssembler::Ursra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}

void MacroAssembler::Usra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}

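// Complex addition: zd = zn + rotate(zm), where rot selects a rotation of 90
// or 270 degrees (used below for Cadd and Sqcadd). The instructions are
// destructive in the first source, so zm is copied to a scratch register when
// it aliases zd but zn does not.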
void MacroAssembler::ComplexAddition(ZZZImmFn fn,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     const ZRegister& zm,
                                     int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(zn) && zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(ztmp, zm);
    {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*fn)(zd, zd, ztmp, rot);
    }
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    (this->*fn)(zd, zd, zm, rot);
  }
}

void MacroAssembler::Cadd(const ZRegister& zd,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}

void MacroAssembler::Sqcadd(const ZRegister& zd,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            int rot) {
  ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}

}  // namespace aarch64
}  // namespace vixl