Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Roblox
GitHub Repository: Roblox/luau
Path: blob/master/CodeGen/src/IrLoweringA64.cpp
2725 views
1
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
2
#include "IrLoweringA64.h"
3
4
#include "Luau/DenseHash.h"
5
#include "Luau/IrData.h"
6
#include "Luau/IrUtils.h"
7
#include "Luau/LoweringStats.h"
8
9
#include "EmitCommonA64.h"
10
#include "NativeState.h"
11
12
#include "lstate.h"
13
#include "lgc.h"
14
15
LUAU_FASTFLAG(LuauCodegenBlockSafeEnv)
16
LUAU_FASTFLAG(LuauCodegenBufferRangeMerge4)
17
LUAU_FASTFLAG(LuauCodegenBufNoDefTag)
18
19
namespace Luau
20
{
21
namespace CodeGen
22
{
23
namespace A64
24
{
25
26
// Maps an IR comparison to the A64 condition code to test after a floating-point
// compare (fcmp). FP compares may produce an "unordered" result when either
// operand is NaN (which sets C and V, clears N and Z per the A64 architecture);
// the codes below differ from the plain signed-integer codes precisely so that
// ordered comparisons involving NaN evaluate to false and their negations true.
inline ConditionA64 getConditionFP(IrCondition cond)
{
    switch (cond)
    {
    case IrCondition::Equal:
        return ConditionA64::Equal;

    case IrCondition::NotEqual:
        return ConditionA64::NotEqual;

    case IrCondition::Less:
        // N is set only for an ordered "less than" result, so NaN compares false
        return ConditionA64::Minus;

    case IrCondition::NotLess:
        // true for ordered greater-or-equal results and for unordered (NaN) compares
        return ConditionA64::Plus;

    case IrCondition::LessEqual:
        // LS (C clear or Z set) is false for unordered results, which set C and clear Z
        return ConditionA64::UnsignedLessEqual;

    case IrCondition::NotLessEqual:
        // HI (C set and Z clear) is true for ordered greater and for unordered results
        return ConditionA64::UnsignedGreater;

    case IrCondition::Greater:
        return ConditionA64::Greater;

    case IrCondition::NotGreater:
        return ConditionA64::LessEqual;

    case IrCondition::GreaterEqual:
        return ConditionA64::GreaterEqual;

    case IrCondition::NotGreaterEqual:
        return ConditionA64::Less;

    default:
        CODEGEN_ASSERT(!"Unexpected condition code");
        return ConditionA64::Always;
    }
}
65
66
// Maps an IR comparison to the A64 condition code to test after an integer
// compare; handles both the signed and the explicitly-unsigned IR conditions.
inline ConditionA64 getConditionInt(IrCondition cond)
{
    switch (cond)
    {
    // equality
    case IrCondition::Equal:
        return ConditionA64::Equal;
    case IrCondition::NotEqual:
        return ConditionA64::NotEqual;

    // signed ordering
    case IrCondition::Less:
        return ConditionA64::Minus;
    case IrCondition::NotLess:
        return ConditionA64::Plus;
    case IrCondition::LessEqual:
        return ConditionA64::LessEqual;
    case IrCondition::NotLessEqual:
        return ConditionA64::Greater;
    case IrCondition::Greater:
        return ConditionA64::Greater;
    case IrCondition::NotGreater:
        return ConditionA64::LessEqual;
    case IrCondition::GreaterEqual:
        return ConditionA64::GreaterEqual;
    case IrCondition::NotGreaterEqual:
        return ConditionA64::Less;

    // unsigned ordering
    case IrCondition::UnsignedLess:
        return ConditionA64::CarryClear;
    case IrCondition::UnsignedLessEqual:
        return ConditionA64::UnsignedLessEqual;
    case IrCondition::UnsignedGreater:
        return ConditionA64::UnsignedGreater;
    case IrCondition::UnsignedGreaterEqual:
        return ConditionA64::CarrySet;

    default:
        CODEGEN_ASSERT(!"Unexpected condition code");
        return ConditionA64::Always;
    }
}
117
118
// Computes dst = src + offset, where offset may exceed the immediate range of
// the add instruction; dst must be distinct from src so it can double as a
// scratch register for large offsets.
static void emitAddOffset(AssemblyBuilderA64& build, RegisterA64 dst, RegisterA64 src, size_t offset)
{
    CODEGEN_ASSERT(dst != src);
    CODEGEN_ASSERT(offset <= INT_MAX);

    if (offset > AssemblyBuilderA64::kMaxImmediate)
    {
        // Offset doesn't fit in the add immediate field: materialize it first
        build.mov(dst, int(offset));
        build.add(dst, dst, src);
    }
    else
    {
        build.add(dst, src, uint16_t(offset));
    }
}
133
134
// Emits an out-of-line trap: normal control flow branches over the udf
// instruction, while jumps to 'abort' land directly on it.
static void emitAbort(AssemblyBuilderA64& build, Label& abort)
{
    Label cont;
    build.b(cont);
    build.setLabel(abort);
    build.udf();
    build.setLabel(cont);
}
142
143
// Emits a call to the interpreter fallback handler stored in NativeContext at
// byte 'offset', invoked as fallback(L, instruction, base, k) with
// 'instruction' pointing at the bytecode instruction at 'pcpos'.
// rBase is refreshed afterwards via emitUpdateBase since the handler can move
// the stack.
static void emitFallback(AssemblyBuilderA64& build, int offset, int pcpos)
{
    // fallback(L, instruction, base, k)
    build.mov(x0, rState);
    emitAddOffset(build, x1, rCode, pcpos * sizeof(Instruction));
    build.mov(x2, rBase);
    build.mov(x3, rConstants);
    build.ldr(x4, mem(rNativeContext, offset));
    build.blr(x4);

    emitUpdateBase(build);
}
155
156
// Emits a call to a libm-style helper (stored in NativeContext at byte offset
// 'func') taking one double plus an out-pointer, e.g. frexp/modf:
// d0 receives the number from stack slot 'arg', and x0 points at the
// sTemporary spill slot that receives the secondary result.
static void emitInvokeLibm1P(AssemblyBuilderA64& build, size_t func, int arg)
{
    // sTemporary must exist and be addressable with an add-immediate from sp
    CODEGEN_ASSERT(kTempSlots >= 1);
    CODEGEN_ASSERT(unsigned(sTemporary.data) <= AssemblyBuilderA64::kMaxImmediate);
    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
    build.add(x0, sp, uint16_t(sTemporary.data)); // sp-relative offset
    build.ldr(x1, mem(rNativeContext, uint32_t(func)));
    build.blr(x1);
}
165
166
// Lowers the builtin fastcalls that get manual A64 code instead of the generic
// path; returns false when 'bfid' isn't handled here (which asserts in debug).
// 'res' and 'arg' are VM stack slot indices; 'nresults' is the number of
// results the caller expects (1 or 2 for frexp/modf).
static bool emitBuiltin(AssemblyBuilderA64& build, IrFunction& function, IrRegAllocA64& regs, int bfid, int res, int arg, int nresults)
{
    switch (bfid)
    {
    case LBF_MATH_FREXP:
    {
        CODEGEN_ASSERT(nresults == 1 || nresults == 2);
        // frexp leaves the mantissa in d0 and writes the int exponent through
        // the out-pointer, i.e. into sTemporary
        emitInvokeLibm1P(build, offsetof(NativeContext, libm_frexp), arg);
        build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));

        RegisterA64 temp = regs.allocTemp(KindA64::w);
        build.mov(temp, LUA_TNUMBER);
        build.str(temp, mem(rBase, res * sizeof(TValue) + offsetof(TValue, tt)));

        if (nresults == 2)
        {
            // second result: convert the stored integer exponent to a double
            build.ldr(w0, sTemporary);
            build.scvtf(d1, w0);
            build.str(d1, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n)));
            build.str(temp, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, tt)));
        }
        return true;
    }
    case LBF_MATH_MODF:
    {
        CODEGEN_ASSERT(nresults == 1 || nresults == 2);
        // modf leaves the fractional part in d0 and writes the integral part
        // (a double) through the out-pointer into sTemporary; the integral part
        // is the first result
        emitInvokeLibm1P(build, offsetof(NativeContext, libm_modf), arg);
        build.ldr(d1, sTemporary);
        build.str(d1, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));

        RegisterA64 temp = regs.allocTemp(KindA64::w);
        build.mov(temp, LUA_TNUMBER);
        build.str(temp, mem(rBase, res * sizeof(TValue) + offsetof(TValue, tt)));

        if (nresults == 2)
        {
            // second result: the fractional part still sitting in d0
            build.str(d0, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n)));
            build.str(temp, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, tt)));
        }
        return true;
    }

    default:
        CODEGEN_ASSERT(!"Missing A64 lowering");
        return false;
    }
}
213
214
// Returns the raw IEEE-754 bit pattern of a double, using memcpy to avoid
// undefined behavior from type punning.
static uint64_t getDoubleBits(double value)
{
    static_assert(sizeof(uint64_t) == sizeof(double), "Expecting double to be 64-bit");
    uint64_t bits;
    memcpy(&bits, &value, sizeof(bits));
    return bits;
}
221
222
// Returns the raw IEEE-754 bit pattern of a float, using memcpy to avoid
// undefined behavior from type punning.
static uint32_t getFloatBits(float value)
{
    static_assert(sizeof(uint32_t) == sizeof(float), "Expecting float to be 32-bit");
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    return bits;
}
229
230
// Sets up per-function lowering state. The register allocator is seeded with
// the allocatable GPR ranges {x0-x15} and {x16-x17} and SIMD ranges {q0-q7}
// and {q16-q31} (the role of each set is defined by IrRegAllocA64).
// exitHandlerMap is constructed with ~0u — presumably the empty-key sentinel
// for the dense hash map; confirm against Luau/DenseHash.h.
IrLoweringA64::IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers, IrFunction& function, LoweringStats* stats)
    : build(build)
    , helpers(helpers)
    , function(function)
    , stats(stats)
    , regs(build, function, stats, {{x0, x15}, {x16, x17}, {q0, q7}, {q16, q31}})
    , valueTracker(function)
    , exitHandlerMap(~0u)
{
    // When the value tracker needs an instruction's value back, delegate to
    // the register allocator to restore it (regs.restoreReg).
    valueTracker.setRestoreCallback(
        this,
        [](void* context, IrInst& inst)
        {
            IrLoweringA64* self = static_cast<IrLoweringA64*>(context);
            self->regs.restoreReg(inst);
        }
    );
}
248
249
void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
250
{
251
regs.currInstIdx = index;
252
253
valueTracker.beforeInstLowering(inst);
254
255
switch (inst.cmd)
256
{
257
case IrCmd::LOAD_TAG:
258
{
259
inst.regA64 = regs.allocReg(KindA64::w, index);
260
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, tt));
261
build.ldr(inst.regA64, addr);
262
break;
263
}
264
case IrCmd::LOAD_POINTER:
265
{
266
inst.regA64 = regs.allocReg(KindA64::x, index);
267
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value.gc));
268
build.ldr(inst.regA64, addr);
269
break;
270
}
271
case IrCmd::LOAD_DOUBLE:
272
{
273
inst.regA64 = regs.allocReg(KindA64::d, index);
274
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value.n));
275
build.ldr(inst.regA64, addr);
276
break;
277
}
278
case IrCmd::LOAD_INT:
279
{
280
inst.regA64 = regs.allocReg(KindA64::w, index);
281
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value));
282
build.ldr(inst.regA64, addr);
283
break;
284
}
285
case IrCmd::LOAD_FLOAT:
286
{
287
inst.regA64 = regs.allocReg(KindA64::s, index);
288
AddressA64 addr = tempAddr(OP_A(inst), intOp(OP_B(inst)));
289
290
build.ldr(inst.regA64, addr);
291
break;
292
}
293
case IrCmd::LOAD_TVALUE:
294
{
295
inst.regA64 = regs.allocReg(KindA64::q, index);
296
297
int addrOffset = HAS_OP_B(inst) ? intOp(OP_B(inst)) : 0;
298
AddressA64 addr = tempAddr(OP_A(inst), addrOffset);
299
build.ldr(inst.regA64, addr);
300
break;
301
}
302
case IrCmd::LOAD_ENV:
303
inst.regA64 = regs.allocReg(KindA64::x, index);
304
build.ldr(inst.regA64, mem(rClosure, offsetof(Closure, env)));
305
break;
306
case IrCmd::GET_ARR_ADDR:
307
{
308
inst.regA64 = regs.allocReuse(KindA64::x, index, {OP_A(inst)});
309
build.ldr(inst.regA64, mem(regOp(OP_A(inst)), offsetof(LuaTable, array)));
310
311
if (OP_B(inst).kind == IrOpKind::Inst)
312
{
313
build.add(inst.regA64, inst.regA64, regOp(OP_B(inst)), kTValueSizeLog2); // implicit uxtw
314
}
315
else if (OP_B(inst).kind == IrOpKind::Constant)
316
{
317
if (intOp(OP_B(inst)) == 0)
318
{
319
// no offset required
320
}
321
else if (intOp(OP_B(inst)) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
322
{
323
build.add(inst.regA64, inst.regA64, uint16_t(intOp(OP_B(inst)) * sizeof(TValue)));
324
}
325
else
326
{
327
RegisterA64 temp = regs.allocTemp(KindA64::x);
328
build.mov(temp, intOp(OP_B(inst)) * sizeof(TValue));
329
build.add(inst.regA64, inst.regA64, temp);
330
}
331
}
332
else
333
CODEGEN_ASSERT(!"Unsupported instruction form");
334
break;
335
}
336
case IrCmd::GET_SLOT_NODE_ADDR:
337
{
338
inst.regA64 = regs.allocReuse(KindA64::x, index, {OP_A(inst)});
339
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
340
RegisterA64 temp1w = castReg(KindA64::w, temp1);
341
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
342
RegisterA64 temp2x = castReg(KindA64::x, temp2);
343
344
// note: since the stride of the load is the same as the destination register size, we can range check the array index, not the byte offset
345
if (uintOp(OP_B(inst)) <= AddressA64::kMaxOffset)
346
build.ldr(temp1w, mem(rCode, uintOp(OP_B(inst)) * sizeof(Instruction)));
347
else
348
{
349
build.mov(temp1, uintOp(OP_B(inst)) * sizeof(Instruction));
350
build.ldr(temp1w, mem(rCode, temp1));
351
}
352
353
// C field can be shifted as long as it's at the most significant byte of the instruction word
354
CODEGEN_ASSERT(kOffsetOfInstructionC == 3);
355
build.ldrb(temp2, mem(regOp(OP_A(inst)), offsetof(LuaTable, nodemask8)));
356
build.and_(temp2, temp2, temp1w, -24);
357
358
// note: this may clobber OP_A(inst), so it's important that we don't use it after this
359
build.ldr(inst.regA64, mem(regOp(OP_A(inst)), offsetof(LuaTable, node)));
360
build.add(inst.regA64, inst.regA64, temp2x, kLuaNodeSizeLog2); // "zero extend" temp2 to get a larger shift (top 32 bits are zero)
361
break;
362
}
363
case IrCmd::GET_HASH_NODE_ADDR:
364
{
365
inst.regA64 = regs.allocReuse(KindA64::x, index, {OP_A(inst)});
366
RegisterA64 temp1 = regs.allocTemp(KindA64::w);
367
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
368
RegisterA64 temp2x = castReg(KindA64::x, temp2);
369
370
// hash & ((1 << lsizenode) - 1) == hash & ~(-1 << lsizenode)
371
build.mov(temp1, -1);
372
build.ldrb(temp2, mem(regOp(OP_A(inst)), offsetof(LuaTable, lsizenode)));
373
build.lsl(temp1, temp1, temp2);
374
build.mov(temp2, uintOp(OP_B(inst)));
375
build.bic(temp2, temp2, temp1);
376
377
// note: this may clobber OP_A(inst), so it's important that we don't use it after this
378
build.ldr(inst.regA64, mem(regOp(OP_A(inst)), offsetof(LuaTable, node)));
379
build.add(inst.regA64, inst.regA64, temp2x, kLuaNodeSizeLog2); // "zero extend" temp2 to get a larger shift (top 32 bits are zero)
380
break;
381
}
382
case IrCmd::GET_CLOSURE_UPVAL_ADDR:
383
{
384
inst.regA64 = regs.allocReuse(KindA64::x, index, {OP_A(inst)});
385
RegisterA64 cl = OP_A(inst).kind == IrOpKind::Undef ? rClosure : regOp(OP_A(inst));
386
387
build.add(inst.regA64, cl, uint16_t(offsetof(Closure, l.uprefs) + sizeof(TValue) * vmUpvalueOp(OP_B(inst))));
388
break;
389
}
390
case IrCmd::STORE_TAG:
391
{
392
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, tt));
393
if (tagOp(OP_B(inst)) == 0)
394
{
395
build.str(wzr, addr);
396
}
397
else
398
{
399
RegisterA64 temp = regs.allocTemp(KindA64::w);
400
build.mov(temp, tagOp(OP_B(inst)));
401
build.str(temp, addr);
402
}
403
break;
404
}
405
case IrCmd::STORE_POINTER:
406
{
407
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value));
408
if (OP_B(inst).kind == IrOpKind::Constant)
409
{
410
CODEGEN_ASSERT(intOp(OP_B(inst)) == 0);
411
build.str(xzr, addr);
412
}
413
else
414
{
415
build.str(regOp(OP_B(inst)), addr);
416
}
417
break;
418
}
419
case IrCmd::STORE_EXTRA:
420
{
421
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, extra));
422
if (intOp(OP_B(inst)) == 0)
423
{
424
build.str(wzr, addr);
425
}
426
else
427
{
428
RegisterA64 temp = regs.allocTemp(KindA64::w);
429
build.mov(temp, intOp(OP_B(inst)));
430
build.str(temp, addr);
431
}
432
break;
433
}
434
case IrCmd::STORE_DOUBLE:
435
{
436
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value));
437
if (OP_B(inst).kind == IrOpKind::Constant && getDoubleBits(doubleOp(OP_B(inst))) == 0)
438
{
439
build.str(xzr, addr);
440
}
441
else
442
{
443
RegisterA64 temp = tempDouble(OP_B(inst));
444
build.str(temp, addr);
445
}
446
break;
447
}
448
case IrCmd::STORE_INT:
449
{
450
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value));
451
if (OP_B(inst).kind == IrOpKind::Constant && intOp(OP_B(inst)) == 0)
452
{
453
build.str(wzr, addr);
454
}
455
else
456
{
457
RegisterA64 temp = tempInt(OP_B(inst));
458
build.str(temp, addr);
459
}
460
break;
461
}
462
case IrCmd::STORE_VECTOR:
463
{
464
RegisterA64 temp1 = tempFloat(OP_B(inst));
465
RegisterA64 temp2 = tempFloat(OP_C(inst));
466
RegisterA64 temp3 = tempFloat(OP_D(inst));
467
468
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value));
469
CODEGEN_ASSERT(addr.kind == AddressKindA64::imm && addr.data % 4 == 0 && unsigned(addr.data + 8) / 4 <= AddressA64::kMaxOffset);
470
471
build.str(temp1, AddressA64(addr.base, addr.data + 0));
472
build.str(temp2, AddressA64(addr.base, addr.data + 4));
473
build.str(temp3, AddressA64(addr.base, addr.data + 8));
474
475
if (HAS_OP_E(inst))
476
{
477
RegisterA64 temp = regs.allocTemp(KindA64::w);
478
build.mov(temp, tagOp(OP_E(inst)));
479
build.str(temp, tempAddr(OP_A(inst), offsetof(TValue, tt)));
480
}
481
break;
482
}
483
case IrCmd::STORE_TVALUE:
484
{
485
int addrOffset = HAS_OP_C(inst) ? intOp(OP_C(inst)) : 0;
486
AddressA64 addr = tempAddr(OP_A(inst), addrOffset);
487
build.str(regOp(OP_B(inst)), addr);
488
break;
489
}
490
case IrCmd::STORE_SPLIT_TVALUE:
491
{
492
int addrOffset = HAS_OP_D(inst) ? intOp(OP_D(inst)) : 0;
493
494
RegisterA64 tempt = regs.allocTemp(KindA64::w);
495
AddressA64 addrt = tempAddr(OP_A(inst), offsetof(TValue, tt) + addrOffset);
496
build.mov(tempt, tagOp(OP_B(inst)));
497
build.str(tempt, addrt);
498
499
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, value) + addrOffset);
500
501
if (tagOp(OP_B(inst)) == LUA_TBOOLEAN)
502
{
503
if (OP_C(inst).kind == IrOpKind::Constant)
504
{
505
// note: we reuse tag temp register as value for true booleans, and use built-in zero register for false values
506
CODEGEN_ASSERT(LUA_TBOOLEAN == 1);
507
build.str(intOp(OP_C(inst)) ? tempt : wzr, addr);
508
}
509
else
510
build.str(regOp(OP_C(inst)), addr);
511
}
512
else if (tagOp(OP_B(inst)) == LUA_TNUMBER)
513
{
514
RegisterA64 temp = tempDouble(OP_C(inst));
515
build.str(temp, addr);
516
}
517
else if (isGCO(tagOp(OP_B(inst))))
518
{
519
build.str(regOp(OP_C(inst)), addr);
520
}
521
else
522
{
523
CODEGEN_ASSERT(!"Unsupported instruction form");
524
}
525
break;
526
}
527
case IrCmd::ADD_INT:
528
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
529
if (OP_B(inst).kind == IrOpKind::Constant && unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate)
530
build.add(inst.regA64, regOp(OP_A(inst)), uint16_t(intOp(OP_B(inst))));
531
else if (OP_A(inst).kind == IrOpKind::Constant && unsigned(intOp(OP_A(inst))) <= AssemblyBuilderA64::kMaxImmediate)
532
build.add(inst.regA64, regOp(OP_B(inst)), uint16_t(intOp(OP_A(inst))));
533
else
534
{
535
RegisterA64 temp1 = tempInt(OP_A(inst));
536
RegisterA64 temp2 = tempInt(OP_B(inst));
537
build.add(inst.regA64, temp1, temp2);
538
}
539
break;
540
case IrCmd::SUB_INT:
541
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
542
if (OP_B(inst).kind == IrOpKind::Constant && unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate)
543
build.sub(inst.regA64, regOp(OP_A(inst)), uint16_t(intOp(OP_B(inst))));
544
else
545
{
546
RegisterA64 temp1 = tempInt(OP_A(inst));
547
RegisterA64 temp2 = tempInt(OP_B(inst));
548
build.sub(inst.regA64, temp1, temp2);
549
}
550
break;
551
case IrCmd::SEXTI8_INT:
552
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
553
554
build.sbfx(inst.regA64, regOp(OP_A(inst)), 0, 8); // sextb
555
break;
556
case IrCmd::SEXTI16_INT:
557
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
558
559
build.sbfx(inst.regA64, regOp(OP_A(inst)), 0, 16); // sexth
560
break;
561
case IrCmd::ADD_NUM:
562
{
563
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
564
RegisterA64 temp1 = tempDouble(OP_A(inst));
565
RegisterA64 temp2 = tempDouble(OP_B(inst));
566
build.fadd(inst.regA64, temp1, temp2);
567
break;
568
}
569
case IrCmd::SUB_NUM:
570
{
571
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
572
RegisterA64 temp1 = tempDouble(OP_A(inst));
573
RegisterA64 temp2 = tempDouble(OP_B(inst));
574
build.fsub(inst.regA64, temp1, temp2);
575
break;
576
}
577
case IrCmd::MUL_NUM:
578
{
579
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
580
RegisterA64 temp1 = tempDouble(OP_A(inst));
581
RegisterA64 temp2 = tempDouble(OP_B(inst));
582
build.fmul(inst.regA64, temp1, temp2);
583
break;
584
}
585
case IrCmd::DIV_NUM:
586
{
587
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
588
RegisterA64 temp1 = tempDouble(OP_A(inst));
589
RegisterA64 temp2 = tempDouble(OP_B(inst));
590
build.fdiv(inst.regA64, temp1, temp2);
591
break;
592
}
593
case IrCmd::IDIV_NUM:
594
{
595
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
596
RegisterA64 temp1 = tempDouble(OP_A(inst));
597
RegisterA64 temp2 = tempDouble(OP_B(inst));
598
build.fdiv(inst.regA64, temp1, temp2);
599
build.frintm(inst.regA64, inst.regA64);
600
break;
601
}
602
case IrCmd::MOD_NUM:
603
{
604
inst.regA64 = regs.allocReg(KindA64::d, index); // can't allocReuse because both A and B are used twice
605
RegisterA64 temp1 = tempDouble(OP_A(inst));
606
RegisterA64 temp2 = tempDouble(OP_B(inst));
607
build.fdiv(inst.regA64, temp1, temp2);
608
build.frintm(inst.regA64, inst.regA64);
609
build.fmul(inst.regA64, inst.regA64, temp2);
610
build.fsub(inst.regA64, temp1, inst.regA64);
611
break;
612
}
613
case IrCmd::MULADD_NUM:
614
{
615
RegisterA64 tempA = tempDouble(OP_A(inst));
616
RegisterA64 tempB = tempDouble(OP_B(inst));
617
RegisterA64 tempC = tempDouble(OP_C(inst));
618
619
if ((build.features & Feature_AdvSIMD) != 0)
620
{
621
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_C(inst)});
622
if (inst.regA64 != tempC)
623
build.fmov(inst.regA64, tempC);
624
build.fmla(inst.regA64, tempB, tempA);
625
}
626
else
627
{
628
inst.regA64 = regs.allocReg(KindA64::d, index);
629
build.fmul(inst.regA64, tempB, tempA);
630
build.fadd(inst.regA64, inst.regA64, tempC);
631
}
632
break;
633
}
634
case IrCmd::MIN_NUM:
635
{
636
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
637
RegisterA64 temp1 = tempDouble(OP_A(inst));
638
RegisterA64 temp2 = tempDouble(OP_B(inst));
639
build.fcmp(temp1, temp2);
640
build.fcsel(inst.regA64, temp1, temp2, getConditionFP(IrCondition::Less));
641
break;
642
}
643
case IrCmd::MAX_NUM:
644
{
645
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst)});
646
RegisterA64 temp1 = tempDouble(OP_A(inst));
647
RegisterA64 temp2 = tempDouble(OP_B(inst));
648
build.fcmp(temp1, temp2);
649
build.fcsel(inst.regA64, temp1, temp2, getConditionFP(IrCondition::Greater));
650
break;
651
}
652
case IrCmd::UNM_NUM:
653
{
654
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
655
RegisterA64 temp = tempDouble(OP_A(inst));
656
build.fneg(inst.regA64, temp);
657
break;
658
}
659
case IrCmd::FLOOR_NUM:
660
{
661
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
662
RegisterA64 temp = tempDouble(OP_A(inst));
663
build.frintm(inst.regA64, temp);
664
break;
665
}
666
case IrCmd::CEIL_NUM:
667
{
668
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
669
RegisterA64 temp = tempDouble(OP_A(inst));
670
build.frintp(inst.regA64, temp);
671
break;
672
}
673
case IrCmd::ROUND_NUM:
674
{
675
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
676
RegisterA64 temp = tempDouble(OP_A(inst));
677
build.frinta(inst.regA64, temp);
678
break;
679
}
680
case IrCmd::SQRT_NUM:
681
{
682
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
683
RegisterA64 temp = tempDouble(OP_A(inst));
684
build.fsqrt(inst.regA64, temp);
685
break;
686
}
687
case IrCmd::ABS_NUM:
688
{
689
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
690
RegisterA64 temp = tempDouble(OP_A(inst));
691
build.fabs(inst.regA64, temp);
692
break;
693
}
694
case IrCmd::SIGN_NUM:
695
{
696
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst)});
697
698
RegisterA64 temp = tempDouble(OP_A(inst));
699
RegisterA64 temp0 = regs.allocTemp(KindA64::d);
700
RegisterA64 temp1 = regs.allocTemp(KindA64::d);
701
702
build.fcmpz(temp);
703
build.fmov(temp0, 0.0);
704
build.fmov(temp1, 1.0);
705
build.fcsel(inst.regA64, temp1, temp0, getConditionFP(IrCondition::Greater));
706
build.fmov(temp1, -1.0);
707
build.fcsel(inst.regA64, temp1, inst.regA64, getConditionFP(IrCondition::Less));
708
break;
709
}
710
case IrCmd::ADD_FLOAT:
711
{
712
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
713
RegisterA64 temp1 = tempFloat(OP_A(inst));
714
RegisterA64 temp2 = tempFloat(OP_B(inst));
715
build.fadd(inst.regA64, temp1, temp2);
716
break;
717
}
718
case IrCmd::SUB_FLOAT:
719
{
720
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
721
RegisterA64 temp1 = tempFloat(OP_A(inst));
722
RegisterA64 temp2 = tempFloat(OP_B(inst));
723
build.fsub(inst.regA64, temp1, temp2);
724
break;
725
}
726
case IrCmd::MUL_FLOAT:
727
{
728
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
729
RegisterA64 temp1 = tempFloat(OP_A(inst));
730
RegisterA64 temp2 = tempFloat(OP_B(inst));
731
build.fmul(inst.regA64, temp1, temp2);
732
break;
733
}
734
case IrCmd::DIV_FLOAT:
735
{
736
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
737
RegisterA64 temp1 = tempFloat(OP_A(inst));
738
RegisterA64 temp2 = tempFloat(OP_B(inst));
739
build.fdiv(inst.regA64, temp1, temp2);
740
break;
741
}
742
case IrCmd::MIN_FLOAT:
743
{
744
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
745
RegisterA64 temp1 = tempFloat(OP_A(inst));
746
RegisterA64 temp2 = tempFloat(OP_B(inst));
747
build.fcmp(temp1, temp2);
748
build.fcsel(inst.regA64, temp1, temp2, getConditionFP(IrCondition::Less));
749
break;
750
}
751
case IrCmd::MAX_FLOAT:
752
{
753
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst), OP_B(inst)});
754
RegisterA64 temp1 = tempFloat(OP_A(inst));
755
RegisterA64 temp2 = tempFloat(OP_B(inst));
756
build.fcmp(temp1, temp2);
757
build.fcsel(inst.regA64, temp1, temp2, getConditionFP(IrCondition::Greater));
758
break;
759
}
760
case IrCmd::UNM_FLOAT:
761
{
762
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
763
RegisterA64 temp = tempFloat(OP_A(inst));
764
build.fneg(inst.regA64, temp);
765
break;
766
}
767
case IrCmd::FLOOR_FLOAT:
768
{
769
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
770
RegisterA64 temp = tempFloat(OP_A(inst));
771
build.frintm(inst.regA64, temp);
772
break;
773
}
774
case IrCmd::CEIL_FLOAT:
775
{
776
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
777
RegisterA64 temp = tempFloat(OP_A(inst));
778
build.frintp(inst.regA64, temp);
779
break;
780
}
781
case IrCmd::SQRT_FLOAT:
782
{
783
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
784
RegisterA64 temp = tempFloat(OP_A(inst));
785
build.fsqrt(inst.regA64, temp);
786
break;
787
}
788
case IrCmd::ABS_FLOAT:
789
{
790
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
791
RegisterA64 temp = tempFloat(OP_A(inst));
792
build.fabs(inst.regA64, temp);
793
break;
794
}
795
case IrCmd::SIGN_FLOAT:
796
{
797
inst.regA64 = regs.allocReuse(KindA64::s, index, {OP_A(inst)});
798
799
RegisterA64 temp = tempFloat(OP_A(inst));
800
RegisterA64 temp0 = regs.allocTemp(KindA64::s);
801
RegisterA64 temp1 = regs.allocTemp(KindA64::s);
802
803
build.fcmpz(temp);
804
build.fmov(temp0, 0.0f);
805
build.fmov(temp1, 1.0f);
806
build.fcsel(inst.regA64, temp1, temp0, getConditionFP(IrCondition::Greater));
807
build.fmov(temp1, -1.0f);
808
build.fcsel(inst.regA64, temp1, inst.regA64, getConditionFP(IrCondition::Less));
809
break;
810
}
811
case IrCmd::SELECT_NUM:
812
{
813
inst.regA64 = regs.allocReuse(KindA64::d, index, {OP_A(inst), OP_B(inst), OP_C(inst), OP_D(inst)});
814
815
RegisterA64 temp1 = tempDouble(OP_A(inst));
816
RegisterA64 temp2 = tempDouble(OP_B(inst));
817
RegisterA64 temp3 = tempDouble(OP_C(inst));
818
RegisterA64 temp4 = tempDouble(OP_D(inst));
819
820
build.fcmp(temp3, temp4);
821
build.fcsel(inst.regA64, temp2, temp1, getConditionFP(IrCondition::Equal));
822
break;
823
}
824
case IrCmd::SELECT_VEC:
825
{
826
// `OP_B(inst)` cannot be reused for return value, because it can be overwritten with A before the first usage
827
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_C(inst), OP_D(inst)});
828
829
RegisterA64 temp1 = regOp(OP_A(inst));
830
RegisterA64 temp2 = regOp(OP_B(inst));
831
RegisterA64 temp3 = regOp(OP_C(inst));
832
RegisterA64 temp4 = regOp(OP_D(inst));
833
834
RegisterA64 mask = regs.allocTemp(KindA64::q);
835
836
// Evaluate predicate and calculate mask.
837
build.fcmeq_4s(mask, temp3, temp4);
838
// mov A to res register
839
build.mov(inst.regA64, temp1);
840
// If numbers are equal override A with B in res register.
841
build.bit(inst.regA64, temp2, mask);
842
break;
843
}
844
case IrCmd::SELECT_IF_TRUTHY:
845
{
846
inst.regA64 = regs.allocReg(KindA64::q, index);
847
848
// Place lhs as the result, we will overwrite it with rhs if 'A' is falsy later
849
build.mov(inst.regA64, regOp(OP_B(inst)));
850
851
// Get rhs register early, so a potential restore happens on both sides of a conditional control flow
852
RegisterA64 c = regOp(OP_C(inst));
853
854
RegisterA64 temp = regs.allocTemp(KindA64::w);
855
Label saveRhs, exit;
856
857
// Check tag first
858
build.umov_4s(temp, regOp(OP_A(inst)), 3);
859
build.cmp(temp, uint16_t(LUA_TBOOLEAN));
860
861
build.b(ConditionA64::UnsignedLess, saveRhs); // rhs if 'A' is nil
862
build.b(ConditionA64::UnsignedGreater, exit); // Keep lhs if 'A' is not a boolean
863
864
// Check the boolean value
865
build.umov_4s(temp, regOp(OP_A(inst)), 0);
866
build.cbnz(temp, exit); // Keep lhs if 'A' is true
867
868
build.setLabel(saveRhs);
869
build.mov(inst.regA64, c);
870
871
build.setLabel(exit);
872
break;
873
}
874
case IrCmd::MULADD_VEC:
875
{
876
RegisterA64 tempA = regOp(OP_A(inst));
877
RegisterA64 tempB = regOp(OP_B(inst));
878
RegisterA64 tempC = regOp(OP_C(inst));
879
880
if ((build.features & Feature_AdvSIMD) != 0)
881
{
882
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_C(inst)});
883
if (inst.regA64 != tempC)
884
build.mov(inst.regA64, tempC);
885
build.fmla(inst.regA64, tempB, tempA);
886
}
887
else
888
{
889
inst.regA64 = regs.allocReg(KindA64::q, index);
890
build.fmul(inst.regA64, tempB, tempA);
891
build.fadd(inst.regA64, inst.regA64, tempC);
892
}
893
break;
894
}
895
case IrCmd::ADD_VEC:
896
{
897
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
898
899
build.fadd(inst.regA64, regOp(OP_A(inst)), regOp(OP_B(inst)));
900
break;
901
}
902
case IrCmd::SUB_VEC:
903
{
904
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
905
906
build.fsub(inst.regA64, regOp(OP_A(inst)), regOp(OP_B(inst)));
907
break;
908
}
909
case IrCmd::MUL_VEC:
910
{
911
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
912
913
build.fmul(inst.regA64, regOp(OP_A(inst)), regOp(OP_B(inst)));
914
break;
915
}
916
case IrCmd::DIV_VEC:
917
{
918
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
919
920
build.fdiv(inst.regA64, regOp(OP_A(inst)), regOp(OP_B(inst)));
921
break;
922
}
923
case IrCmd::IDIV_VEC:
924
{
925
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
926
927
build.fdiv(inst.regA64, regOp(OP_A(inst)), regOp(OP_B(inst)));
928
build.frintm(inst.regA64, inst.regA64);
929
break;
930
}
931
case IrCmd::UNM_VEC:
932
{
933
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst)});
934
935
build.fneg(inst.regA64, regOp(OP_A(inst)));
936
break;
937
}
938
case IrCmd::MIN_VEC:
939
{
940
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
941
942
RegisterA64 temp1 = regOp(OP_A(inst));
943
RegisterA64 temp2 = regOp(OP_B(inst));
944
945
RegisterA64 mask = regs.allocTemp(KindA64::q);
946
947
// b > a == a < b
948
build.fcmgt_4s(mask, temp2, temp1);
949
950
// If A is already at the target, select B where mask is 0
951
if (inst.regA64 == temp1)
952
{
953
build.bif(inst.regA64, temp2, mask);
954
}
955
else
956
{
957
// Store B at the target unless it's there, select A where mask is 1
958
if (inst.regA64 != temp2)
959
build.mov(inst.regA64, temp2);
960
961
build.bit(inst.regA64, temp1, mask);
962
}
963
break;
964
}
965
case IrCmd::MAX_VEC:
966
{
967
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst), OP_B(inst)});
968
969
RegisterA64 temp1 = regOp(OP_A(inst));
970
RegisterA64 temp2 = regOp(OP_B(inst));
971
972
RegisterA64 mask = regs.allocTemp(KindA64::q);
973
974
build.fcmgt_4s(mask, temp1, temp2);
975
976
// If A is already at the target, select B where mask is 0
977
if (inst.regA64 == temp1)
978
{
979
build.bif(inst.regA64, temp2, mask);
980
}
981
else
982
{
983
// Store B at the target unless it's there, select A where mask is 1
984
if (inst.regA64 != temp2)
985
build.mov(inst.regA64, temp2);
986
987
build.bit(inst.regA64, temp1, mask);
988
}
989
break;
990
}
991
case IrCmd::FLOOR_VEC:
992
{
993
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst)});
994
995
build.frintm(inst.regA64, regOp(OP_A(inst)));
996
break;
997
}
998
case IrCmd::CEIL_VEC:
999
{
1000
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst)});
1001
1002
build.frintp(inst.regA64, regOp(OP_A(inst)));
1003
break;
1004
}
1005
case IrCmd::ABS_VEC:
1006
{
1007
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst)});
1008
build.fabs(inst.regA64, regOp(OP_A(inst)));
1009
break;
1010
}
1011
case IrCmd::DOT_VEC:
1012
{
1013
inst.regA64 = regs.allocReg(KindA64::s, index);
1014
1015
RegisterA64 temp = regs.allocTemp(KindA64::q);
1016
RegisterA64 temps = castReg(KindA64::s, temp);
1017
1018
build.fmul(temp, regOp(OP_A(inst)), regOp(OP_B(inst)));
1019
build.faddp(inst.regA64, temps); // x+y
1020
build.dup_4s(temp, temp, 2);
1021
build.fadd(inst.regA64, inst.regA64, temps); // +z
1022
break;
1023
}
1024
case IrCmd::EXTRACT_VEC:
1025
{
1026
inst.regA64 = regs.allocReg(KindA64::s, index);
1027
1028
if (intOp(OP_B(inst)) == 0)
1029
{
1030
// Lane vN.s[0] can just be read directly as sN
1031
build.fmov(inst.regA64, castReg(KindA64::s, regOp(OP_A(inst))));
1032
}
1033
else
1034
{
1035
build.dup_4s(inst.regA64, regOp(OP_A(inst)), intOp(OP_B(inst)));
1036
}
1037
break;
1038
}
1039
case IrCmd::NOT_ANY:
1040
{
1041
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
1042
1043
if (OP_A(inst).kind == IrOpKind::Constant)
1044
{
1045
// other cases should've been constant folded
1046
CODEGEN_ASSERT(tagOp(OP_A(inst)) == LUA_TBOOLEAN);
1047
build.eor(inst.regA64, regOp(OP_B(inst)), 1);
1048
}
1049
else
1050
{
1051
Label notBool, exit;
1052
1053
// use the fact that NIL is the only value less than BOOLEAN to do two tag comparisons at once
1054
CODEGEN_ASSERT(LUA_TNIL == 0 && LUA_TBOOLEAN == 1);
1055
build.cmp(regOp(OP_A(inst)), uint16_t(LUA_TBOOLEAN));
1056
build.b(ConditionA64::NotEqual, notBool);
1057
1058
if (OP_B(inst).kind == IrOpKind::Constant)
1059
build.mov(inst.regA64, intOp(OP_B(inst)) == 0 ? 1 : 0);
1060
else
1061
build.eor(inst.regA64, regOp(OP_B(inst)), 1); // boolean => invert value
1062
1063
build.b(exit);
1064
1065
// not boolean => result is true iff tag was nil
1066
build.setLabel(notBool);
1067
build.cset(inst.regA64, ConditionA64::Less);
1068
1069
build.setLabel(exit);
1070
}
1071
break;
1072
}
1073
case IrCmd::CMP_INT:
1074
{
1075
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
1076
1077
IrCondition cond = conditionOp(OP_C(inst));
1078
1079
if (OP_A(inst).kind == IrOpKind::Constant)
1080
{
1081
if (unsigned(intOp(OP_A(inst))) <= AssemblyBuilderA64::kMaxImmediate)
1082
build.cmp(regOp(OP_B(inst)), uint16_t(intOp(OP_A(inst))));
1083
else
1084
build.cmp(regOp(OP_B(inst)), tempInt(OP_A(inst)));
1085
1086
build.cset(inst.regA64, getInverseCondition(getConditionInt(cond)));
1087
}
1088
else if (OP_A(inst).kind == IrOpKind::Inst)
1089
{
1090
if (unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate)
1091
build.cmp(regOp(OP_A(inst)), uint16_t(intOp(OP_B(inst))));
1092
else
1093
build.cmp(regOp(OP_A(inst)), tempInt(OP_B(inst)));
1094
1095
build.cset(inst.regA64, getConditionInt(cond));
1096
}
1097
else
1098
{
1099
CODEGEN_ASSERT(!"Unsupported instruction form");
1100
}
1101
break;
1102
}
1103
case IrCmd::CMP_ANY:
1104
{
1105
CODEGEN_ASSERT(OP_A(inst).kind == IrOpKind::VmReg && OP_B(inst).kind == IrOpKind::VmReg);
1106
IrCondition cond = conditionOp(OP_C(inst));
1107
1108
Label skip, exit;
1109
1110
// For equality comparison, 'luaV_lessequal' expects tag to be equal before the call
1111
if (cond == IrCondition::Equal)
1112
{
1113
RegisterA64 tempa = regs.allocTemp(KindA64::w);
1114
RegisterA64 tempb = regs.allocTemp(KindA64::w);
1115
1116
build.ldr(tempa, tempAddr(OP_A(inst), offsetof(TValue, tt)));
1117
build.ldr(tempb, tempAddr(OP_B(inst), offsetof(TValue, tt)));
1118
build.cmp(tempa, tempb);
1119
1120
// If the tags are not equal, skip 'luaV_lessequal' call and set result to 0
1121
build.b(ConditionA64::NotEqual, skip);
1122
}
1123
1124
regs.spill(index);
1125
build.mov(x0, rState);
1126
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1127
build.add(x2, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1128
1129
if (cond == IrCondition::LessEqual)
1130
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaV_lessequal)));
1131
else if (cond == IrCondition::Less)
1132
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaV_lessthan)));
1133
else if (cond == IrCondition::Equal)
1134
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaV_equalval)));
1135
else
1136
CODEGEN_ASSERT(!"Unsupported condition");
1137
1138
build.blr(x3);
1139
1140
emitUpdateBase(build);
1141
1142
inst.regA64 = regs.takeReg(w0, index);
1143
1144
if (cond == IrCondition::Equal)
1145
{
1146
build.b(exit);
1147
build.setLabel(skip);
1148
1149
build.mov(inst.regA64, 0);
1150
build.setLabel(exit);
1151
}
1152
1153
// If case we made a call, skip high register bits clear, only consumer is JUMP_CMP_INT which doesn't read them
1154
break;
1155
}
1156
case IrCmd::CMP_TAG:
1157
{
1158
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
1159
1160
IrCondition cond = conditionOp(OP_C(inst));
1161
CODEGEN_ASSERT(cond == IrCondition::Equal || cond == IrCondition::NotEqual);
1162
RegisterA64 aReg = noreg;
1163
RegisterA64 bReg = noreg;
1164
1165
if (OP_A(inst).kind == IrOpKind::Inst)
1166
{
1167
aReg = regOp(OP_A(inst));
1168
}
1169
else if (OP_A(inst).kind == IrOpKind::VmReg)
1170
{
1171
aReg = regs.allocTemp(KindA64::w);
1172
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, tt));
1173
build.ldr(aReg, addr);
1174
}
1175
else
1176
{
1177
CODEGEN_ASSERT(OP_A(inst).kind == IrOpKind::Constant);
1178
}
1179
1180
if (OP_B(inst).kind == IrOpKind::Inst)
1181
{
1182
bReg = regOp(OP_B(inst));
1183
}
1184
else if (OP_B(inst).kind == IrOpKind::VmReg)
1185
{
1186
bReg = regs.allocTemp(KindA64::w);
1187
AddressA64 addr = tempAddr(OP_B(inst), offsetof(TValue, tt));
1188
build.ldr(bReg, addr);
1189
}
1190
else
1191
{
1192
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::Constant);
1193
}
1194
1195
if (OP_A(inst).kind == IrOpKind::Constant)
1196
{
1197
build.cmp(bReg, uint16_t(tagOp(OP_A(inst))));
1198
build.cset(inst.regA64, getInverseCondition(getConditionInt(cond)));
1199
}
1200
else if (OP_B(inst).kind == IrOpKind::Constant)
1201
{
1202
build.cmp(aReg, uint16_t(tagOp(OP_B(inst))));
1203
build.cset(inst.regA64, getConditionInt(cond));
1204
}
1205
else
1206
{
1207
build.cmp(aReg, bReg);
1208
build.cset(inst.regA64, getConditionInt(cond));
1209
}
1210
break;
1211
}
1212
case IrCmd::CMP_SPLIT_TVALUE:
1213
{
1214
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
1215
1216
// Second operand of this instruction must be a constant
1217
// Without a constant type, we wouldn't know the correct way to compare the values at lowering time
1218
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::Constant);
1219
1220
IrCondition cond = conditionOp(OP_E(inst));
1221
CODEGEN_ASSERT(cond == IrCondition::Equal || cond == IrCondition::NotEqual);
1222
1223
// Check tag equality first
1224
RegisterA64 temp = regs.allocTemp(KindA64::w);
1225
1226
if (OP_A(inst).kind != IrOpKind::Constant)
1227
{
1228
build.cmp(regOp(OP_A(inst)), uint16_t(tagOp(OP_B(inst))));
1229
build.cset(temp, getConditionInt(cond));
1230
}
1231
else
1232
{
1233
// Constant folding had to handle different constant tags
1234
CODEGEN_ASSERT(tagOp(OP_A(inst)) == tagOp(OP_B(inst)));
1235
}
1236
1237
if (tagOp(OP_B(inst)) == LUA_TBOOLEAN)
1238
{
1239
if (OP_C(inst).kind == IrOpKind::Constant)
1240
{
1241
CODEGEN_ASSERT(intOp(OP_C(inst)) == 0 || intOp(OP_C(inst)) == 1);
1242
build.cmp(regOp(OP_D(inst)), uint16_t(intOp(OP_C(inst)))); // swapped arguments
1243
}
1244
else if (OP_D(inst).kind == IrOpKind::Constant)
1245
{
1246
CODEGEN_ASSERT(intOp(OP_D(inst)) == 0 || intOp(OP_D(inst)) == 1);
1247
build.cmp(regOp(OP_C(inst)), uint16_t(intOp(OP_D(inst))));
1248
}
1249
else
1250
{
1251
build.cmp(regOp(OP_C(inst)), regOp(OP_D(inst)));
1252
}
1253
1254
build.cset(inst.regA64, getConditionInt(cond));
1255
}
1256
else if (tagOp(OP_B(inst)) == LUA_TSTRING)
1257
{
1258
build.cmp(regOp(OP_C(inst)), regOp(OP_D(inst)));
1259
build.cset(inst.regA64, getConditionInt(cond));
1260
}
1261
else if (tagOp(OP_B(inst)) == LUA_TNUMBER)
1262
{
1263
RegisterA64 temp1 = tempDouble(OP_C(inst));
1264
RegisterA64 temp2 = tempDouble(OP_D(inst));
1265
1266
build.fcmp(temp1, temp2);
1267
build.cset(inst.regA64, getConditionFP(cond));
1268
}
1269
else
1270
{
1271
CODEGEN_ASSERT(!"unsupported type tag in CMP_SPLIT_TVALUE");
1272
}
1273
1274
if (OP_A(inst).kind != IrOpKind::Constant)
1275
{
1276
if (cond == IrCondition::Equal)
1277
build.and_(inst.regA64, inst.regA64, temp);
1278
else
1279
build.orr(inst.regA64, inst.regA64, temp);
1280
}
1281
break;
1282
}
1283
case IrCmd::JUMP:
1284
if (OP_A(inst).kind == IrOpKind::Undef || OP_A(inst).kind == IrOpKind::VmExit)
1285
{
1286
Label fresh;
1287
build.b(getTargetLabel(OP_A(inst), fresh));
1288
finalizeTargetLabel(OP_A(inst), fresh);
1289
}
1290
else
1291
{
1292
jumpOrFallthrough(blockOp(OP_A(inst)), next);
1293
}
1294
break;
1295
case IrCmd::JUMP_IF_TRUTHY:
1296
{
1297
RegisterA64 temp = regs.allocTemp(KindA64::w);
1298
build.ldr(temp, mem(rBase, vmRegOp(OP_A(inst)) * sizeof(TValue) + offsetof(TValue, tt)));
1299
// nil => falsy
1300
CODEGEN_ASSERT(LUA_TNIL == 0);
1301
build.cbz(temp, labelOp(OP_C(inst)));
1302
// not boolean => truthy
1303
build.cmp(temp, uint16_t(LUA_TBOOLEAN));
1304
build.b(ConditionA64::NotEqual, labelOp(OP_B(inst)));
1305
// compare boolean value
1306
build.ldr(temp, mem(rBase, vmRegOp(OP_A(inst)) * sizeof(TValue) + offsetof(TValue, value)));
1307
build.cbnz(temp, labelOp(OP_B(inst)));
1308
jumpOrFallthrough(blockOp(OP_C(inst)), next);
1309
break;
1310
}
1311
case IrCmd::JUMP_IF_FALSY:
1312
{
1313
RegisterA64 temp = regs.allocTemp(KindA64::w);
1314
build.ldr(temp, mem(rBase, vmRegOp(OP_A(inst)) * sizeof(TValue) + offsetof(TValue, tt)));
1315
// nil => falsy
1316
CODEGEN_ASSERT(LUA_TNIL == 0);
1317
build.cbz(temp, labelOp(OP_B(inst)));
1318
// not boolean => truthy
1319
build.cmp(temp, uint16_t(LUA_TBOOLEAN));
1320
build.b(ConditionA64::NotEqual, labelOp(OP_C(inst)));
1321
// compare boolean value
1322
build.ldr(temp, mem(rBase, vmRegOp(OP_A(inst)) * sizeof(TValue) + offsetof(TValue, value)));
1323
build.cbz(temp, labelOp(OP_B(inst)));
1324
jumpOrFallthrough(blockOp(OP_C(inst)), next);
1325
break;
1326
}
1327
case IrCmd::JUMP_EQ_TAG:
1328
{
1329
RegisterA64 zr = noreg;
1330
RegisterA64 aReg = noreg;
1331
RegisterA64 bReg = noreg;
1332
1333
if (OP_A(inst).kind == IrOpKind::Inst)
1334
{
1335
aReg = regOp(OP_A(inst));
1336
}
1337
else if (OP_A(inst).kind == IrOpKind::VmReg)
1338
{
1339
aReg = regs.allocTemp(KindA64::w);
1340
AddressA64 addr = tempAddr(OP_A(inst), offsetof(TValue, tt));
1341
build.ldr(aReg, addr);
1342
}
1343
else
1344
{
1345
CODEGEN_ASSERT(OP_A(inst).kind == IrOpKind::Constant);
1346
}
1347
1348
if (OP_B(inst).kind == IrOpKind::Inst)
1349
{
1350
bReg = regOp(OP_B(inst));
1351
}
1352
else if (OP_B(inst).kind == IrOpKind::VmReg)
1353
{
1354
bReg = regs.allocTemp(KindA64::w);
1355
AddressA64 addr = tempAddr(OP_B(inst), offsetof(TValue, tt));
1356
build.ldr(bReg, addr);
1357
}
1358
else
1359
{
1360
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::Constant);
1361
}
1362
1363
if (OP_A(inst).kind == IrOpKind::Constant && tagOp(OP_A(inst)) == 0)
1364
zr = bReg;
1365
else if (OP_B(inst).kind == IrOpKind::Constant && tagOp(OP_B(inst)) == 0)
1366
zr = aReg;
1367
else if (OP_B(inst).kind == IrOpKind::Constant)
1368
build.cmp(aReg, uint16_t(tagOp(OP_B(inst))));
1369
else if (OP_A(inst).kind == IrOpKind::Constant)
1370
build.cmp(bReg, uint16_t(tagOp(OP_A(inst))));
1371
else
1372
build.cmp(aReg, bReg);
1373
1374
if (isFallthroughBlock(blockOp(OP_D(inst)), next))
1375
{
1376
if (zr != noreg)
1377
build.cbz(zr, labelOp(OP_C(inst)));
1378
else
1379
build.b(ConditionA64::Equal, labelOp(OP_C(inst)));
1380
jumpOrFallthrough(blockOp(OP_D(inst)), next);
1381
}
1382
else
1383
{
1384
if (zr != noreg)
1385
build.cbnz(zr, labelOp(OP_D(inst)));
1386
else
1387
build.b(ConditionA64::NotEqual, labelOp(OP_D(inst)));
1388
jumpOrFallthrough(blockOp(OP_C(inst)), next);
1389
}
1390
break;
1391
}
1392
case IrCmd::JUMP_CMP_INT:
1393
{
1394
IrCondition cond = conditionOp(OP_C(inst));
1395
1396
if (cond == IrCondition::Equal && intOp(OP_B(inst)) == 0)
1397
{
1398
build.cbz(regOp(OP_A(inst)), labelOp(OP_D(inst)));
1399
}
1400
else if (cond == IrCondition::NotEqual && intOp(OP_B(inst)) == 0)
1401
{
1402
build.cbnz(regOp(OP_A(inst)), labelOp(OP_D(inst)));
1403
}
1404
else
1405
{
1406
CODEGEN_ASSERT(unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate);
1407
build.cmp(regOp(OP_A(inst)), uint16_t(intOp(OP_B(inst))));
1408
build.b(getConditionInt(cond), labelOp(OP_D(inst)));
1409
}
1410
jumpOrFallthrough(blockOp(OP_E(inst)), next);
1411
break;
1412
}
1413
case IrCmd::JUMP_EQ_POINTER:
1414
build.cmp(regOp(OP_A(inst)), regOp(OP_B(inst)));
1415
build.b(ConditionA64::Equal, labelOp(OP_C(inst)));
1416
jumpOrFallthrough(blockOp(OP_D(inst)), next);
1417
break;
1418
case IrCmd::JUMP_CMP_NUM:
1419
{
1420
IrCondition cond = conditionOp(OP_C(inst));
1421
1422
if (OP_B(inst).kind == IrOpKind::Constant && doubleOp(OP_B(inst)) == 0.0)
1423
{
1424
RegisterA64 temp = tempDouble(OP_A(inst));
1425
1426
build.fcmpz(temp);
1427
}
1428
else
1429
{
1430
RegisterA64 temp1 = tempDouble(OP_A(inst));
1431
RegisterA64 temp2 = tempDouble(OP_B(inst));
1432
1433
build.fcmp(temp1, temp2);
1434
}
1435
1436
build.b(getConditionFP(cond), labelOp(OP_D(inst)));
1437
jumpOrFallthrough(blockOp(OP_E(inst)), next);
1438
break;
1439
}
1440
case IrCmd::JUMP_CMP_FLOAT:
1441
{
1442
IrCondition cond = conditionOp(OP_C(inst));
1443
1444
if (OP_B(inst).kind == IrOpKind::Constant && float(doubleOp(OP_B(inst))) == 0.0f)
1445
{
1446
RegisterA64 temp = tempFloat(OP_A(inst));
1447
1448
build.fcmpz(temp);
1449
}
1450
else
1451
{
1452
RegisterA64 temp1 = tempFloat(OP_A(inst));
1453
RegisterA64 temp2 = tempFloat(OP_B(inst));
1454
1455
build.fcmp(temp1, temp2);
1456
}
1457
1458
build.b(getConditionFP(cond), labelOp(OP_D(inst)));
1459
jumpOrFallthrough(blockOp(OP_E(inst)), next);
1460
break;
1461
}
1462
case IrCmd::JUMP_FORN_LOOP_COND:
1463
{
1464
RegisterA64 index = tempDouble(OP_A(inst));
1465
RegisterA64 limit = tempDouble(OP_B(inst));
1466
RegisterA64 step = tempDouble(OP_C(inst));
1467
1468
Label direct;
1469
1470
// step > 0
1471
build.fcmpz(step);
1472
build.b(getConditionFP(IrCondition::Greater), direct);
1473
1474
// !(limit <= index)
1475
build.fcmp(limit, index);
1476
build.b(getConditionFP(IrCondition::NotLessEqual), labelOp(OP_E(inst)));
1477
build.b(labelOp(OP_D(inst)));
1478
1479
// !(index <= limit)
1480
build.setLabel(direct);
1481
1482
build.fcmp(index, limit);
1483
build.b(getConditionFP(IrCondition::NotLessEqual), labelOp(OP_E(inst)));
1484
jumpOrFallthrough(blockOp(OP_D(inst)), next);
1485
break;
1486
}
1487
// IrCmd::JUMP_SLOT_MATCH implemented below
1488
case IrCmd::TABLE_LEN:
1489
{
1490
RegisterA64 reg = regOp(OP_A(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
1491
regs.spill(index, {reg});
1492
build.mov(x0, reg);
1493
build.ldr(x1, mem(rNativeContext, offsetof(NativeContext, luaH_getn)));
1494
build.blr(x1);
1495
1496
inst.regA64 = regs.takeReg(w0, index);
1497
1498
build.ubfx(inst.regA64, inst.regA64, 0, 32); // Ensure high register bits are cleared
1499
break;
1500
}
1501
case IrCmd::STRING_LEN:
1502
{
1503
inst.regA64 = regs.allocReg(KindA64::w, index);
1504
1505
build.ldr(inst.regA64, mem(regOp(OP_A(inst)), offsetof(TString, len)));
1506
break;
1507
}
1508
case IrCmd::TABLE_SETNUM:
1509
{
1510
// note: we need to call regOp before spill so that we don't do redundant reloads
1511
RegisterA64 table = regOp(OP_A(inst));
1512
RegisterA64 key = regOp(OP_B(inst));
1513
RegisterA64 temp = regs.allocTemp(KindA64::w);
1514
1515
regs.spill(index, {table, key});
1516
1517
if (w1 != key)
1518
{
1519
build.mov(x1, table);
1520
build.mov(w2, key);
1521
}
1522
else
1523
{
1524
build.mov(temp, w1);
1525
build.mov(x1, table);
1526
build.mov(w2, temp);
1527
}
1528
1529
build.mov(x0, rState);
1530
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaH_setnum)));
1531
build.blr(x3);
1532
inst.regA64 = regs.takeReg(x0, index);
1533
break;
1534
}
1535
case IrCmd::NEW_TABLE:
1536
{
1537
regs.spill(index);
1538
build.mov(x0, rState);
1539
build.mov(x1, uintOp(OP_A(inst)));
1540
build.mov(x2, uintOp(OP_B(inst)));
1541
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaH_new)));
1542
build.blr(x3);
1543
inst.regA64 = regs.takeReg(x0, index);
1544
break;
1545
}
1546
case IrCmd::DUP_TABLE:
1547
{
1548
RegisterA64 reg = regOp(OP_A(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
1549
regs.spill(index, {reg});
1550
build.mov(x1, reg);
1551
build.mov(x0, rState);
1552
build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaH_clone)));
1553
build.blr(x2);
1554
inst.regA64 = regs.takeReg(x0, index);
1555
break;
1556
}
1557
case IrCmd::TRY_NUM_TO_INDEX:
1558
{
1559
inst.regA64 = regs.allocReg(KindA64::w, index);
1560
RegisterA64 temp1 = tempDouble(OP_A(inst));
1561
1562
if (build.features & Feature_JSCVT)
1563
{
1564
build.fjcvtzs(inst.regA64, temp1); // fjcvtzs sets PSTATE.Z (equal) iff conversion is exact
1565
build.b(ConditionA64::NotEqual, labelOp(OP_B(inst)));
1566
}
1567
else
1568
{
1569
RegisterA64 temp2 = regs.allocTemp(KindA64::d);
1570
1571
build.fcvtzs(inst.regA64, temp1);
1572
build.scvtf(temp2, inst.regA64);
1573
build.fcmp(temp1, temp2);
1574
build.b(ConditionA64::NotEqual, labelOp(OP_B(inst)));
1575
}
1576
break;
1577
}
1578
case IrCmd::TRY_CALL_FASTGETTM:
1579
{
1580
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
1581
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
1582
1583
build.ldr(temp1, mem(regOp(OP_A(inst)), offsetof(LuaTable, metatable)));
1584
build.cbz(temp1, labelOp(OP_C(inst))); // no metatable
1585
1586
build.ldrb(temp2, mem(temp1, offsetof(LuaTable, tmcache)));
1587
build.tst(temp2, 1 << intOp(OP_B(inst))); // can't use tbz/tbnz because their jump offsets are too short
1588
build.b(ConditionA64::NotEqual, labelOp(OP_C(inst))); // Equal = Zero after tst; tmcache caches *absence* of metamethods
1589
1590
regs.spill(index, {temp1});
1591
build.mov(x0, temp1);
1592
build.mov(w1, intOp(OP_B(inst)));
1593
build.ldr(x2, mem(rGlobalState, offsetof(global_State, tmname) + intOp(OP_B(inst)) * sizeof(TString*)));
1594
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaT_gettm)));
1595
build.blr(x3);
1596
1597
build.cbz(x0, labelOp(OP_C(inst))); // no tag method
1598
1599
inst.regA64 = regs.takeReg(x0, index);
1600
break;
1601
}
1602
case IrCmd::NEW_USERDATA:
1603
{
1604
regs.spill(index);
1605
build.mov(x0, rState);
1606
build.mov(x1, intOp(OP_A(inst)));
1607
build.mov(x2, intOp(OP_B(inst)));
1608
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, newUserdata)));
1609
build.blr(x3);
1610
inst.regA64 = regs.takeReg(x0, index);
1611
break;
1612
}
1613
case IrCmd::INT_TO_NUM:
1614
{
1615
inst.regA64 = regs.allocReg(KindA64::d, index);
1616
RegisterA64 temp = tempInt(OP_A(inst));
1617
build.scvtf(inst.regA64, temp);
1618
break;
1619
}
1620
case IrCmd::UINT_TO_NUM:
1621
{
1622
inst.regA64 = regs.allocReg(KindA64::d, index);
1623
RegisterA64 temp = tempInt(OP_A(inst));
1624
build.ucvtf(inst.regA64, temp);
1625
break;
1626
}
1627
case IrCmd::UINT_TO_FLOAT:
1628
{
1629
inst.regA64 = regs.allocReg(KindA64::s, index);
1630
RegisterA64 temp = tempInt(OP_A(inst));
1631
build.ucvtf(inst.regA64, temp);
1632
break;
1633
}
1634
case IrCmd::NUM_TO_INT:
1635
{
1636
inst.regA64 = regs.allocReg(KindA64::w, index);
1637
RegisterA64 temp = tempDouble(OP_A(inst));
1638
build.fcvtzs(inst.regA64, temp);
1639
break;
1640
}
1641
case IrCmd::NUM_TO_UINT:
1642
{
1643
inst.regA64 = regs.allocReg(KindA64::w, index);
1644
RegisterA64 temp = tempDouble(OP_A(inst));
1645
// note: we don't use fcvtzu for consistency with C++ code
1646
build.fcvtzs(castReg(KindA64::x, inst.regA64), temp);
1647
break;
1648
}
1649
case IrCmd::FLOAT_TO_NUM:
1650
inst.regA64 = regs.allocReg(KindA64::d, index);
1651
1652
build.fcvt(inst.regA64, regOp(OP_A(inst)));
1653
break;
1654
case IrCmd::NUM_TO_FLOAT:
1655
inst.regA64 = regs.allocReg(KindA64::s, index);
1656
1657
build.fcvt(inst.regA64, regOp(OP_A(inst)));
1658
break;
1659
case IrCmd::FLOAT_TO_VEC:
1660
{
1661
inst.regA64 = regs.allocReg(KindA64::q, index);
1662
1663
if (OP_A(inst).kind == IrOpKind::Constant)
1664
{
1665
float value = float(doubleOp(OP_A(inst)));
1666
uint32_t asU32;
1667
static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit");
1668
memcpy(&asU32, &value, sizeof(value));
1669
1670
if (AssemblyBuilderA64::isFmovSupportedFp32(value))
1671
{
1672
build.fmov(inst.regA64, value);
1673
}
1674
else
1675
{
1676
RegisterA64 temp = regs.allocTemp(KindA64::x);
1677
1678
uint32_t vec[4] = {asU32, asU32, asU32, 0};
1679
build.adr(temp, vec, sizeof(vec));
1680
build.ldr(inst.regA64, temp);
1681
}
1682
}
1683
else
1684
{
1685
RegisterA64 temp = tempFloat(OP_A(inst));
1686
1687
build.dup_4s(inst.regA64, castReg(KindA64::q, temp), 0);
1688
}
1689
break;
1690
}
1691
case IrCmd::TAG_VECTOR:
1692
{
1693
inst.regA64 = regs.allocReuse(KindA64::q, index, {OP_A(inst)});
1694
1695
RegisterA64 reg = regOp(OP_A(inst));
1696
RegisterA64 tempw = regs.allocTemp(KindA64::w);
1697
1698
if (inst.regA64 != reg)
1699
build.mov(inst.regA64, reg);
1700
1701
build.mov(tempw, LUA_TVECTOR);
1702
build.ins_4s(inst.regA64, tempw, 3);
1703
break;
1704
}
1705
case IrCmd::TRUNCATE_UINT:
1706
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
1707
1708
build.ubfx(castReg(KindA64::x, inst.regA64), castReg(KindA64::x, regOp(OP_A(inst))), 0, 32); // explicit uxtw
1709
break;
1710
case IrCmd::ADJUST_STACK_TO_REG:
1711
{
1712
RegisterA64 temp = regs.allocTemp(KindA64::x);
1713
1714
if (OP_B(inst).kind == IrOpKind::Constant)
1715
{
1716
build.add(temp, rBase, uint16_t((vmRegOp(OP_A(inst)) + intOp(OP_B(inst))) * sizeof(TValue)));
1717
build.str(temp, mem(rState, offsetof(lua_State, top)));
1718
}
1719
else if (OP_B(inst).kind == IrOpKind::Inst)
1720
{
1721
build.add(temp, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1722
build.add(temp, temp, regOp(OP_B(inst)), kTValueSizeLog2); // implicit uxtw
1723
build.str(temp, mem(rState, offsetof(lua_State, top)));
1724
}
1725
else
1726
CODEGEN_ASSERT(!"Unsupported instruction form");
1727
break;
1728
}
1729
case IrCmd::ADJUST_STACK_TO_TOP:
1730
{
1731
RegisterA64 temp = regs.allocTemp(KindA64::x);
1732
build.ldr(temp, mem(rState, offsetof(lua_State, ci)));
1733
build.ldr(temp, mem(temp, offsetof(CallInfo, top)));
1734
build.str(temp, mem(rState, offsetof(lua_State, top)));
1735
break;
1736
}
1737
case IrCmd::FASTCALL:
1738
regs.spill(index);
1739
1740
error |= !emitBuiltin(build, function, regs, uintOp(OP_A(inst)), vmRegOp(OP_B(inst)), vmRegOp(OP_C(inst)), intOp(OP_D(inst)));
1741
break;
1742
case IrCmd::INVOKE_FASTCALL:
1743
{
1744
// We might need a temporary and we have to preserve it over the spill
1745
RegisterA64 temp = regs.allocTemp(KindA64::q);
1746
regs.spill(index, {temp});
1747
1748
build.mov(x0, rState);
1749
build.add(x1, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1750
build.add(x2, rBase, uint16_t(vmRegOp(OP_C(inst)) * sizeof(TValue)));
1751
build.mov(w3, intOp(OP_G(inst))); // nresults
1752
1753
// 'E' argument can only be produced by LOP_FASTCALL3 lowering
1754
if (OP_E(inst).kind != IrOpKind::Undef)
1755
{
1756
CODEGEN_ASSERT(intOp(OP_F(inst)) == 3);
1757
1758
build.ldr(x4, mem(rState, offsetof(lua_State, top)));
1759
1760
build.ldr(temp, mem(rBase, vmRegOp(OP_D(inst)) * sizeof(TValue)));
1761
build.str(temp, mem(x4, 0));
1762
1763
build.ldr(temp, mem(rBase, vmRegOp(OP_E(inst)) * sizeof(TValue)));
1764
build.str(temp, mem(x4, sizeof(TValue)));
1765
}
1766
else
1767
{
1768
if (OP_D(inst).kind == IrOpKind::VmReg)
1769
build.add(x4, rBase, uint16_t(vmRegOp(OP_D(inst)) * sizeof(TValue)));
1770
else if (OP_D(inst).kind == IrOpKind::VmConst)
1771
emitAddOffset(build, x4, rConstants, vmConstOp(OP_D(inst)) * sizeof(TValue));
1772
else
1773
CODEGEN_ASSERT(OP_D(inst).kind == IrOpKind::Undef);
1774
}
1775
1776
// nparams
1777
if (intOp(OP_F(inst)) == LUA_MULTRET)
1778
{
1779
// L->top - (ra + 1)
1780
build.ldr(x5, mem(rState, offsetof(lua_State, top)));
1781
build.sub(x5, x5, rBase);
1782
build.sub(x5, x5, uint16_t((vmRegOp(OP_B(inst)) + 1) * sizeof(TValue)));
1783
build.lsr(x5, x5, kTValueSizeLog2);
1784
}
1785
else
1786
build.mov(w5, intOp(OP_F(inst)));
1787
1788
build.ldr(x6, mem(rNativeContext, offsetof(NativeContext, luauF_table) + uintOp(OP_A(inst)) * sizeof(luau_FastFunction)));
1789
build.blr(x6);
1790
1791
inst.regA64 = regs.takeReg(w0, index);
1792
// Skipping high register bits clear, only consumer is CHECK_FASTCALL_RES which doesn't read them
1793
break;
1794
}
1795
case IrCmd::CHECK_FASTCALL_RES:
1796
build.cmp(regOp(OP_A(inst)), uint16_t(0));
1797
build.b(ConditionA64::Less, labelOp(OP_B(inst)));
1798
break;
1799
case IrCmd::DO_ARITH:
1800
regs.spill(index);
1801
build.mov(x0, rState);
1802
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1803
1804
if (OP_B(inst).kind == IrOpKind::VmConst)
1805
emitAddOffset(build, x2, rConstants, vmConstOp(OP_B(inst)) * sizeof(TValue));
1806
else
1807
build.add(x2, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1808
1809
if (OP_C(inst).kind == IrOpKind::VmConst)
1810
emitAddOffset(build, x3, rConstants, vmConstOp(OP_C(inst)) * sizeof(TValue));
1811
else
1812
build.add(x3, rBase, uint16_t(vmRegOp(OP_C(inst)) * sizeof(TValue)));
1813
1814
switch (TMS(intOp(OP_D(inst))))
1815
{
1816
case TM_ADD:
1817
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithadd)));
1818
break;
1819
case TM_SUB:
1820
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithsub)));
1821
break;
1822
case TM_MUL:
1823
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithmul)));
1824
break;
1825
case TM_DIV:
1826
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithdiv)));
1827
break;
1828
case TM_IDIV:
1829
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithidiv)));
1830
break;
1831
case TM_MOD:
1832
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithmod)));
1833
break;
1834
case TM_POW:
1835
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithpow)));
1836
break;
1837
case TM_UNM:
1838
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_doarithunm)));
1839
break;
1840
default:
1841
CODEGEN_ASSERT(!"Invalid doarith helper operation tag");
1842
break;
1843
}
1844
1845
build.blr(x4);
1846
1847
emitUpdateBase(build);
1848
break;
1849
case IrCmd::DO_LEN:
1850
regs.spill(index);
1851
build.mov(x0, rState);
1852
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1853
build.add(x2, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1854
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaV_dolen)));
1855
build.blr(x3);
1856
1857
emitUpdateBase(build);
1858
break;
1859
case IrCmd::GET_TABLE:
1860
regs.spill(index);
1861
build.mov(x0, rState);
1862
build.add(x1, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1863
1864
if (OP_C(inst).kind == IrOpKind::VmReg)
1865
build.add(x2, rBase, uint16_t(vmRegOp(OP_C(inst)) * sizeof(TValue)));
1866
else if (OP_C(inst).kind == IrOpKind::Constant)
1867
{
1868
TValue n = {};
1869
setnvalue(&n, uintOp(OP_C(inst)));
1870
build.adr(x2, &n, sizeof(n));
1871
}
1872
else
1873
CODEGEN_ASSERT(!"Unsupported instruction form");
1874
1875
build.add(x3, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1876
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_gettable)));
1877
build.blr(x4);
1878
1879
emitUpdateBase(build);
1880
break;
1881
case IrCmd::SET_TABLE:
1882
regs.spill(index);
1883
build.mov(x0, rState);
1884
build.add(x1, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
1885
1886
if (OP_C(inst).kind == IrOpKind::VmReg)
1887
build.add(x2, rBase, uint16_t(vmRegOp(OP_C(inst)) * sizeof(TValue)));
1888
else if (OP_C(inst).kind == IrOpKind::Constant)
1889
{
1890
TValue n = {};
1891
setnvalue(&n, uintOp(OP_C(inst)));
1892
build.adr(x2, &n, sizeof(n));
1893
}
1894
else
1895
CODEGEN_ASSERT(!"Unsupported instruction form");
1896
1897
build.add(x3, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1898
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaV_settable)));
1899
build.blr(x4);
1900
1901
emitUpdateBase(build);
1902
break;
1903
case IrCmd::GET_CACHED_IMPORT:
1904
{
1905
regs.spill(index);
1906
1907
Label skip, exit;
1908
1909
RegisterA64 tempTag = regs.allocTemp(KindA64::w);
1910
1911
AddressA64 addrConstTag = tempAddr(OP_B(inst), offsetof(TValue, tt));
1912
build.ldr(tempTag, addrConstTag);
1913
1914
// If the constant for the import is set, we will use it directly, otherwise we have to call an import path lookup function
1915
CODEGEN_ASSERT(LUA_TNIL == 0);
1916
build.cbnz(tempTag, skip);
1917
1918
{
1919
build.mov(x0, rState);
1920
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
1921
build.mov(w2, importOp(OP_C(inst)));
1922
build.mov(w3, uintOp(OP_D(inst)));
1923
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, getImport)));
1924
build.blr(x4);
1925
1926
emitUpdateBase(build);
1927
build.b(exit);
1928
}
1929
1930
build.setLabel(skip);
1931
1932
RegisterA64 tempTv = regs.allocTemp(KindA64::q);
1933
1934
AddressA64 addrConst = tempAddr(OP_B(inst), 0);
1935
build.ldr(tempTv, addrConst);
1936
1937
AddressA64 addrReg = tempAddr(OP_A(inst), 0);
1938
build.str(tempTv, addrReg);
1939
1940
build.setLabel(exit);
1941
break;
1942
}
1943
case IrCmd::CONCAT:
1944
regs.spill(index);
1945
build.mov(x0, rState);
1946
build.mov(w1, uintOp(OP_B(inst)));
1947
build.mov(w2, vmRegOp(OP_A(inst)) + uintOp(OP_B(inst)) - 1);
1948
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaV_concat)));
1949
build.blr(x3);
1950
1951
emitUpdateBase(build);
1952
break;
1953
case IrCmd::GET_UPVALUE:
1954
{
1955
inst.regA64 = regs.allocReg(KindA64::q, index);
1956
1957
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
1958
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
1959
1960
build.add(temp1, rClosure, uint16_t(offsetof(Closure, l.uprefs) + sizeof(TValue) * vmUpvalueOp(OP_A(inst))));
1961
1962
// uprefs[] is either an actual value, or it points to UpVal object which has a pointer to value
1963
Label skip;
1964
build.ldr(temp2, mem(temp1, offsetof(TValue, tt)));
1965
build.cmp(temp2, uint16_t(LUA_TUPVAL));
1966
build.b(ConditionA64::NotEqual, skip);
1967
1968
// UpVal.v points to the value (either on stack, or on heap inside each UpVal, but we can deref it unconditionally)
1969
build.ldr(temp1, mem(temp1, offsetof(TValue, value.gc)));
1970
build.ldr(temp1, mem(temp1, offsetof(UpVal, v)));
1971
1972
build.setLabel(skip);
1973
1974
build.ldr(inst.regA64, temp1);
1975
break;
1976
}
1977
case IrCmd::SET_UPVALUE:
1978
{
1979
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
1980
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
1981
1982
// UpVal*
1983
build.ldr(temp1, mem(rClosure, offsetof(Closure, l.uprefs) + sizeof(TValue) * vmUpvalueOp(OP_A(inst)) + offsetof(TValue, value.gc)));
1984
1985
build.ldr(temp2, mem(temp1, offsetof(UpVal, v)));
1986
build.str(regOp(OP_B(inst)), temp2);
1987
1988
if (OP_C(inst).kind == IrOpKind::Undef || isGCO(tagOp(OP_C(inst))))
1989
{
1990
RegisterA64 value = regOp(OP_B(inst));
1991
1992
Label skip;
1993
checkObjectBarrierConditions(temp1, temp2, value, OP_B(inst), OP_C(inst).kind == IrOpKind::Undef ? -1 : tagOp(OP_C(inst)), skip);
1994
1995
size_t spills = regs.spill(index, {temp1, value});
1996
1997
build.mov(x1, temp1);
1998
build.mov(x0, rState);
1999
build.fmov(x2, castReg(KindA64::d, value));
2000
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaC_barrierf)));
2001
build.blr(x3);
2002
2003
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2004
2005
// note: no emitUpdateBase necessary because luaC_ barriers do not reallocate stack
2006
build.setLabel(skip);
2007
}
2008
break;
2009
}
2010
case IrCmd::CHECK_TAG:
2011
{
2012
Label fresh; // used when guard aborts execution or jumps to a VM exit
2013
Label& fail = getTargetLabel(OP_C(inst), fresh);
2014
2015
if (tagOp(OP_B(inst)) == 0)
2016
{
2017
build.cbnz(regOp(OP_A(inst)), fail);
2018
}
2019
else
2020
{
2021
build.cmp(regOp(OP_A(inst)), uint16_t(tagOp(OP_B(inst))));
2022
build.b(ConditionA64::NotEqual, fail);
2023
}
2024
2025
finalizeTargetLabel(OP_C(inst), fresh);
2026
break;
2027
}
2028
case IrCmd::CHECK_TRUTHY:
2029
{
2030
// Constant tags which don't require boolean value check should've been removed in constant folding
2031
CODEGEN_ASSERT(OP_A(inst).kind != IrOpKind::Constant || tagOp(OP_A(inst)) == LUA_TBOOLEAN);
2032
2033
Label fresh; // used when guard aborts execution or jumps to a VM exit
2034
Label& target = getTargetLabel(OP_C(inst), fresh);
2035
2036
Label skip;
2037
2038
if (OP_A(inst).kind != IrOpKind::Constant)
2039
{
2040
// fail to fallback on 'nil' (falsy)
2041
CODEGEN_ASSERT(LUA_TNIL == 0);
2042
build.cbz(regOp(OP_A(inst)), target);
2043
2044
// skip value test if it's not a boolean (truthy)
2045
build.cmp(regOp(OP_A(inst)), uint16_t(LUA_TBOOLEAN));
2046
build.b(ConditionA64::NotEqual, skip);
2047
}
2048
2049
// fail to fallback on 'false' boolean value (falsy)
2050
if (OP_B(inst).kind != IrOpKind::Constant)
2051
{
2052
build.cbz(regOp(OP_B(inst)), target);
2053
}
2054
else
2055
{
2056
if (intOp(OP_B(inst)) == 0)
2057
build.b(target);
2058
}
2059
2060
if (OP_A(inst).kind != IrOpKind::Constant)
2061
build.setLabel(skip);
2062
2063
finalizeTargetLabel(OP_C(inst), fresh);
2064
break;
2065
}
2066
case IrCmd::CHECK_READONLY:
2067
{
2068
Label fresh; // used when guard aborts execution or jumps to a VM exit
2069
RegisterA64 temp = regs.allocTemp(KindA64::w);
2070
build.ldrb(temp, mem(regOp(OP_A(inst)), offsetof(LuaTable, readonly)));
2071
build.cbnz(temp, getTargetLabel(OP_B(inst), fresh));
2072
finalizeTargetLabel(OP_B(inst), fresh);
2073
break;
2074
}
2075
case IrCmd::CHECK_NO_METATABLE:
2076
{
2077
Label fresh; // used when guard aborts execution or jumps to a VM exit
2078
RegisterA64 temp = regs.allocTemp(KindA64::x);
2079
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(LuaTable, metatable)));
2080
build.cbnz(temp, getTargetLabel(OP_B(inst), fresh));
2081
finalizeTargetLabel(OP_B(inst), fresh);
2082
break;
2083
}
2084
case IrCmd::CHECK_SAFE_ENV:
2085
{
2086
if (FFlag::LuauCodegenBlockSafeEnv)
2087
{
2088
checkSafeEnv(OP_A(inst), next);
2089
}
2090
else
2091
{
2092
Label fresh; // used when guard aborts execution or jumps to a VM exit
2093
RegisterA64 temp = regs.allocTemp(KindA64::x);
2094
RegisterA64 tempw = castReg(KindA64::w, temp);
2095
build.ldr(temp, mem(rClosure, offsetof(Closure, env)));
2096
build.ldrb(tempw, mem(temp, offsetof(LuaTable, safeenv)));
2097
build.cbz(tempw, getTargetLabel(OP_A(inst), fresh));
2098
finalizeTargetLabel(OP_A(inst), fresh);
2099
}
2100
break;
2101
}
2102
case IrCmd::CHECK_ARRAY_SIZE:
2103
{
2104
Label fresh; // used when guard aborts execution or jumps to a VM exit
2105
Label& fail = getTargetLabel(OP_C(inst), fresh);
2106
2107
RegisterA64 temp = regs.allocTemp(KindA64::w);
2108
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(LuaTable, sizearray)));
2109
2110
if (OP_B(inst).kind == IrOpKind::Inst)
2111
{
2112
build.cmp(temp, regOp(OP_B(inst)));
2113
build.b(ConditionA64::UnsignedLessEqual, fail);
2114
}
2115
else if (OP_B(inst).kind == IrOpKind::Constant)
2116
{
2117
if (intOp(OP_B(inst)) == 0)
2118
{
2119
build.cbz(temp, fail);
2120
}
2121
else if (size_t(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate)
2122
{
2123
build.cmp(temp, uint16_t(intOp(OP_B(inst))));
2124
build.b(ConditionA64::UnsignedLessEqual, fail);
2125
}
2126
else
2127
{
2128
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
2129
build.mov(temp2, intOp(OP_B(inst)));
2130
build.cmp(temp, temp2);
2131
build.b(ConditionA64::UnsignedLessEqual, fail);
2132
}
2133
}
2134
else
2135
CODEGEN_ASSERT(!"Unsupported instruction form");
2136
2137
finalizeTargetLabel(OP_C(inst), fresh);
2138
break;
2139
}
2140
case IrCmd::JUMP_SLOT_MATCH:
2141
case IrCmd::CHECK_SLOT_MATCH:
2142
{
2143
Label abort; // used when guard aborts execution
2144
const IrOp& mismatchOp = inst.cmd == IrCmd::JUMP_SLOT_MATCH ? OP_D(inst) : OP_C(inst);
2145
Label& mismatch = mismatchOp.kind == IrOpKind::Undef ? abort : labelOp(mismatchOp);
2146
2147
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
2148
RegisterA64 temp1w = castReg(KindA64::w, temp1);
2149
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
2150
2151
static_assert(offsetof(LuaNode, key.value) == offsetof(LuaNode, key) && kOffsetOfTKeyTagNext >= 8 && kOffsetOfTKeyTagNext < 16);
2152
build.ldp(
2153
temp1, temp2, mem(regOp(OP_A(inst)), offsetof(LuaNode, key))
2154
); // load key.value into temp1 and key.tt (alongside other bits) into temp2
2155
build.ubfx(temp2, temp2, (kOffsetOfTKeyTagNext - 8) * 8, kTKeyTagBits); // .tt is right before .next, and 8 bytes are skipped by ldp
2156
build.cmp(temp2, uint16_t(LUA_TSTRING));
2157
build.b(ConditionA64::NotEqual, mismatch);
2158
2159
AddressA64 addr = tempAddr(OP_B(inst), offsetof(TValue, value));
2160
build.ldr(temp2, addr);
2161
build.cmp(temp1, temp2);
2162
build.b(ConditionA64::NotEqual, mismatch);
2163
2164
build.ldr(temp1w, mem(regOp(OP_A(inst)), offsetof(LuaNode, val.tt)));
2165
CODEGEN_ASSERT(LUA_TNIL == 0);
2166
build.cbz(temp1w, mismatch);
2167
2168
if (inst.cmd == IrCmd::JUMP_SLOT_MATCH)
2169
jumpOrFallthrough(blockOp(OP_C(inst)), next);
2170
else if (abort.id)
2171
emitAbort(build, abort);
2172
break;
2173
}
2174
case IrCmd::CHECK_NODE_NO_NEXT:
2175
{
2176
Label fresh; // used when guard aborts execution or jumps to a VM exit
2177
RegisterA64 temp = regs.allocTemp(KindA64::w);
2178
2179
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(LuaNode, key) + kOffsetOfTKeyTagNext));
2180
build.lsr(temp, temp, kTKeyTagBits);
2181
build.cbnz(temp, getTargetLabel(OP_B(inst), fresh));
2182
finalizeTargetLabel(OP_B(inst), fresh);
2183
break;
2184
}
2185
case IrCmd::CHECK_NODE_VALUE:
2186
{
2187
Label fresh; // used when guard aborts execution or jumps to a VM exit
2188
RegisterA64 temp = regs.allocTemp(KindA64::w);
2189
2190
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(LuaNode, val.tt)));
2191
CODEGEN_ASSERT(LUA_TNIL == 0);
2192
build.cbz(temp, getTargetLabel(OP_B(inst), fresh));
2193
finalizeTargetLabel(OP_B(inst), fresh);
2194
break;
2195
}
2196
case IrCmd::CHECK_BUFFER_LEN:
2197
{
2198
if (FFlag::LuauCodegenBufferRangeMerge4)
2199
{
2200
int minOffset = intOp(OP_C(inst));
2201
int maxOffset = intOp(OP_D(inst));
2202
CODEGEN_ASSERT(minOffset < maxOffset);
2203
CODEGEN_ASSERT(minOffset >= -int(AssemblyBuilderA64::kMaxImmediate) && minOffset <= int(AssemblyBuilderA64::kMaxImmediate));
2204
2205
int accessSize = maxOffset - minOffset;
2206
CODEGEN_ASSERT(accessSize > 0 && accessSize <= int(AssemblyBuilderA64::kMaxImmediate));
2207
2208
Label fresh; // used when guard aborts execution or jumps to a VM exit
2209
Label& target = getTargetLabel(OP_F(inst), fresh);
2210
2211
// Check if we are acting not only as a guard for the size, but as a guard that offset represents an exact integer
2212
if (OP_E(inst).kind != IrOpKind::Undef)
2213
{
2214
CODEGEN_ASSERT(getCmdValueKind(function.instOp(OP_B(inst)).cmd) == IrValueKind::Int);
2215
CODEGEN_ASSERT(!producesDirtyHighRegisterBits(function.instOp(OP_B(inst)).cmd)); // Ensure that high register bits are cleared
2216
2217
if ((build.features & Feature_JSCVT) != 0)
2218
{
2219
RegisterA64 temp = regs.allocTemp(KindA64::w);
2220
2221
build.fjcvtzs(temp, regOp(OP_E(inst))); // fjcvtzs sets PSTATE.Z (equal) iff conversion is exact
2222
build.b(ConditionA64::NotEqual, target);
2223
}
2224
else
2225
{
2226
RegisterA64 temp = regs.allocTemp(KindA64::d);
2227
2228
build.scvtf(temp, regOp(OP_B(inst)));
2229
build.fcmp(regOp(OP_E(inst)), temp);
2230
build.b(ConditionA64::NotEqual, target);
2231
}
2232
}
2233
2234
RegisterA64 temp = regs.allocTemp(KindA64::w);
2235
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(Buffer, len)));
2236
2237
if (OP_B(inst).kind == IrOpKind::Inst)
2238
{
2239
CODEGEN_ASSERT(!producesDirtyHighRegisterBits(function.instOp(OP_B(inst)).cmd)); // Ensure that high register bits are cleared
2240
2241
if (accessSize == 1 && minOffset == 0)
2242
{
2243
// fails if offset >= len
2244
build.cmp(temp, regOp(OP_B(inst)));
2245
build.b(ConditionA64::UnsignedLessEqual, target);
2246
}
2247
else if (minOffset >= 0 && maxOffset <= int(AssemblyBuilderA64::kMaxImmediate))
2248
{
2249
// fails if offset + size > len; we compute it as len - offset < size
2250
RegisterA64 tempx = castReg(KindA64::x, temp);
2251
build.sub(tempx, tempx, regOp(OP_B(inst))); // implicit uxtw
2252
build.cmp(tempx, uint16_t(maxOffset));
2253
build.b(ConditionA64::Less, target); // note: this is a signed 64-bit comparison so that out of bounds offset fails
2254
}
2255
else
2256
{
2257
RegisterA64 tempx = castReg(KindA64::x, temp);
2258
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
2259
2260
// Get the base offset in 32 bits
2261
if (minOffset >= 0)
2262
build.add(castReg(KindA64::w, temp2), regOp(OP_B(inst)), uint16_t(minOffset));
2263
else
2264
build.sub(castReg(KindA64::w, temp2), regOp(OP_B(inst)), uint16_t(-minOffset));
2265
2266
// fail if uint64_t(uint32_t(offset + minOffset)) + accessSize > length
2267
build.add(temp2, temp2, uint16_t(accessSize));
2268
build.cmp(temp2, tempx);
2269
build.b(ConditionA64::UnsignedGreater, target);
2270
}
2271
}
2272
else if (OP_B(inst).kind == IrOpKind::Constant)
2273
{
2274
int offset = intOp(OP_B(inst));
2275
2276
// Constant folding can take care of it, but for safety we avoid overflow/underflow cases here
2277
if (offset < 0 || unsigned(offset) + unsigned(accessSize) >= unsigned(INT_MAX))
2278
{
2279
build.b(target);
2280
}
2281
else if (offset + accessSize <= int(AssemblyBuilderA64::kMaxImmediate))
2282
{
2283
build.cmp(temp, uint16_t(offset + accessSize));
2284
build.b(ConditionA64::UnsignedLessEqual, target);
2285
}
2286
else
2287
{
2288
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
2289
build.mov(temp2, offset + accessSize);
2290
build.cmp(temp, temp2);
2291
build.b(ConditionA64::UnsignedLessEqual, target);
2292
}
2293
}
2294
else
2295
{
2296
CODEGEN_ASSERT(!"Unsupported instruction form");
2297
}
2298
finalizeTargetLabel(OP_F(inst), fresh);
2299
}
2300
else
2301
{
2302
int accessSize = intOp(OP_C(inst));
2303
CODEGEN_ASSERT(accessSize > 0 && accessSize <= int(AssemblyBuilderA64::kMaxImmediate));
2304
2305
Label fresh; // used when guard aborts execution or jumps to a VM exit
2306
Label& target = getTargetLabel(OP_D(inst), fresh);
2307
2308
RegisterA64 temp = regs.allocTemp(KindA64::w);
2309
build.ldr(temp, mem(regOp(OP_A(inst)), offsetof(Buffer, len)));
2310
2311
if (OP_B(inst).kind == IrOpKind::Inst)
2312
{
2313
CODEGEN_ASSERT(!producesDirtyHighRegisterBits(function.instOp(OP_B(inst)).cmd)); // Ensure that high register bits are cleared
2314
2315
if (accessSize == 1)
2316
{
2317
// fails if offset >= len
2318
build.cmp(temp, regOp(OP_B(inst)));
2319
build.b(ConditionA64::UnsignedLessEqual, target);
2320
}
2321
else
2322
{
2323
// fails if offset + size > len; we compute it as len - offset < size
2324
RegisterA64 tempx = castReg(KindA64::x, temp);
2325
build.sub(tempx, tempx, regOp(OP_B(inst))); // implicit uxtw
2326
build.cmp(tempx, uint16_t(accessSize));
2327
build.b(ConditionA64::Less, target); // note: this is a signed 64-bit comparison so that out of bounds offset fails
2328
}
2329
}
2330
else if (OP_B(inst).kind == IrOpKind::Constant)
2331
{
2332
int offset = intOp(OP_B(inst));
2333
2334
// Constant folding can take care of it, but for safety we avoid overflow/underflow cases here
2335
if (offset < 0 || unsigned(offset) + unsigned(accessSize) >= unsigned(INT_MAX))
2336
{
2337
build.b(target);
2338
}
2339
else if (offset + accessSize <= int(AssemblyBuilderA64::kMaxImmediate))
2340
{
2341
build.cmp(temp, uint16_t(offset + accessSize));
2342
build.b(ConditionA64::UnsignedLessEqual, target);
2343
}
2344
else
2345
{
2346
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
2347
build.mov(temp2, offset + accessSize);
2348
build.cmp(temp, temp2);
2349
build.b(ConditionA64::UnsignedLessEqual, target);
2350
}
2351
}
2352
else
2353
{
2354
CODEGEN_ASSERT(!"Unsupported instruction form");
2355
}
2356
finalizeTargetLabel(OP_D(inst), fresh);
2357
}
2358
break;
2359
}
2360
case IrCmd::CHECK_USERDATA_TAG:
2361
{
2362
CODEGEN_ASSERT(unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate);
2363
2364
Label fresh; // used when guard aborts execution or jumps to a VM exit
2365
Label& fail = getTargetLabel(OP_C(inst), fresh);
2366
RegisterA64 temp = regs.allocTemp(KindA64::w);
2367
build.ldrb(temp, mem(regOp(OP_A(inst)), offsetof(Udata, tag)));
2368
build.cmp(temp, uint16_t(intOp(OP_B(inst))));
2369
build.b(ConditionA64::NotEqual, fail);
2370
finalizeTargetLabel(OP_C(inst), fresh);
2371
break;
2372
}
2373
case IrCmd::CHECK_CMP_INT:
2374
{
2375
IrCondition cond = conditionOp(OP_C(inst));
2376
2377
Label fresh; // used when guard aborts execution or jumps to a VM exit
2378
Label& fail = getTargetLabel(OP_D(inst), fresh);
2379
2380
if (cond == IrCondition::Equal && intOp(OP_B(inst)) == 0)
2381
{
2382
build.cbnz(regOp(OP_A(inst)), fail);
2383
}
2384
else if (cond == IrCondition::NotEqual && intOp(OP_B(inst)) == 0)
2385
{
2386
build.cbz(regOp(OP_A(inst)), fail);
2387
}
2388
else
2389
{
2390
RegisterA64 tempA = tempInt(OP_A(inst));
2391
2392
if (OP_B(inst).kind == IrOpKind::Constant && unsigned(intOp(OP_B(inst))) <= AssemblyBuilderA64::kMaxImmediate)
2393
build.cmp(tempA, uint16_t(intOp(OP_B(inst))));
2394
else
2395
build.cmp(tempA, tempInt(OP_B(inst)));
2396
2397
build.b(getConditionInt(getNegatedCondition(cond)), fail);
2398
}
2399
finalizeTargetLabel(OP_D(inst), fresh);
2400
break;
2401
}
2402
case IrCmd::INTERRUPT:
2403
{
2404
regs.spill(index);
2405
2406
Label self;
2407
2408
build.ldr(x0, mem(rGlobalState, offsetof(global_State, cb.interrupt)));
2409
build.cbnz(x0, self);
2410
2411
Label next = build.setLabel();
2412
2413
interruptHandlers.push_back({self, uintOp(OP_A(inst)), next});
2414
break;
2415
}
2416
case IrCmd::CHECK_GC:
2417
{
2418
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
2419
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
2420
2421
static_assert(offsetof(global_State, totalbytes) == offsetof(global_State, GCthreshold) + sizeof(global_State::GCthreshold));
2422
Label skip;
2423
build.ldp(temp1, temp2, mem(rGlobalState, offsetof(global_State, GCthreshold)));
2424
build.cmp(temp1, temp2);
2425
build.b(ConditionA64::UnsignedGreater, skip);
2426
2427
size_t spills = regs.spill(index);
2428
2429
build.mov(x0, rState);
2430
build.mov(w1, 1);
2431
build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaC_step)));
2432
build.blr(x2);
2433
2434
emitUpdateBase(build);
2435
2436
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2437
2438
build.setLabel(skip);
2439
break;
2440
}
2441
case IrCmd::BARRIER_OBJ:
2442
{
2443
RegisterA64 temp = regs.allocTemp(KindA64::x);
2444
2445
Label skip;
2446
checkObjectBarrierConditions(regOp(OP_A(inst)), temp, noreg, OP_B(inst), OP_C(inst).kind == IrOpKind::Undef ? -1 : tagOp(OP_C(inst)), skip);
2447
2448
RegisterA64 reg = regOp(OP_A(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
2449
size_t spills = regs.spill(index, {reg});
2450
build.mov(x1, reg);
2451
build.mov(x0, rState);
2452
build.ldr(x2, mem(rBase, vmRegOp(OP_B(inst)) * sizeof(TValue) + offsetof(TValue, value)));
2453
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaC_barrierf)));
2454
build.blr(x3);
2455
2456
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2457
2458
// note: no emitUpdateBase necessary because luaC_ barriers do not reallocate stack
2459
build.setLabel(skip);
2460
break;
2461
}
2462
case IrCmd::BARRIER_TABLE_BACK:
2463
{
2464
Label skip;
2465
RegisterA64 temp = regs.allocTemp(KindA64::w);
2466
2467
// isblack(obj2gco(t))
2468
build.ldrb(temp, mem(regOp(OP_A(inst)), offsetof(GCheader, marked)));
2469
build.tbz(temp, BLACKBIT, skip);
2470
2471
RegisterA64 reg = regOp(OP_A(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
2472
size_t spills = regs.spill(index, {reg});
2473
build.mov(x1, reg);
2474
build.mov(x0, rState);
2475
build.add(x2, x1, uint16_t(offsetof(LuaTable, gclist)));
2476
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaC_barrierback)));
2477
build.blr(x3);
2478
2479
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2480
2481
// note: no emitUpdateBase necessary because luaC_ barriers do not reallocate stack
2482
build.setLabel(skip);
2483
break;
2484
}
2485
case IrCmd::BARRIER_TABLE_FORWARD:
2486
{
2487
RegisterA64 temp = regs.allocTemp(KindA64::x);
2488
2489
Label skip;
2490
checkObjectBarrierConditions(regOp(OP_A(inst)), temp, noreg, OP_B(inst), OP_C(inst).kind == IrOpKind::Undef ? -1 : tagOp(OP_C(inst)), skip);
2491
2492
RegisterA64 reg = regOp(OP_A(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
2493
AddressA64 addr = tempAddr(OP_B(inst), offsetof(TValue, value));
2494
size_t spills = regs.spill(index, {reg});
2495
build.mov(x1, reg);
2496
build.mov(x0, rState);
2497
build.ldr(x2, addr);
2498
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaC_barriertable)));
2499
build.blr(x3);
2500
2501
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2502
2503
// note: no emitUpdateBase necessary because luaC_ barriers do not reallocate stack
2504
build.setLabel(skip);
2505
break;
2506
}
2507
case IrCmd::SET_SAVEDPC:
2508
{
2509
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
2510
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
2511
2512
emitAddOffset(build, temp1, rCode, uintOp(OP_A(inst)) * sizeof(Instruction));
2513
build.ldr(temp2, mem(rState, offsetof(lua_State, ci)));
2514
build.str(temp1, mem(temp2, offsetof(CallInfo, savedpc)));
2515
break;
2516
}
2517
case IrCmd::CLOSE_UPVALS:
2518
{
2519
Label skip;
2520
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
2521
RegisterA64 temp2 = regs.allocTemp(KindA64::x);
2522
2523
// L->openupval != 0
2524
build.ldr(temp1, mem(rState, offsetof(lua_State, openupval)));
2525
build.cbz(temp1, skip);
2526
2527
// ra <= L->openupval->v
2528
build.ldr(temp1, mem(temp1, offsetof(UpVal, v)));
2529
build.add(temp2, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
2530
build.cmp(temp2, temp1);
2531
build.b(ConditionA64::UnsignedGreater, skip);
2532
2533
size_t spills = regs.spill(index, {temp2});
2534
build.mov(x1, temp2);
2535
build.mov(x0, rState);
2536
build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaF_close)));
2537
build.blr(x2);
2538
2539
regs.restore(spills); // need to restore before skip so that registers are in a consistent state
2540
2541
build.setLabel(skip);
2542
break;
2543
}
2544
case IrCmd::CAPTURE:
2545
// no-op
2546
break;
2547
case IrCmd::SETLIST:
2548
regs.spill(index);
2549
emitFallback(build, offsetof(NativeContext, executeSETLIST), uintOp(OP_A(inst)));
2550
break;
2551
case IrCmd::CALL:
2552
regs.spill(index);
2553
// argtop = (nparams == LUA_MULTRET) ? L->top : ra + 1 + nparams;
2554
if (intOp(OP_B(inst)) == LUA_MULTRET)
2555
build.ldr(x2, mem(rState, offsetof(lua_State, top)));
2556
else
2557
build.add(x2, rBase, uint16_t((vmRegOp(OP_A(inst)) + 1 + intOp(OP_B(inst))) * sizeof(TValue)));
2558
2559
// callFallback(L, ra, argtop, nresults)
2560
build.mov(x0, rState);
2561
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
2562
build.mov(w3, intOp(OP_C(inst)));
2563
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, callFallback)));
2564
build.blr(x4);
2565
2566
emitUpdateBase(build);
2567
2568
// reentry with x0=closure (NULL implies C function; CALL_FALLBACK_YIELD will trigger exit)
2569
build.cbnz(x0, helpers.continueCall);
2570
break;
2571
case IrCmd::RETURN:
2572
regs.spill(index);
2573
2574
if (function.variadic)
2575
{
2576
build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
2577
build.ldr(x1, mem(x1, offsetof(CallInfo, func)));
2578
}
2579
else if (intOp(OP_B(inst)) != 1)
2580
build.sub(x1, rBase, uint16_t(sizeof(TValue))); // invariant: ci->func + 1 == ci->base for non-variadic frames
2581
2582
if (intOp(OP_B(inst)) == 0)
2583
{
2584
build.mov(w2, 0);
2585
build.b(helpers.return_);
2586
}
2587
else if (intOp(OP_B(inst)) == 1 && !function.variadic)
2588
{
2589
// fast path: minimizes x1 adjustments
2590
// note that we skipped x1 computation for this specific case above
2591
build.ldr(q0, mem(rBase, vmRegOp(OP_A(inst)) * sizeof(TValue)));
2592
build.str(q0, mem(rBase, -int(sizeof(TValue))));
2593
build.mov(x1, rBase);
2594
build.mov(w2, 1);
2595
build.b(helpers.return_);
2596
}
2597
else if (intOp(OP_B(inst)) >= 1 && intOp(OP_B(inst)) <= 3)
2598
{
2599
for (int r = 0; r < intOp(OP_B(inst)); ++r)
2600
{
2601
build.ldr(q0, mem(rBase, (vmRegOp(OP_A(inst)) + r) * sizeof(TValue)));
2602
build.str(q0, mem(x1, sizeof(TValue), AddressKindA64::post));
2603
}
2604
build.mov(w2, intOp(OP_B(inst)));
2605
build.b(helpers.return_);
2606
}
2607
else
2608
{
2609
build.mov(w2, 0);
2610
2611
// vali = ra
2612
build.add(x3, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
2613
2614
// valend = (n == LUA_MULTRET) ? L->top : ra + n
2615
if (intOp(OP_B(inst)) == LUA_MULTRET)
2616
build.ldr(x4, mem(rState, offsetof(lua_State, top)));
2617
else
2618
build.add(x4, rBase, uint16_t((vmRegOp(OP_A(inst)) + intOp(OP_B(inst))) * sizeof(TValue)));
2619
2620
Label repeatValueLoop, exitValueLoop;
2621
2622
if (intOp(OP_B(inst)) == LUA_MULTRET)
2623
{
2624
build.cmp(x3, x4);
2625
build.b(ConditionA64::CarrySet, exitValueLoop); // CarrySet == UnsignedGreaterEqual
2626
}
2627
2628
build.setLabel(repeatValueLoop);
2629
build.ldr(q0, mem(x3, sizeof(TValue), AddressKindA64::post));
2630
build.str(q0, mem(x1, sizeof(TValue), AddressKindA64::post));
2631
build.add(w2, w2, uint16_t(1));
2632
build.cmp(x3, x4);
2633
build.b(ConditionA64::CarryClear, repeatValueLoop); // CarryClear == UnsignedLess
2634
2635
build.setLabel(exitValueLoop);
2636
build.b(helpers.return_);
2637
}
2638
break;
2639
case IrCmd::FORGLOOP:
2640
// register layout: ra + 1 = table, ra + 2 = internal index, ra + 3 .. ra + aux = iteration variables
2641
regs.spill(index);
2642
// clear extra variables since we might have more than two
2643
if (intOp(OP_B(inst)) > 2)
2644
{
2645
CODEGEN_ASSERT(LUA_TNIL == 0);
2646
for (int i = 2; i < intOp(OP_B(inst)); ++i)
2647
build.str(wzr, mem(rBase, (vmRegOp(OP_A(inst)) + 3 + i) * sizeof(TValue) + offsetof(TValue, tt)));
2648
}
2649
// we use full iter fallback for now; in the future it could be worthwhile to accelerate array iteration here
2650
build.mov(x0, rState);
2651
build.ldr(x1, mem(rBase, (vmRegOp(OP_A(inst)) + 1) * sizeof(TValue) + offsetof(TValue, value.gc)));
2652
build.ldr(w2, mem(rBase, (vmRegOp(OP_A(inst)) + 2) * sizeof(TValue) + offsetof(TValue, value.p)));
2653
build.add(x3, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
2654
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, forgLoopTableIter)));
2655
build.blr(x4);
2656
// note: no emitUpdateBase necessary because forgLoopTableIter does not reallocate stack
2657
build.cbnz(w0, labelOp(OP_C(inst)));
2658
jumpOrFallthrough(blockOp(OP_D(inst)), next);
2659
break;
2660
case IrCmd::FORGLOOP_FALLBACK:
2661
regs.spill(index);
2662
build.mov(x0, rState);
2663
build.mov(w1, vmRegOp(OP_A(inst)));
2664
build.mov(w2, intOp(OP_B(inst)));
2665
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgLoopNonTableFallback)));
2666
build.blr(x3);
2667
emitUpdateBase(build);
2668
build.cbnz(w0, labelOp(OP_C(inst)));
2669
jumpOrFallthrough(blockOp(OP_D(inst)), next);
2670
break;
2671
case IrCmd::FORGPREP_XNEXT_FALLBACK:
2672
regs.spill(index);
2673
build.mov(x0, rState);
2674
build.add(x1, rBase, uint16_t(vmRegOp(OP_B(inst)) * sizeof(TValue)));
2675
build.mov(w2, uintOp(OP_A(inst)) + 1);
2676
build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgPrepXnextFallback)));
2677
build.blr(x3);
2678
// note: no emitUpdateBase necessary because forgLoopNonTableFallback does not reallocate stack
2679
jumpOrFallthrough(blockOp(OP_C(inst)), next);
2680
break;
2681
case IrCmd::COVERAGE:
2682
{
2683
RegisterA64 temp1 = regs.allocTemp(KindA64::x);
2684
RegisterA64 temp2 = regs.allocTemp(KindA64::w);
2685
RegisterA64 temp3 = regs.allocTemp(KindA64::w);
2686
2687
build.mov(temp1, uintOp(OP_A(inst)) * sizeof(Instruction));
2688
build.ldr(temp2, mem(rCode, temp1));
2689
2690
// increments E (high 24 bits); if the result overflows a 23-bit counter, high bit becomes 1
2691
// note: cmp can be eliminated with adds but we aren't concerned with code size for coverage
2692
build.add(temp3, temp2, uint16_t(256));
2693
build.cmp(temp3, uint16_t(0));
2694
build.csel(temp2, temp2, temp3, ConditionA64::Less);
2695
2696
build.str(temp2, mem(rCode, temp1));
2697
break;
2698
}
2699
2700
// Full instruction fallbacks
2701
case IrCmd::FALLBACK_GETGLOBAL:
2702
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2703
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmConst);
2704
2705
regs.spill(index);
2706
emitFallback(build, offsetof(NativeContext, executeGETGLOBAL), uintOp(OP_A(inst)));
2707
break;
2708
case IrCmd::FALLBACK_SETGLOBAL:
2709
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2710
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmConst);
2711
2712
regs.spill(index);
2713
emitFallback(build, offsetof(NativeContext, executeSETGLOBAL), uintOp(OP_A(inst)));
2714
break;
2715
case IrCmd::FALLBACK_GETTABLEKS:
2716
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2717
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmReg);
2718
CODEGEN_ASSERT(OP_D(inst).kind == IrOpKind::VmConst);
2719
2720
regs.spill(index);
2721
emitFallback(build, offsetof(NativeContext, executeGETTABLEKS), uintOp(OP_A(inst)));
2722
break;
2723
case IrCmd::FALLBACK_SETTABLEKS:
2724
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2725
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmReg);
2726
CODEGEN_ASSERT(OP_D(inst).kind == IrOpKind::VmConst);
2727
2728
regs.spill(index);
2729
emitFallback(build, offsetof(NativeContext, executeSETTABLEKS), uintOp(OP_A(inst)));
2730
break;
2731
case IrCmd::FALLBACK_NAMECALL:
2732
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2733
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmReg);
2734
CODEGEN_ASSERT(OP_D(inst).kind == IrOpKind::VmConst);
2735
2736
regs.spill(index);
2737
emitFallback(build, offsetof(NativeContext, executeNAMECALL), uintOp(OP_A(inst)));
2738
break;
2739
case IrCmd::FALLBACK_PREPVARARGS:
2740
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::Constant);
2741
2742
regs.spill(index);
2743
emitFallback(build, offsetof(NativeContext, executePREPVARARGS), uintOp(OP_A(inst)));
2744
break;
2745
case IrCmd::FALLBACK_GETVARARGS:
2746
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2747
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::Constant);
2748
2749
regs.spill(index);
2750
build.mov(x0, rState);
2751
2752
if (intOp(OP_C(inst)) == LUA_MULTRET)
2753
{
2754
emitAddOffset(build, x1, rCode, uintOp(OP_A(inst)) * sizeof(Instruction));
2755
build.mov(x2, rBase);
2756
build.mov(w3, vmRegOp(OP_B(inst)));
2757
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, executeGETVARARGSMultRet)));
2758
build.blr(x4);
2759
2760
emitUpdateBase(build);
2761
}
2762
else
2763
{
2764
build.mov(x1, rBase);
2765
build.mov(w2, vmRegOp(OP_B(inst)));
2766
build.mov(w3, intOp(OP_C(inst)));
2767
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, executeGETVARARGSConst)));
2768
build.blr(x4);
2769
2770
// note: no emitUpdateBase necessary because executeGETVARARGSConst does not reallocate stack
2771
}
2772
break;
2773
case IrCmd::NEWCLOSURE:
2774
{
2775
RegisterA64 reg = regOp(OP_B(inst)); // note: we need to call regOp before spill so that we don't do redundant reloads
2776
2777
regs.spill(index, {reg});
2778
build.mov(x2, reg);
2779
2780
build.mov(x0, rState);
2781
build.mov(w1, uintOp(OP_A(inst)));
2782
2783
build.ldr(x3, mem(rClosure, offsetof(Closure, l.p)));
2784
build.ldr(x3, mem(x3, offsetof(Proto, p)));
2785
2786
unsigned protoIndex = uintOp(OP_C(inst)); // 0..32767
2787
int protoOffset = int(sizeof(Proto*) * protoIndex);
2788
2789
if (protoIndex <= AddressA64::kMaxOffset)
2790
{
2791
build.ldr(x3, mem(x3, protoOffset));
2792
}
2793
else
2794
{
2795
build.mov(x4, protoOffset);
2796
build.ldr(x3, mem(x3, x4));
2797
}
2798
2799
build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, luaF_newLclosure)));
2800
build.blr(x4);
2801
2802
inst.regA64 = regs.takeReg(x0, index);
2803
break;
2804
}
2805
case IrCmd::FALLBACK_DUPCLOSURE:
2806
CODEGEN_ASSERT(OP_B(inst).kind == IrOpKind::VmReg);
2807
CODEGEN_ASSERT(OP_C(inst).kind == IrOpKind::VmConst);
2808
2809
regs.spill(index);
2810
emitFallback(build, offsetof(NativeContext, executeDUPCLOSURE), uintOp(OP_A(inst)));
2811
break;
2812
case IrCmd::FALLBACK_FORGPREP:
2813
regs.spill(index);
2814
emitFallback(build, offsetof(NativeContext, executeFORGPREP), uintOp(OP_A(inst)));
2815
jumpOrFallthrough(blockOp(OP_C(inst)), next);
2816
break;
2817
2818
// Pseudo instructions
2819
case IrCmd::NOP:
2820
case IrCmd::SUBSTITUTE:
2821
case IrCmd::MARK_USED:
2822
case IrCmd::MARK_DEAD:
2823
CODEGEN_ASSERT(!"Pseudo instructions should not be lowered");
2824
break;
2825
2826
case IrCmd::BITAND_UINT:
2827
{
2828
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2829
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant &&
2830
AssemblyBuilderA64::isMaskSupported(unsigned(intOp(OP_B(inst)))))
2831
build.and_(inst.regA64, regOp(OP_A(inst)), unsigned(intOp(OP_B(inst))));
2832
else
2833
{
2834
RegisterA64 temp1 = tempUint(OP_A(inst));
2835
RegisterA64 temp2 = tempUint(OP_B(inst));
2836
build.and_(inst.regA64, temp1, temp2);
2837
}
2838
break;
2839
}
2840
case IrCmd::BITXOR_UINT:
2841
{
2842
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2843
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant &&
2844
AssemblyBuilderA64::isMaskSupported(unsigned(intOp(OP_B(inst)))))
2845
build.eor(inst.regA64, regOp(OP_A(inst)), unsigned(intOp(OP_B(inst))));
2846
else
2847
{
2848
RegisterA64 temp1 = tempUint(OP_A(inst));
2849
RegisterA64 temp2 = tempUint(OP_B(inst));
2850
build.eor(inst.regA64, temp1, temp2);
2851
}
2852
break;
2853
}
2854
case IrCmd::BITOR_UINT:
2855
{
2856
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2857
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant &&
2858
AssemblyBuilderA64::isMaskSupported(unsigned(intOp(OP_B(inst)))))
2859
build.orr(inst.regA64, regOp(OP_A(inst)), unsigned(intOp(OP_B(inst))));
2860
else
2861
{
2862
RegisterA64 temp1 = tempUint(OP_A(inst));
2863
RegisterA64 temp2 = tempUint(OP_B(inst));
2864
build.orr(inst.regA64, temp1, temp2);
2865
}
2866
break;
2867
}
2868
case IrCmd::BITNOT_UINT:
2869
{
2870
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
2871
RegisterA64 temp = tempUint(OP_A(inst));
2872
build.mvn_(inst.regA64, temp);
2873
break;
2874
}
2875
case IrCmd::BITLSHIFT_UINT:
2876
{
2877
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2878
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant)
2879
build.lsl(inst.regA64, regOp(OP_A(inst)), uint8_t(unsigned(intOp(OP_B(inst))) & 31));
2880
else
2881
{
2882
RegisterA64 temp1 = tempUint(OP_A(inst));
2883
RegisterA64 temp2 = tempUint(OP_B(inst));
2884
build.lsl(inst.regA64, temp1, temp2);
2885
}
2886
break;
2887
}
2888
case IrCmd::BITRSHIFT_UINT:
2889
{
2890
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2891
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant)
2892
build.lsr(inst.regA64, regOp(OP_A(inst)), uint8_t(unsigned(intOp(OP_B(inst))) & 31));
2893
else
2894
{
2895
RegisterA64 temp1 = tempUint(OP_A(inst));
2896
RegisterA64 temp2 = tempUint(OP_B(inst));
2897
build.lsr(inst.regA64, temp1, temp2);
2898
}
2899
break;
2900
}
2901
case IrCmd::BITARSHIFT_UINT:
2902
{
2903
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2904
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant)
2905
build.asr(inst.regA64, regOp(OP_A(inst)), uint8_t(unsigned(intOp(OP_B(inst))) & 31));
2906
else
2907
{
2908
RegisterA64 temp1 = tempUint(OP_A(inst));
2909
RegisterA64 temp2 = tempUint(OP_B(inst));
2910
build.asr(inst.regA64, temp1, temp2);
2911
}
2912
break;
2913
}
2914
case IrCmd::BITLROTATE_UINT:
2915
{
2916
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant)
2917
{
2918
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
2919
build.ror(inst.regA64, regOp(OP_A(inst)), uint8_t((32 - unsigned(intOp(OP_B(inst)))) & 31));
2920
}
2921
else
2922
{
2923
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)}); // can't reuse a because it would be clobbered by neg
2924
RegisterA64 temp1 = tempUint(OP_A(inst));
2925
RegisterA64 temp2 = tempUint(OP_B(inst));
2926
build.neg(inst.regA64, temp2);
2927
build.ror(inst.regA64, temp1, inst.regA64);
2928
}
2929
break;
2930
}
2931
case IrCmd::BITRROTATE_UINT:
2932
{
2933
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst), OP_B(inst)});
2934
if (OP_A(inst).kind == IrOpKind::Inst && OP_B(inst).kind == IrOpKind::Constant)
2935
build.ror(inst.regA64, regOp(OP_A(inst)), uint8_t(unsigned(intOp(OP_B(inst))) & 31));
2936
else
2937
{
2938
RegisterA64 temp1 = tempUint(OP_A(inst));
2939
RegisterA64 temp2 = tempUint(OP_B(inst));
2940
build.ror(inst.regA64, temp1, temp2);
2941
}
2942
break;
2943
}
2944
case IrCmd::BITCOUNTLZ_UINT:
2945
{
2946
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
2947
RegisterA64 temp = tempUint(OP_A(inst));
2948
build.clz(inst.regA64, temp);
2949
break;
2950
}
2951
case IrCmd::BITCOUNTRZ_UINT:
2952
{
2953
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
2954
RegisterA64 temp = tempUint(OP_A(inst));
2955
build.rbit(inst.regA64, temp);
2956
build.clz(inst.regA64, inst.regA64);
2957
break;
2958
}
2959
case IrCmd::BYTESWAP_UINT:
2960
{
2961
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_A(inst)});
2962
RegisterA64 temp = tempUint(OP_A(inst));
2963
build.rev(inst.regA64, temp);
2964
break;
2965
}
2966
case IrCmd::INVOKE_LIBM:
2967
{
2968
if (HAS_OP_C(inst))
2969
{
2970
bool isInt = (OP_C(inst).kind == IrOpKind::Constant) ? constOp(OP_C(inst)).kind == IrConstKind::Int
2971
: getCmdValueKind(function.instOp(OP_C(inst)).cmd) == IrValueKind::Int;
2972
2973
RegisterA64 temp1 = tempDouble(OP_B(inst));
2974
RegisterA64 temp2 = isInt ? tempInt(OP_C(inst)) : tempDouble(OP_C(inst));
2975
RegisterA64 temp3 = isInt ? noreg : regs.allocTemp(KindA64::d); // note: spill() frees all registers so we need to avoid alloc after spill
2976
regs.spill(index, {temp1, temp2});
2977
2978
if (isInt)
2979
{
2980
build.fmov(d0, temp1);
2981
build.mov(w0, temp2);
2982
}
2983
else if (d0 != temp2)
2984
{
2985
build.fmov(d0, temp1);
2986
build.fmov(d1, temp2);
2987
}
2988
else
2989
{
2990
build.fmov(temp3, d0);
2991
build.fmov(d0, temp1);
2992
build.fmov(d1, temp3);
2993
}
2994
}
2995
else
2996
{
2997
RegisterA64 temp1 = tempDouble(OP_B(inst));
2998
regs.spill(index, {temp1});
2999
build.fmov(d0, temp1);
3000
}
3001
3002
build.ldr(x1, mem(rNativeContext, getNativeContextOffset(uintOp(OP_A(inst)))));
3003
build.blr(x1);
3004
inst.regA64 = regs.takeReg(d0, index);
3005
break;
3006
}
3007
case IrCmd::GET_TYPE:
3008
{
3009
inst.regA64 = regs.allocReg(KindA64::x, index);
3010
3011
CODEGEN_ASSERT(sizeof(TString*) == 8);
3012
3013
if (OP_A(inst).kind == IrOpKind::Inst)
3014
build.add(inst.regA64, rGlobalState, regOp(OP_A(inst)), 3); // implicit uxtw
3015
else if (OP_A(inst).kind == IrOpKind::Constant)
3016
build.add(inst.regA64, rGlobalState, uint16_t(tagOp(OP_A(inst)) * 8));
3017
else
3018
CODEGEN_ASSERT(!"Unsupported instruction form");
3019
3020
build.ldr(inst.regA64, mem(inst.regA64, offsetof(global_State, ttname)));
3021
break;
3022
}
3023
case IrCmd::GET_TYPEOF:
3024
{
3025
regs.spill(index);
3026
build.mov(x0, rState);
3027
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
3028
build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaT_objtypenamestr)));
3029
build.blr(x2);
3030
3031
inst.regA64 = regs.takeReg(x0, index);
3032
break;
3033
}
3034
3035
case IrCmd::FINDUPVAL:
3036
{
3037
regs.spill(index);
3038
build.mov(x0, rState);
3039
build.add(x1, rBase, uint16_t(vmRegOp(OP_A(inst)) * sizeof(TValue)));
3040
build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaF_findupval)));
3041
build.blr(x2);
3042
3043
inst.regA64 = regs.takeReg(x0, index);
3044
break;
3045
}
3046
3047
case IrCmd::BUFFER_READI8:
3048
{
3049
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)});
3050
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3051
3052
build.ldrsb(inst.regA64, addr);
3053
break;
3054
}
3055
3056
case IrCmd::BUFFER_READU8:
3057
{
3058
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)});
3059
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3060
3061
build.ldrb(inst.regA64, addr);
3062
break;
3063
}
3064
3065
case IrCmd::BUFFER_WRITEI8:
3066
{
3067
RegisterA64 temp = tempInt(OP_C(inst));
3068
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_D(inst) ? LUA_TBUFFER : tagOp(OP_D(inst)));
3069
3070
build.strb(temp, addr);
3071
break;
3072
}
3073
3074
case IrCmd::BUFFER_READI16:
3075
{
3076
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)});
3077
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3078
3079
build.ldrsh(inst.regA64, addr);
3080
break;
3081
}
3082
3083
case IrCmd::BUFFER_READU16:
3084
{
3085
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)});
3086
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3087
3088
build.ldrh(inst.regA64, addr);
3089
break;
3090
}
3091
3092
case IrCmd::BUFFER_WRITEI16:
3093
{
3094
RegisterA64 temp = tempInt(OP_C(inst));
3095
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_D(inst) ? LUA_TBUFFER : tagOp(OP_D(inst)));
3096
3097
build.strh(temp, addr);
3098
break;
3099
}
3100
3101
case IrCmd::BUFFER_READI32:
3102
{
3103
inst.regA64 = regs.allocReuse(KindA64::w, index, {OP_B(inst)});
3104
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3105
3106
build.ldr(inst.regA64, addr);
3107
break;
3108
}
3109
3110
case IrCmd::BUFFER_WRITEI32:
3111
{
3112
RegisterA64 temp = tempInt(OP_C(inst));
3113
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_D(inst) ? LUA_TBUFFER : tagOp(OP_D(inst)));
3114
3115
build.str(temp, addr);
3116
break;
3117
}
3118
3119
case IrCmd::BUFFER_READF32:
3120
{
3121
inst.regA64 = regs.allocReg(KindA64::s, index);
3122
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3123
3124
build.ldr(inst.regA64, addr);
3125
break;
3126
}
3127
3128
case IrCmd::BUFFER_WRITEF32:
3129
{
3130
RegisterA64 temp = tempFloat(OP_C(inst));
3131
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_D(inst) ? LUA_TBUFFER : tagOp(OP_D(inst)));
3132
3133
build.str(temp, addr);
3134
break;
3135
}
3136
3137
case IrCmd::BUFFER_READF64:
3138
{
3139
inst.regA64 = regs.allocReg(KindA64::d, index);
3140
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_C(inst) ? LUA_TBUFFER : tagOp(OP_C(inst)));
3141
3142
build.ldr(inst.regA64, addr);
3143
break;
3144
}
3145
3146
case IrCmd::BUFFER_WRITEF64:
3147
{
3148
RegisterA64 temp = tempDouble(OP_C(inst));
3149
AddressA64 addr = tempAddrBuffer(OP_A(inst), OP_B(inst), !FFlag::LuauCodegenBufNoDefTag && !HAS_OP_D(inst) ? LUA_TBUFFER : tagOp(OP_D(inst)));
3150
3151
build.str(temp, addr);
3152
break;
3153
}
3154
3155
// To handle unsupported instructions, add "case IrCmd::OP" and make sure to set error = true!
3156
}
3157
3158
valueTracker.afterInstLowering(inst, index);
3159
3160
regs.currInstIdx = kInvalidInstIdx;
3161
3162
regs.freeLastUseRegs(inst, index);
3163
regs.freeTempRegs();
3164
}
3165
3166
void IrLoweringA64::startBlock(const IrBlock& curr)
3167
{
3168
if (curr.startpc != kBlockNoStartPc)
3169
allocAndIncrementCounterAt(
3170
curr.kind == IrBlockKind::Fallback ? CodeGenCounter::FallbackBlockExecuted : CodeGenCounter::RegularBlockExecuted, curr.startpc
3171
);
3172
}
3173
3174
void IrLoweringA64::finishBlock(const IrBlock& curr, const IrBlock& next)
3175
{
3176
if (!regs.spills.empty())
3177
{
3178
// If we have spills remaining, we have to immediately lower the successor block
3179
for (uint32_t predIdx : predecessors(function.cfg, function.getBlockIndex(next)))
3180
CODEGEN_ASSERT(predIdx == function.getBlockIndex(curr) || function.blocks[predIdx].kind == IrBlockKind::Dead);
3181
3182
// And the next block cannot be a join block in cfg
3183
CODEGEN_ASSERT(next.useCount == 1);
3184
}
3185
}
3186
3187
// Emits the out-of-line tails of the function (interrupt and VM-exit thunks),
// places the terminating undefined instruction, and records lowering statistics.
void IrLoweringA64::finishFunction()
{
    if (build.logText)
        build.logAppend("; interrupt handlers\n");

    // One thunk per interrupt point: pass the byte offset of the next instruction
    // in x0 and the resume address in x1, then jump to the shared interrupt helper
    for (InterruptHandler& handler : interruptHandlers)
    {
        build.setLabel(handler.self);
        build.mov(x0, (handler.pcpos + 1) * sizeof(Instruction));
        build.adr(x1, handler.next);
        build.b(helpers.interrupt);
    }

    if (build.logText)
        build.logAppend("; exit handlers\n");

    // One thunk per VM exit point
    for (ExitHandler& handler : exitHandlers)
    {
        if (handler.pcpos == kVmExitEntryGuardPc)
        {
            build.setLabel(handler.self);

            allocAndIncrementCounterAt(CodeGenCounter::VmExitTaken, ~0u);

            // Entry-guard exit continues in the VM without updating pc
            build.b(helpers.exitContinueVmClearNativeFlag);
        }
        else
        {
            build.setLabel(handler.self);

            allocAndIncrementCounterAt(CodeGenCounter::VmExitTaken, handler.pcpos);

            // Regular exit: pass the pc byte offset and continue in the VM
            build.mov(x0, handler.pcpos * sizeof(Instruction));
            build.b(helpers.updatePcAndContinueInVm);
        }
    }

    // An undefined instruction is placed after the function to be used as an aborting jump offset
    function.endLocation = build.getLabelOffset(build.setLabel());
    build.udf();

    if (stats)
    {
        if (error)
            stats->loweringErrors++;

        if (regs.error)
            stats->regAllocErrors++;
    }
}
3237
3238
bool IrLoweringA64::hasError() const
3239
{
3240
return error || regs.error;
3241
}
3242
3243
bool IrLoweringA64::isFallthroughBlock(const IrBlock& target, const IrBlock& next)
3244
{
3245
return target.start == next.start;
3246
}
3247
3248
void IrLoweringA64::jumpOrFallthrough(IrBlock& target, const IrBlock& next)
{
    // No branch is needed when execution falls through into the target block
    if (isFallthroughBlock(target, next))
        return;

    build.b(target.label);
}
3253
3254
Label& IrLoweringA64::getTargetLabel(IrOp op, Label& fresh)
{
    // Undefined target: caller will finalize via the fresh label (abort path)
    if (op.kind == IrOpKind::Undef)
        return fresh;

    // Regular block target: use the block's label directly
    if (op.kind != IrOpKind::VmExit)
        return labelOp(op);

    // VM exit: reuse an already-registered exit handler for this exit point when one exists
    if (uint32_t* handlerIndex = exitHandlerMap.find(vmExitOp(op)))
        return exitHandlers[*handlerIndex].self;

    // Otherwise the caller will register a new handler through finalizeTargetLabel
    return fresh;
}
3269
3270
void IrLoweringA64::finalizeTargetLabel(IrOp op, Label& fresh)
{
    // A guard with an undefined target aborts execution
    if (op.kind == IrOpKind::Undef)
    {
        emitAbort(build, fresh);
        return;
    }

    // Register a new VM exit handler only when the fresh label was actually taken (id assigned)
    if (op.kind == IrOpKind::VmExit && fresh.id != 0)
    {
        uint32_t handlerIndex = uint32_t(exitHandlers.size());

        exitHandlerMap[vmExitOp(op)] = handlerIndex;
        exitHandlers.push_back({fresh, vmExitOp(op)});
    }
}
3282
3283
void IrLoweringA64::checkSafeEnv(IrOp target, const IrBlock& next)
{
    Label fresh; // used when guard aborts execution or jumps to a VM exit

    RegisterA64 envReg = regs.allocTemp(KindA64::x);
    RegisterA64 envRegW = castReg(KindA64::w, envReg);

    // Load the closure environment and branch to the guard target unless its safeenv flag is set
    build.ldr(envReg, mem(rClosure, offsetof(Closure, env)));
    build.ldrb(envRegW, mem(envReg, offsetof(LuaTable, safeenv)));
    build.cbz(envRegW, getTargetLabel(target, fresh));

    finalizeTargetLabel(target, fresh);
}
3293
3294
// Reserves a counter record in extraNativeData and emits code that increments it.
// Layout of each record (pushed as 32-bit entries):
// {uint32_t, uint32_t, uint64_t}
//   kind, pcpos, 64-bit counter value (two zero-initialized 32-bit entries)
void IrLoweringA64::allocAndIncrementCounterAt(CodeGenCounter kind, uint32_t pcpos)
{
    if (!function.recordCounters)
        return;

    if (build.logText)
        build.logAppend("; counter kind %u at pcpos %d\n", unsigned(kind), pcpos);

    function.extraNativeData.push_back(unsigned(kind));
    function.extraNativeData.push_back(pcpos);
    // Note: offset is captured before the counter slot is pushed, so it addresses the two entries below
    incrementCounterAt(function.extraNativeData.size());
    function.extraNativeData.push_back(0);
    function.extraNativeData.push_back(0);
}
3309
3310
// Emits a 64-bit load/add/store increment of the counter stored 'offset' 32-bit words
// past the bytecode area of the proto's execdata. Not atomic — assumes best-effort counting.
void IrLoweringA64::incrementCounterAt(size_t offset)
{
    RegisterA64 temp1 = regs.allocTemp(KindA64::x);
    RegisterA64 temp2 = regs.allocTemp(KindA64::x);

    // Get counter slot
    build.ldr(temp1, mem(rClosure, offsetof(Closure, l.p)));
    build.ldr(temp1, mem(temp1, offsetof(Proto, execdata)));
    // Counter data is placed after sizecode 32-bit entries, hence the *4 scaling
    emitAddOffset(build, temp2, temp1, (unsigned(function.proto->sizecode) + offset) * 4);

    // Increment
    build.ldr(temp1, temp2);
    build.add(temp1, temp1, uint16_t(1));
    build.str(temp1, temp2);

    regs.freeTemp(temp1);
    regs.freeTemp(temp2);
}
3328
3329
// Emits the GC write-barrier precondition checks; branches to 'skip' whenever the
// barrier is not required. 'ra' may hold the value in a vector register (Inst form)
// or be addressed through raOp; 'ratag' is the statically-known tag or -1.
void IrLoweringA64::checkObjectBarrierConditions(RegisterA64 object, RegisterA64 temp, RegisterA64 ra, IrOp raOp, int ratag, Label& skip)
{
    RegisterA64 tempw = castReg(KindA64::w, temp);

    // iscollectable(ra)
    // Elided when the tag is statically known to be a GC object type
    if (ratag == -1 || !isGCO(ratag))
    {
        if (raOp.kind == IrOpKind::Inst)
        {
            // TValue is held in a vector register; the tag is in lane 3
            build.umov_4s(tempw, ra, 3);
        }
        else
        {
            AddressA64 addr = tempAddr(raOp, offsetof(TValue, tt), temp);
            build.ldr(tempw, addr);
        }

        // Tags below LUA_TSTRING are not collectable; skip the barrier for those
        build.cmp(tempw, uint16_t(LUA_TSTRING));
        build.b(ConditionA64::Less, skip);
    }

    // isblack(obj2gco(o))
    build.ldrb(tempw, mem(object, offsetof(GCheader, marked)));
    build.tbz(tempw, BLACKBIT, skip);

    // iswhite(gcvalue(ra))
    if (raOp.kind == IrOpKind::Inst)
    {
        // Extract the pointer payload from the low 64 bits of the vector register
        build.fmov(temp, castReg(KindA64::d, ra));
    }
    else
    {
        AddressA64 addr = tempAddr(raOp, offsetof(TValue, value), temp);
        build.ldr(temp, addr);
    }

    build.ldrb(tempw, mem(temp, offsetof(GCheader, marked)));
    build.tst(tempw, bit2mask(WHITE0BIT, WHITE1BIT));
    build.b(ConditionA64::Equal, skip); // Equal = Zero after tst
}
3369
3370
// Returns a 'd' register holding the double value of 'op'; instruction operands are
// used in place, constants are materialized with the cheapest available encoding.
RegisterA64 IrLoweringA64::tempDouble(IrOp op)
{
    if (op.kind == IrOpKind::Inst)
        return regOp(op);
    else if (op.kind == IrOpKind::Constant)
    {
        double val = doubleOp(op);

        if (AssemblyBuilderA64::isFmovSupportedFp64(val))
        {
            // Constant fits the fmov immediate encoding
            RegisterA64 temp = regs.allocTemp(KindA64::d);
            build.fmov(temp, val);
            return temp;
        }
        else
        {
            RegisterA64 temp1 = regs.allocTemp(KindA64::x);
            RegisterA64 temp2 = regs.allocTemp(KindA64::d);

            uint64_t vali = getDoubleBits(val);

            if ((vali << 16) == 0)
            {
                // Only the top 16 bits of the pattern are set: single movz + fmov
                build.movz(temp1, uint16_t(vali >> 48), 48);
                build.fmov(temp2, temp1);
            }
            else if ((vali << 32) == 0)
            {
                // Only the top 32 bits of the pattern are set: movz + movk + fmov
                build.movz(temp1, uint16_t(vali >> 48), 48);
                build.movk(temp1, uint16_t(vali >> 32), 32);
                build.fmov(temp2, temp1);
            }
            else
            {
                // General case: load the constant through a pc-relative address
                build.adr(temp1, val);
                build.ldr(temp2, temp1);
            }

            return temp2;
        }
    }
    else
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }
}
3417
3418
// Returns an 's' register holding the float value of 'op'; IR constants are stored
// as doubles and narrowed here, then materialized with the cheapest available encoding.
RegisterA64 IrLoweringA64::tempFloat(IrOp op)
{
    if (op.kind == IrOpKind::Inst)
        return regOp(op);
    else if (op.kind == IrOpKind::Constant)
    {
        float val = float(doubleOp(op));

        if (AssemblyBuilderA64::isFmovSupportedFp32(val))
        {
            // Constant fits the fmov immediate encoding
            RegisterA64 temp = regs.allocTemp(KindA64::s);
            build.fmov(temp, val);
            return temp;
        }
        else
        {
            RegisterA64 temp = regs.allocTemp(KindA64::s);

            uint32_t vali = getFloatBits(val);

            if ((vali & 0xffff) == 0)
            {
                // Low half of the bit pattern is zero: single movz + fmov
                RegisterA64 temp2 = regs.allocTemp(KindA64::w);

                build.movz(temp2, uint16_t(vali >> 16), 16);
                build.fmov(temp, temp2);
            }
            else
            {
                // General case: load the constant through a pc-relative address
                RegisterA64 temp2 = regs.allocTemp(KindA64::x);

                build.adr(temp2, val);
                build.ldr(temp, temp2);
            }

            return temp;
        }
    }
    else
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }
}
3462
3463
RegisterA64 IrLoweringA64::tempInt(IrOp op)
{
    // Instruction results are already register-resident
    if (op.kind == IrOpKind::Inst)
        return regOp(op);

    if (op.kind != IrOpKind::Constant)
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }

    // Materialize the integer constant into a fresh 'w' temporary
    RegisterA64 result = regs.allocTemp(KindA64::w);
    build.mov(result, intOp(op));
    return result;
}
3479
3480
RegisterA64 IrLoweringA64::tempUint(IrOp op)
{
    // Instruction results are already register-resident
    if (op.kind == IrOpKind::Inst)
        return regOp(op);

    if (op.kind != IrOpKind::Constant)
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }

    // Materialize the constant into a fresh 'w' temporary, reinterpreted as unsigned
    RegisterA64 result = regs.allocTemp(KindA64::w);
    build.mov(result, unsigned(intOp(op)));
    return result;
}
3496
3497
// Computes an address for a TValue operand plus 'offset'; allocates (or reuses
// 'tempStorage') an 'x' temporary only when a VmConst offset exceeds the directly
// encodable addressing range.
AddressA64 IrLoweringA64::tempAddr(IrOp op, int offset, RegisterA64 tempStorage)
{
    // This is needed to tighten the bounds checks in the VmConst case below
    CODEGEN_ASSERT(offset % 4 == 0);
    // Full encoded range is wider depending on the load size, but this assertion helps establish a smaller guaranteed working range [0..4096)
    CODEGEN_ASSERT(offset >= 0 && unsigned(offset / 4) <= AssemblyBuilderA64::kMaxImmediate);

    if (op.kind == IrOpKind::VmReg)
    {
        return mem(rBase, vmRegOp(op) * sizeof(TValue) + offset);
    }
    else if (op.kind == IrOpKind::VmConst)
    {
        size_t constantOffset = vmConstOp(op) * sizeof(TValue) + offset;

        // Note: cumulative offset is guaranteed to be divisible by 4; we can use that to expand the useful range that doesn't require temporaries
        if (constantOffset / 4 <= AddressA64::kMaxOffset)
            return mem(rConstants, int(constantOffset));

        // Out-of-range offset: compute the address into a temporary register
        RegisterA64 temp = tempStorage == noreg ? regs.allocTemp(KindA64::x) : tempStorage;
        CODEGEN_ASSERT(temp.kind == KindA64::x && "temp storage, when provided, must be an 'x' register");

        emitAddOffset(build, temp, rConstants, constantOffset);
        return temp;
    }
    // If we have a register, we assume it's a pointer to TValue
    else if (op.kind == IrOpKind::Inst)
    {
        CODEGEN_ASSERT(getCmdValueKind(function.instOp(op).cmd) == IrValueKind::Pointer);
        return mem(regOp(op), offset);
    }
    else
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }
}
3534
3535
// Computes an address into a buffer/userdata payload: object base + element index +
// the payload ('data') header offset selected by 'tag'.
AddressA64 IrLoweringA64::tempAddrBuffer(IrOp bufferOp, IrOp indexOp, uint8_t tag)
{
    CODEGEN_ASSERT(tag == LUA_TUSERDATA || tag == LUA_TBUFFER);
    int dataOffset = tag == LUA_TBUFFER ? offsetof(Buffer, data) : offsetof(Udata, data);

    if (indexOp.kind == IrOpKind::Inst)
    {
        // The index is zero-extended from 32 bits; its producer must leave the high bits clean
        CODEGEN_ASSERT(!producesDirtyHighRegisterBits(function.instOp(indexOp).cmd));

        RegisterA64 temp = regs.allocTemp(KindA64::x);
        build.add(temp, regOp(bufferOp), regOp(indexOp)); // implicit uxtw
        return mem(temp, dataOffset);
    }
    else if (indexOp.kind == IrOpKind::Constant)
    {
        // Since the resulting address may be used to load any size, including 1 byte, from an unaligned offset, we are limited by unscaled
        // encoding
        if (unsigned(intOp(indexOp)) + dataOffset <= 255)
            return mem(regOp(bufferOp), int(intOp(indexOp) + dataOffset));

        // indexOp can only be negative in dead code (since offsets are checked); this avoids assertion in emitAddOffset
        if (intOp(indexOp) < 0)
            return mem(regOp(bufferOp), dataOffset);

        // Large constant index: fold it into a temporary base register first
        RegisterA64 temp = regs.allocTemp(KindA64::x);
        emitAddOffset(build, temp, regOp(bufferOp), size_t(intOp(indexOp)));
        return mem(temp, dataOffset);
    }
    else
    {
        CODEGEN_ASSERT(!"Unsupported instruction form");
        return noreg;
    }
}
3569
3570
RegisterA64 IrLoweringA64::regOp(IrOp op)
{
    IrInst& source = function.instOp(op);

    // Values that were spilled or need a reload are restored into a register before use
    bool needsRestore = source.spilled || source.needsReload;

    if (needsRestore)
        regs.restoreReg(source);

    CODEGEN_ASSERT(source.regA64 != noreg);
    return source.regA64;
}
3580
3581
IrConst IrLoweringA64::constOp(IrOp op) const
{
    // Thin forwarder to the IR function's constant accessor
    IrConst value = function.constOp(op);
    return value;
}
3585
3586
uint8_t IrLoweringA64::tagOp(IrOp op) const
{
    // Thin forwarder to the IR function's tag accessor
    uint8_t tag = function.tagOp(op);
    return tag;
}
3590
3591
int IrLoweringA64::intOp(IrOp op) const
{
    // Thin forwarder to the IR function's signed integer accessor
    int value = function.intOp(op);
    return value;
}
3595
3596
unsigned IrLoweringA64::uintOp(IrOp op) const
{
    // Thin forwarder to the IR function's unsigned integer accessor
    unsigned value = function.uintOp(op);
    return value;
}
3600
3601
unsigned IrLoweringA64::importOp(IrOp op) const
{
    // Thin forwarder to the IR function's import accessor
    unsigned value = function.importOp(op);
    return value;
}
3605
3606
double IrLoweringA64::doubleOp(IrOp op) const
{
    // Thin forwarder to the IR function's double-constant accessor
    double value = function.doubleOp(op);
    return value;
}
3610
3611
IrBlock& IrLoweringA64::blockOp(IrOp op) const
{
    // Thin forwarder to the IR function's block accessor
    IrBlock& block = function.blockOp(op);
    return block;
}
3615
3616
Label& IrLoweringA64::labelOp(IrOp op) const
{
    // Label of the block this operand refers to
    IrBlock& block = blockOp(op);
    return block.label;
}
3620
3621
} // namespace A64
3622
} // namespace CodeGen
3623
} // namespace Luau
3624
3625