Path: blob/main/winch/codegen/src/isa/x64/masm.rs
1693 views
use super::{1RegAlloc,2abi::X64ABI,3address::Address,4asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},5regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},6};7use anyhow::{Result, anyhow, bail};89use crate::masm::{10DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm as I, IntCmpKind,11IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm,12RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, ShiftKind, SplatKind,13StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, V128AbsKind, V128AddKind,14V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind, V128MinKind,15V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, VectorCompareKind,16VectorEqualityKind, Zero,17};18use crate::{19abi::{self, LocalSlot, align_to, calculate_frame_adjustment},20codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},21stack::{TypedReg, Val},22};23use crate::{24abi::{ABI, vmctx},25masm::{SPOffset, StackSlot},26};27use crate::{28isa::{29CallingConvention,30reg::{Reg, RegClass, WritableReg, writable},31},32masm::CalleeKind,33};34use cranelift_codegen::{35Final, MachBufferFinalized, MachLabel,36binemit::CodeOffset,37ir::{MemFlags, RelSourceLoc, SourceLoc},38isa::{39unwind::UnwindInst,40x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},41},42settings,43};44use wasmtime_cranelift::TRAP_UNREACHABLE;45use wasmtime_environ::{PtrSize, WasmValType};4647// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`48// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we49// need to fix up the bits that migrate from one half of the lane to the50// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift51// right by 0 (no movement), we want to retain all the bits so we mask with52// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so53// we mask with `0x7f`; etc.5455#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.56const I8X16_ISHL_MASKS: [u8; 128] = [570xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,580xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,590xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,600xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,610xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,620xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,630xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,640x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,65];6667#[rustfmt::skip] // Preserve 16 bytes (i.e. 
one mask) per row.68const I8X16_USHR_MASKS: [u8; 128] = [690xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,700x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,710x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,720x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,730x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,740x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,750x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,760x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,77];7879/// x64 MacroAssembler.80pub(crate) struct MacroAssembler {81/// Stack pointer offset.82sp_offset: u32,83/// This value represents the maximum stack size seen while compiling the function. While the84/// function is still being compiled its value will not be valid (the stack will grow and85/// shrink as space is reserved and freed during compilation), but once all instructions have86/// been seen this value will be the maximum stack usage seen.87sp_max: u32,88/// Add instructions that are used to add the constant stack max to a register.89stack_max_use_add: Option<PatchableAddToReg>,90/// Low level assembler.91asm: Assembler,92/// ISA flags.93flags: x64_settings::Flags,94/// Shared flags.vmcontext_store_context95shared_flags: settings::Flags,96/// The target pointer size.97ptr_size: OperandSize,98/// Scratch register scope.99scratch_scope: RegAlloc,100}101102impl Masm for MacroAssembler {103type Address = Address;104type Ptr = u8;105type ABI = X64ABI;106107fn frame_setup(&mut self) -> Result<()> {108let frame_pointer = rbp();109let stack_pointer = rsp();110111self.asm.push_r(frame_pointer);112113if self.shared_flags.unwind_info() {114self.asm.unwind_inst(UnwindInst::PushFrameRegs {115offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),116})117}118119self.asm120.mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);121122Ok(())123}124125fn check_stack(&mut self, vmctx: Reg) -> Result<()> {126let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();127128self.with_scratch::<IntScratch, _>(|masm, scratch| {129masm.load_ptr(130masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,131scratch.writable(),132)?;133134masm.load_ptr(135Address::offset(136scratch.inner(),137ptr_size.vmstore_context_stack_limit().into(),138),139scratch.writable(),140)?;141142masm.add_stack_max(scratch.inner());143144masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);145masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);146anyhow::Ok(())147})?;148149// Emit unwind info.150if self.shared_flags.unwind_info() {151self.asm.unwind_inst(UnwindInst::DefineNewFrame {152offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),153154// The Winch calling convention has no callee-save registers, so nothing will be155// clobbered.156offset_downward_to_clobbers: 0,157})158}159Ok(())160}161162fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {163let bytes = match (reg.class(), size) {164(RegClass::Int, OperandSize::S64) => {165let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;166self.asm.push_r(reg);167self.increment_sp(word_bytes);168word_bytes169}170(RegClass::Int, OperandSize::S32) => {171let bytes = 
size.bytes();172self.reserve_stack(bytes)?;173let sp_offset = SPOffset::from_u32(self.sp_offset);174self.asm175.mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);176bytes177}178(RegClass::Float, _) => {179let bytes = size.bytes();180self.reserve_stack(bytes)?;181let sp_offset = SPOffset::from_u32(self.sp_offset);182self.asm183.xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);184bytes185}186_ => unreachable!(),187};188189Ok(StackSlot {190offset: SPOffset::from_u32(self.sp_offset),191size: bytes,192})193}194195fn reserve_stack(&mut self, bytes: u32) -> Result<()> {196if bytes == 0 {197return Ok(());198}199200self.asm201.sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);202self.increment_sp(bytes);203204Ok(())205}206207fn free_stack(&mut self, bytes: u32) -> Result<()> {208if bytes == 0 {209return Ok(());210}211self.asm212.add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);213self.decrement_sp(bytes);214215Ok(())216}217218fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {219self.sp_offset = offset.as_u32();220221Ok(())222}223224fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {225let (reg, offset) = if local.addressed_from_sp() {226let offset = self227.sp_offset228.checked_sub(local.offset)229.ok_or_else(|| CodeGenError::invalid_local_offset())?;230(rsp(), offset)231} else {232(rbp(), local.offset)233};234235Ok(Address::offset(reg, offset))236}237238fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {239Ok(Address::offset(240regs::rsp(),241self.sp_offset - offset.as_u32(),242))243}244245fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {246Ok(Address::offset(regs::rsp(), offset.as_u32()))247}248249fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {250Ok(Address::offset(vmctx!(Self), offset))251}252253fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {254self.store(src.into(), dst, self.ptr_size)255}256257fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {258self.store_impl(src, dst, size, TRUSTED_FLAGS)259}260261fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {262match kind {263StoreKind::Operand(size) => {264self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;265}266StoreKind::Atomic(size) => {267if size == OperandSize::S128 {268// TODO: we don't support 128-bit atomic store yet.269bail!(CodeGenError::unexpected_operand_size());270}271// To stay consistent with cranelift, we emit a normal store followed by a mfence,272// although, we could probably just emit a xchg.273self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;274self.asm.mfence();275}276StoreKind::VectorLane(LaneSelector { lane, size }) => {277self.ensure_has_avx()?;278self.asm279.xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);280}281}282283Ok(())284}285286fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {287let current_sp = SPOffset::from_u32(self.sp_offset);288let _ = match (dst.to_reg().class(), size) {289(RegClass::Int, OperandSize::S32) => {290let addr = self.address_from_sp(current_sp)?;291self.asm.movzx_mr(292&addr,293dst,294size.extend_to::<Zero>(OperandSize::S64),295TRUSTED_FLAGS,296);297self.free_stack(size.bytes())?;298}299(RegClass::Int, OperandSize::S64) => {300self.asm.pop_r(dst);301self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);302}303(RegClass::Float, _) | (RegClass::Vector, _) => {304let addr = 
self.address_from_sp(current_sp)?;305self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);306self.free_stack(size.bytes())?;307}308_ => bail!(CodeGenError::invalid_operand_combination()),309};310Ok(())311}312313fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {314let r = self315.scratch_scope316.reg_for_class(T::reg_class(), &mut |_| Ok(()))317.expect("Scratch register to be available");318319let ret = f(self, Scratch::new(r));320self.scratch_scope.free(r);321ret322}323324fn call(325&mut self,326stack_args_size: u32,327mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,328) -> Result<u32> {329let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();330let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();331let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);332let aligned_args_size = align_to(stack_args_size, alignment);333let total_stack = delta + aligned_args_size;334self.reserve_stack(total_stack)?;335let (callee, cc) = load_callee(self)?;336match callee {337CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),338CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),339};340Ok(total_stack)341}342343fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {344self.load(src, dst, self.ptr_size)345}346347fn compute_addr(348&mut self,349src: Self::Address,350dst: WritableReg,351size: OperandSize,352) -> Result<()> {353self.asm.lea(&src, dst, size);354Ok(())355}356357fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {358self.load_impl(src, dst, size, TRUSTED_FLAGS)359}360361fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {362let size = kind.derive_operand_size();363364match kind {365LoadKind::ScalarExtend(ext) => match ext {366ExtendKind::Signed(ext) => {367self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);368}369ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,370},371LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {372// The guarantees of the x86-64 memory model ensure that `SeqCst`373// loads are equivalent to normal loads.374if kind.is_atomic() && size == OperandSize::S128 {375bail!(CodeGenError::unexpected_operand_size());376}377378self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;379}380LoadKind::VectorExtend(ext) => {381self.ensure_has_avx()?;382self.asm383.xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)384}385LoadKind::Splat(_) => {386self.ensure_has_avx()?;387388if size == OperandSize::S64 {389self.asm390.xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);391self.asm.xmm_vpshuf_rr(392dst.to_reg(),393dst,394Self::vpshuf_mask_for_64_bit_splats(),395OperandSize::S32,396);397} else {398self.asm399.xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);400}401}402LoadKind::VectorLane(LaneSelector { lane, size }) => {403self.ensure_has_avx()?;404self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {405masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;406masm.asm407.xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);408anyhow::Ok(())409})?;410}411LoadKind::VectorZero(size) => {412self.ensure_has_avx()?;413self.with_scratch::<IntScratch, _>(|masm, scratch| {414masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;415masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);416anyhow::Ok(())417})?;418}419}420421Ok(())422}423424fn sp_offset(&self) -> Result<SPOffset> 
{425Ok(SPOffset::from_u32(self.sp_offset))426}427428fn zero(&mut self, reg: WritableReg) -> Result<()> {429self.asm.xor_rr(430reg.to_reg(),431reg,432OperandSize::from_bytes(<Self::ABI>::word_bytes()),433);434Ok(())435}436437fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {438match (src, dst.to_reg()) {439(RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {440(RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),441(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),442_ => bail!(CodeGenError::invalid_operand_combination()),443},444(RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),445}446}447448fn cmov(449&mut self,450dst: WritableReg,451src: Reg,452cc: IntCmpKind,453size: OperandSize,454) -> Result<()> {455match (src.class(), dst.to_reg().class()) {456(RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),457(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),458_ => Err(anyhow!(CodeGenError::invalid_operand_combination())),459}460}461462fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {463Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;464match (rhs, dst) {465(RegImm::Imm(imm), _) => {466if let Some(v) = imm.to_i32() {467self.asm.add_ir(v, dst, size);468} else {469self.with_scratch::<IntScratch, _>(|masm, scratch| {470masm.load_constant(&imm, scratch.writable(), size)?;471masm.asm.add_rr(scratch.inner(), dst, size);472anyhow::Ok(())473})?;474}475}476477(RegImm::Reg(src), dst) => {478self.asm.add_rr(src, dst, size);479}480}481482Ok(())483}484485fn checked_uadd(486&mut self,487dst: WritableReg,488lhs: Reg,489rhs: RegImm,490size: OperandSize,491trap: TrapCode,492) -> Result<()> {493self.add(dst, lhs, rhs, size)?;494self.asm.trapif(CC::B, trap);495Ok(())496}497498fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {499Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;500match (rhs, dst) {501(RegImm::Imm(imm), reg) => {502if let Some(v) = imm.to_i32() {503self.asm.sub_ir(v, reg, size);504} else {505self.with_scratch::<IntScratch, _>(|masm, scratch| {506masm.load_constant(&imm, scratch.writable(), size)?;507masm.asm.sub_rr(scratch.inner(), reg, size);508anyhow::Ok(())509})?;510}511}512513(RegImm::Reg(src), dst) => {514self.asm.sub_rr(src, dst, size);515}516}517518Ok(())519}520521fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {522Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;523match (rhs, dst) {524(RegImm::Imm(imm), _) => {525if let Some(v) = imm.to_i32() {526self.asm.mul_ir(v, dst, size);527} else {528self.with_scratch::<IntScratch, _>(|masm, scratch| {529masm.load_constant(&imm, scratch.writable(), size)?;530masm.asm.mul_rr(scratch.inner(), dst, size);531anyhow::Ok(())532})?;533}534}535536(RegImm::Reg(src), dst) => {537self.asm.mul_rr(src, dst, size);538}539}540541Ok(())542}543544fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {545Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;546self.asm.xmm_add_rr(rhs, dst, size);547Ok(())548}549550fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {551Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;552self.asm.xmm_sub_rr(rhs, dst, size);553Ok(())554}555556fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> 
{557Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;558self.asm.xmm_mul_rr(rhs, dst, size);559Ok(())560}561562fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {563Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;564self.asm.xmm_div_rr(rhs, dst, size);565Ok(())566}567568fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {569Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;570self.asm.xmm_min_seq(rhs, dst, size);571Ok(())572}573574fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {575Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;576self.asm.xmm_max_seq(rhs, dst, size);577Ok(())578}579580fn float_copysign(581&mut self,582dst: WritableReg,583lhs: Reg,584rhs: Reg,585size: OperandSize,586) -> Result<()> {587Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;588let sign_mask = match size {589OperandSize::S32 => I::I32(0x80000000),590OperandSize::S64 => I::I64(0x8000000000000000),591OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {592bail!(CodeGenError::unexpected_operand_size())593}594};595596self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {597masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {598masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;599masm.asm600.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);601602// Clear everything except sign bit in src.603masm.asm604.xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);605606// Clear sign bit in dst using scratch to store result. Then copy the607// result back to dst.608masm.asm609.xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);610masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);611612// Copy sign bit from src to dst.613masm.asm.xmm_or_rr(rhs, dst, size);614Ok(())615})616})617}618619fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {620debug_assert_eq!(dst.to_reg().class(), RegClass::Float);621let mask = match size {622OperandSize::S32 => I::I32(0x80000000),623OperandSize::S64 => I::I64(0x8000000000000000),624OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {625bail!(CodeGenError::unexpected_operand_size())626}627};628self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {629masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {630masm.load_constant(&mask, scratch_gpr.writable(), size)?;631masm.asm632.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);633masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);634Ok(())635})636})637}638639fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {640debug_assert_eq!(dst.to_reg().class(), RegClass::Float);641let mask = match size {642OperandSize::S32 => I::I32(0x7fffffff),643OperandSize::S64 => I::I64(0x7fffffffffffffff),644OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {645bail!(CodeGenError::unexpected_operand_size())646}647};648649self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {650masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {651masm.load_constant(&mask, scratch_gpr.writable(), size)?;652653masm.asm654.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);655masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);656Ok(())657})658})659}660661fn float_round<662F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,663>(664&mut self,665mode: RoundingMode,666env: &mut FuncEnv<Self::Ptr>,667context: &mut CodeGenContext<Emission>,668size: 
OperandSize,669mut fallback: F,670) -> Result<()> {671if self.flags.has_sse41() {672let src = context.pop_to_reg(self, None)?;673self.asm674.xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);675context.stack.push(src.into());676Ok(())677} else {678fallback(env, context, self)679}680}681682fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {683self.asm.sqrt(src, dst, size);684Ok(())685}686687fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {688Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;689match (rhs, dst) {690(RegImm::Imm(imm), _) => {691if let Some(v) = imm.to_i32() {692self.asm.and_ir(v, dst, size);693} else {694self.with_scratch::<IntScratch, _>(|masm, scratch| {695masm.load_constant(&imm, scratch.writable(), size)?;696masm.asm.and_rr(scratch.inner(), dst, size);697anyhow::Ok(())698})?;699}700}701702(RegImm::Reg(src), dst) => {703self.asm.and_rr(src, dst, size);704}705}706707Ok(())708}709710fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {711Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;712match (rhs, dst) {713(RegImm::Imm(imm), _) => {714if let Some(v) = imm.to_i32() {715self.asm.or_ir(v, dst, size);716} else {717self.with_scratch::<IntScratch, _>(|masm, scratch| {718masm.load_constant(&imm, scratch.writable(), size)?;719masm.asm.or_rr(scratch.inner(), dst, size);720anyhow::Ok(())721})?;722}723}724725(RegImm::Reg(src), dst) => {726self.asm.or_rr(src, dst, size);727}728}729730Ok(())731}732733fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {734Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;735match (rhs, dst) {736(RegImm::Imm(imm), _) => {737if let Some(v) = imm.to_i32() {738self.asm.xor_ir(v, dst, size);739} else {740self.with_scratch::<IntScratch, _>(|masm, scratch| {741masm.load_constant(&imm, scratch.writable(), size)?;742masm.asm.xor_rr(scratch.inner(), dst, size);743anyhow::Ok(())744})?;745}746}747748(RegImm::Reg(src), _) => {749self.asm.xor_rr(src, dst, size);750}751}752753Ok(())754}755756fn shift_ir(757&mut self,758dst: WritableReg,759imm: I,760lhs: Reg,761kind: ShiftKind,762size: OperandSize,763) -> Result<()> {764Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;765self.asm766.shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);767Ok(())768}769770fn shift(771&mut self,772context: &mut CodeGenContext<Emission>,773kind: ShiftKind,774size: OperandSize,775) -> Result<()> {776// Number of bits to shift must be in the CL register.777let src = context.pop_to_reg(self, Some(regs::rcx()))?;778let dst = context.pop_to_reg(self, None)?;779780self.asm781.shift_rr(src.into(), writable!(dst.into()), kind, size);782783context.free_reg(src);784context.stack.push(dst.into());785786Ok(())787}788789fn div(790&mut self,791context: &mut CodeGenContext<Emission>,792kind: DivKind,793size: OperandSize,794) -> Result<()> {795// Allocate rdx:rax.796let rdx = context.reg(regs::rdx(), self)?;797let rax = context.reg(regs::rax(), self)?;798799// Allocate the divisor, which can be any gpr.800let divisor = context.pop_to_reg(self, None)?;801802// Mark rax as allocatable.803context.free_reg(rax);804// Move the top value to rax.805let rax = context.pop_to_reg(self, Some(rax))?;806self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);807808// Free the divisor and rdx.809context.free_reg(divisor);810context.free_reg(rdx);811812// Push the quotient.813context.stack.push(rax.into());814Ok(())815}816817fn 
rem(818&mut self,819context: &mut CodeGenContext<Emission>,820kind: RemKind,821size: OperandSize,822) -> Result<()> {823// Allocate rdx:rax.824let rdx = context.reg(regs::rdx(), self)?;825let rax = context.reg(regs::rax(), self)?;826827// Allocate the divisor, which can be any gpr.828let divisor = context.pop_to_reg(self, None)?;829830// Mark rax as allocatable.831context.free_reg(rax);832// Move the top value to rax.833let rax = context.pop_to_reg(self, Some(rax))?;834self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);835836// Free the divisor and rax.837context.free_reg(divisor);838context.free_reg(rax);839840// Push the remainder.841context.stack.push(Val::reg(rdx, divisor.ty));842843Ok(())844}845846fn frame_restore(&mut self) -> Result<()> {847debug_assert_eq!(self.sp_offset, 0);848self.asm.pop_r(writable!(rbp()));849self.asm.ret();850Ok(())851}852853fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {854if let Some(patch) = self.stack_max_use_add {855patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());856}857858Ok(self.asm.finalize(base))859}860861fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {862Ok(Address::offset(reg, offset))863}864865fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {866match src2 {867RegImm::Imm(imm) => {868if let Some(v) = imm.to_i32() {869self.asm.cmp_ir(src1, v, size);870} else {871self.with_scratch::<IntScratch, _>(|masm, scratch| {872masm.load_constant(&imm, scratch.writable(), size)?;873masm.asm.cmp_rr(src1, scratch.inner(), size);874anyhow::Ok(())875})?;876}877}878RegImm::Reg(src2) => {879self.asm.cmp_rr(src1, src2, size);880}881}882883Ok(())884}885886fn cmp_with_set(887&mut self,888dst: WritableReg,889src: RegImm,890kind: IntCmpKind,891size: OperandSize,892) -> Result<()> {893self.cmp(dst.to_reg(), src, size)?;894self.asm.setcc(kind, dst);895Ok(())896}897898fn float_cmp_with_set(899&mut self,900dst: WritableReg,901src1: Reg,902src2: Reg,903kind: FloatCmpKind,904size: OperandSize,905) -> Result<()> {906// Float comparisons needs to be ordered (that is, comparing with a NaN907// should return 0) except for not equal which needs to be unordered.908// We use ucomis{s, d} because comis{s, d} has an undefined result if909// either operand is NaN. Since ucomis{s, d} is unordered, we need to910// compensate to make the comparison ordered. 
Ucomis{s, d} sets the911// ZF, PF, and CF flags if there is an unordered result.912let (src1, src2, set_kind) = match kind {913FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),914FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),915FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),916FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),917// Reversing the operands and using the complementary comparison918// avoids needing to perform an additional SETNP and AND919// instruction.920// SETNB and SETNBE check if the carry flag is unset (i.e., not921// less than and not unordered) so we get the intended result922// without having to look at the parity flag.923FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),924FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),925};926self.asm.ucomis(src1, src2, size);927self.asm.setcc(set_kind, dst);928let _ = match kind {929FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {930// Return false if either operand is NaN by ensuring PF is931// unset.932self.with_scratch::<IntScratch, _>(|masm, scratch| {933masm.asm.setnp(scratch.writable());934masm.asm.and_rr(scratch.inner(), dst, size);935});936}937FloatCmpKind::Ne => {938// Return true if either operand is NaN by checking if PF is939// set.940self.with_scratch::<IntScratch, _>(|masm, scratch| {941masm.asm.setp(scratch.writable());942masm.asm.or_rr(scratch.inner(), dst, size);943});944}945FloatCmpKind::Lt | FloatCmpKind::Le => (),946};947Ok(())948}949950fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {951if self.flags.has_lzcnt() {952self.asm.lzcnt(src, dst, size);953} else {954self.with_scratch::<IntScratch, _>(|masm, scratch| {955// Use the following approach:956// dst = size.num_bits() - bsr(src) - is_not_zero957// = size.num.bits() + -bsr(src) - is_not_zero.958masm.asm.bsr(src, dst, size);959masm.asm.setcc(IntCmpKind::Ne, scratch.writable());960masm.asm.neg(dst.to_reg(), dst, size);961masm.asm.add_ir(size.num_bits() as i32, dst, size);962masm.asm.sub_rr(scratch.inner(), dst, size);963});964}965966Ok(())967}968969fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {970if self.flags.has_bmi1() {971self.asm.tzcnt(src, dst, size);972} else {973self.with_scratch::<IntScratch, _>(|masm, scratch| {974// Use the following approach:975// dst = bsf(src) + (is_zero * size.num_bits())976// = bsf(src) + (is_zero << size.log2()).977// BSF outputs the correct value for every value except 0.978// When the value is 0, BSF outputs 0, correct output for ctz is979// the number of bits.980masm.asm.bsf(src, dst, size);981masm.asm.setcc(IntCmpKind::Eq, scratch.writable());982masm.asm983.shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);984masm.asm.add_rr(scratch.inner(), dst, size);985});986}987988Ok(())989}990991fn get_label(&mut self) -> Result<MachLabel> {992let buffer = self.asm.buffer_mut();993Ok(buffer.get_label())994}995996fn bind(&mut self, label: MachLabel) -> Result<()> {997let buffer = self.asm.buffer_mut();998buffer.bind_label(label, &mut Default::default());999Ok(())1000}10011002fn branch(1003&mut self,1004kind: IntCmpKind,1005lhs: Reg,1006rhs: RegImm,1007taken: MachLabel,1008size: OperandSize,1009) -> Result<()> {1010use IntCmpKind::*;10111012match &(lhs, rhs) {1013(rlhs, RegImm::Reg(rrhs)) => {1014// If the comparison kind is zero or not zero and both operands1015// are the same register, emit a test instruction. 
Else we emit1016// a normal comparison.1017if (kind == Eq || kind == Ne) && (rlhs == rrhs) {1018self.asm.test_rr(*rlhs, *rrhs, size);1019} else {1020self.cmp(lhs, rhs, size)?;1021}1022}1023_ => self.cmp(lhs, rhs, size)?,1024}1025self.asm.jmp_if(kind, taken);1026Ok(())1027}10281029fn jmp(&mut self, target: MachLabel) -> Result<()> {1030self.asm.jmp(target);1031Ok(())1032}10331034fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {1035let src = context.pop_to_reg(self, None)?;1036if self.flags.has_popcnt() && self.flags.has_sse42() {1037self.asm.popcnt(src.into(), writable!(src.into()), size);1038context.stack.push(src.into());1039Ok(())1040} else {1041// The fallback functionality here is based on `MacroAssembler::popcnt64` in:1042// https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#49510431044let tmp = writable!(context.any_gpr(self)?);1045let dst = writable!(src.into());1046let (masks, shift_amt) = match size {1047OperandSize::S64 => (1048[10490x5555555555555555, // m110500x3333333333333333, // m210510x0f0f0f0f0f0f0f0f, // m410520x0101010101010101, // h011053],105456u8,1055),1056// 32-bit popcount is the same, except the masks are half as1057// wide and we shift by 24 at the end rather than 561058OperandSize::S32 => (1059[0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],106024u8,1061),1062_ => bail!(CodeGenError::unexpected_operand_size()),1063};1064self.asm.mov_rr(src.into(), tmp, size);10651066// x -= (x >> 1) & m1;1067self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);1068let lhs = dst.to_reg();1069self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;1070self.asm.sub_rr(dst.to_reg(), tmp, size);10711072// x = (x & m2) + ((x >> 2) & m2);1073self.asm.mov_rr(tmp.to_reg(), dst, size);1074// Load `0x3333...` into the scratch reg once, allowing us to use1075// `and_rr` and avoid inadvertently loading it twice as with `and`10761077self.with_scratch::<IntScratch, _>(|masm, scratch| {1078masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;1079masm.asm.and_rr(scratch.inner(), dst, size);1080masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);1081masm.asm.and_rr(scratch.inner(), tmp, size);1082anyhow::Ok(())1083})?;1084self.asm.add_rr(dst.to_reg(), tmp, size);10851086// x = (x + (x >> 4)) & m4;1087self.asm.mov_rr(tmp.to_reg(), dst, size);1088self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);1089self.asm.add_rr(tmp.to_reg(), dst, size);1090let lhs = dst.to_reg();1091self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;10921093// (x * h01) >> shift_amt1094let lhs = dst.to_reg();1095self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;1096self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);10971098context.stack.push(src.into());1099context.free_reg(tmp.to_reg());11001101Ok(())1102}1103}11041105fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1106self.asm.mov_rr(src, dst, OperandSize::S32);1107Ok(())1108}11091110fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {1111match kind {1112ExtendKind::Signed(ext) => {1113self.asm.movsx_rr(src, dst, ext);1114}1115ExtendKind::Unsigned(ext) => {1116self.asm.movzx_rr(src, dst, ext);1117}1118}11191120Ok(())1121}11221123fn signed_truncate(1124&mut self,1125dst: WritableReg,1126src: Reg,1127src_size: OperandSize,1128dst_size: OperandSize,1129kind: TruncKind,1130) -> Result<()> {1131self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {1132masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| 
{1133masm.asm.cvt_float_to_sint_seq(1134src,1135dst,1136gpr_scratch.inner(),1137xmm_scratch.inner(),1138src_size,1139dst_size,1140kind.is_checked(),1141);1142Ok(())1143})1144})1145}11461147fn unsigned_truncate(1148&mut self,1149ctx: &mut CodeGenContext<Emission>,1150src_size: OperandSize,1151dst_size: OperandSize,1152kind: TruncKind,1153) -> Result<()> {1154let dst_ty = match dst_size {1155OperandSize::S32 => WasmValType::I32,1156OperandSize::S64 => WasmValType::I64,1157_ => bail!(CodeGenError::unexpected_operand_size()),1158};11591160ctx.convert_op_with_tmp_reg(1161self,1162dst_ty,1163RegClass::Float,1164|masm, dst, src, tmp_fpr, dst_size| {1165masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {1166masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {1167masm.asm.cvt_float_to_uint_seq(1168src,1169writable!(dst),1170gpr_scratch.inner(),1171xmm_scratch.inner(),1172tmp_fpr,1173src_size,1174dst_size,1175kind.is_checked(),1176);1177Ok(())1178})1179})1180},1181)1182}11831184fn signed_convert(1185&mut self,1186dst: WritableReg,1187src: Reg,1188src_size: OperandSize,1189dst_size: OperandSize,1190) -> Result<()> {1191self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);1192Ok(())1193}11941195fn unsigned_convert(1196&mut self,1197dst: WritableReg,1198src: Reg,1199tmp_gpr: Reg,1200src_size: OperandSize,1201dst_size: OperandSize,1202) -> Result<()> {1203// Need to convert unsigned uint32 to uint64 for conversion instruction sequence.1204if let OperandSize::S32 = src_size {1205self.extend(1206writable!(src),1207src,1208ExtendKind::Unsigned(Extend::I64Extend32),1209)?;1210}12111212self.with_scratch::<IntScratch, _>(|masm, scratch| {1213masm.asm1214.cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);1215Ok(())1216})1217}12181219fn reinterpret_float_as_int(1220&mut self,1221dst: WritableReg,1222src: Reg,1223size: OperandSize,1224) -> Result<()> {1225self.asm.xmm_to_gpr(src, dst, size);1226Ok(())1227}12281229fn reinterpret_int_as_float(1230&mut self,1231dst: WritableReg,1232src: Reg,1233size: OperandSize,1234) -> Result<()> {1235self.asm.gpr_to_xmm(src, dst, size);1236Ok(())1237}12381239fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1240self.asm1241.cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);1242Ok(())1243}12441245fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1246self.asm1247.cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);1248Ok(())1249}12501251fn unreachable(&mut self) -> Result<()> {1252self.asm.trap(TRAP_UNREACHABLE);1253Ok(())1254}12551256fn trap(&mut self, code: TrapCode) -> Result<()> {1257self.asm.trap(code);1258Ok(())1259}12601261fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {1262self.asm.trapif(cc, code);1263Ok(())1264}12651266fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {1267self.asm.test_rr(src, src, self.ptr_size);1268self.asm.trapif(IntCmpKind::Eq, code);1269Ok(())1270}12711272fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {1273// At least one default target.1274debug_assert!(targets.len() >= 1);1275let default_index = targets.len() - 1;1276// Emit bounds check, by conditionally moving the max cases1277// into the given index reg if the contents of the index reg1278// are greater.1279let max = default_index;1280let size = OperandSize::S32;1281self.asm.mov_ir(max as u64, writable!(tmp), size);1282self.asm.cmp_rr(tmp, index, size);1283self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);12841285let default = 
targets[default_index];1286let rest = &targets[0..default_index];12871288self.with_scratch::<IntScratch, _>(|masm, tmp1| {1289masm.asm1290.jmp_table(rest.into(), default, index, tmp1.inner(), tmp);1291Ok(())1292})1293}12941295fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {1296Ok(self.asm.buffer_mut().start_srcloc(loc))1297}12981299fn end_source_loc(&mut self) -> Result<()> {1300self.asm.buffer_mut().end_srcloc();1301Ok(())1302}13031304fn current_code_offset(&self) -> Result<CodeOffset> {1305Ok(self.asm.buffer().cur_offset())1306}13071308fn add128(1309&mut self,1310dst_lo: WritableReg,1311dst_hi: WritableReg,1312lhs_lo: Reg,1313lhs_hi: Reg,1314rhs_lo: Reg,1315rhs_hi: Reg,1316) -> Result<()> {1317Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;1318Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;1319self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);1320self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);1321Ok(())1322}13231324fn sub128(1325&mut self,1326dst_lo: WritableReg,1327dst_hi: WritableReg,1328lhs_lo: Reg,1329lhs_hi: Reg,1330rhs_lo: Reg,1331rhs_hi: Reg,1332) -> Result<()> {1333Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;1334Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;1335self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);1336self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);1337Ok(())1338}13391340fn mul_wide(1341&mut self,1342context: &mut CodeGenContext<Emission>,1343kind: MulWideKind,1344) -> Result<()> {1345// Reserve rax/rdx since they're required by the `mul_wide` instruction1346// being used here.1347let rax = context.reg(regs::rax(), self)?;1348let rdx = context.reg(regs::rdx(), self)?;13491350// The rhs of this binop can be in any register1351let rhs = context.pop_to_reg(self, None)?;1352// Mark rax as allocatable. 
and then force the lhs operand to be placed1353// in `rax`.1354context.free_reg(rax);1355let lhs = context.pop_to_reg(self, Some(rax))?;13561357self.asm.mul_wide(1358writable!(rax),1359writable!(rdx),1360lhs.reg,1361rhs.reg,1362kind,1363OperandSize::S64,1364);13651366// No longer using the rhs register after the multiplication has been1367// executed.1368context.free_reg(rhs);13691370// The low bits of the result are in rax, where `lhs` was allocated to1371context.stack.push(lhs.into());1372// The high bits of the result are in rdx, which we previously reserved.1373context.stack.push(Val::Reg(TypedReg::i64(rdx)));13741375Ok(())1376}13771378fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {1379// Get the source and destination operands set up first.1380let (src, dst) = match size {1381// Floats can use the same register for `src` and `dst`.1382SplatKind::F32x4 | SplatKind::F64x2 => {1383let reg = context.pop_to_reg(self, None)?.reg;1384(RegImm::reg(reg), writable!(reg))1385}1386// For ints, we need to load the operand into a vector register if1387// it's not a constant.1388SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {1389let dst = writable!(context.any_fpr(self)?);1390let src = if size == SplatKind::I64x2 {1391context.pop_i64_const().map(RegImm::i64)1392} else {1393context.pop_i32_const().map(RegImm::i32)1394}1395.map_or_else(1396|| -> Result<RegImm> {1397let reg = context.pop_to_reg(self, None)?.reg;1398self.reinterpret_int_as_float(1399dst,1400reg,1401match size {1402SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {1403OperandSize::S321404}1405SplatKind::I64x2 => OperandSize::S64,1406SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),1407},1408)?;1409context.free_reg(reg);1410Ok(RegImm::Reg(dst.to_reg()))1411},1412Ok,1413)?;1414(src, dst)1415}1416};14171418// Perform the splat on the operands.1419if size == SplatKind::I64x2 || size == SplatKind::F64x2 {1420self.ensure_has_avx()?;1421let mask = Self::vpshuf_mask_for_64_bit_splats();1422match src {1423RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),1424RegImm::Imm(imm) => {1425let src = self.asm.add_constant(&imm.to_bytes());1426self.asm1427.xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());1428}1429}1430} else {1431self.ensure_has_avx2()?;14321433match src {1434RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),1435RegImm::Imm(imm) => {1436let src = self.asm.add_constant(&imm.to_bytes());1437self.asm1438.xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());1439}1440}1441}14421443context1444.stack1445.push(Val::reg(dst.to_reg(), WasmValType::V128));1446Ok(())1447}14481449fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {1450self.ensure_has_avx()?;14511452// Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`1453// separately to either the selected index or 0.1454// Then use `vpor` to combine `lhs` and `rhs` into `dst`.1455// Setting the most significant bit in the mask's lane to 1 will1456// result in corresponding lane in the destination register being1457// set to 0. 
0x80 sets the most significant bit to 1.1458let mut mask_lhs: [u8; 16] = [0x80; 16];1459let mut mask_rhs: [u8; 16] = [0x80; 16];1460for i in 0..lanes.len() {1461if lanes[i] < 16 {1462mask_lhs[i] = lanes[i];1463} else {1464mask_rhs[i] = lanes[i] - 16;1465}1466}1467let mask_lhs = self.asm.add_constant(&mask_lhs);1468let mask_rhs = self.asm.add_constant(&mask_rhs);14691470self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);1471self.with_scratch::<FloatScratch, _>(|masm, scratch| {1472masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);1473masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());1474Ok(())1475})1476}14771478fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {1479self.ensure_has_avx()?;14801481// Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything1482// outside that range.1483// Each lane is a signed byte so the maximum value is 0x7F. Adding1484// 0x70 to any value higher than 0xF will saturate resulting in a value1485// of 0xFF (i.e., 0).1486let clamp = self.asm.add_constant(&[0x70; 16]);1487self.asm1488.xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);14891490// Don't need to subtract 0x70 since `vpshufb` uses the least1491// significant 4 bits which are the same after adding 0x70.1492self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);1493Ok(())1494}14951496fn atomic_rmw(1497&mut self,1498context: &mut CodeGenContext<Emission>,1499addr: Self::Address,1500size: OperandSize,1501op: RmwOp,1502flags: MemFlags,1503extend: Option<Extend<Zero>>,1504) -> Result<()> {1505let res = match op {1506RmwOp::Add => {1507let operand = context.pop_to_reg(self, None)?;1508self.asm1509.lock_xadd(addr, writable!(operand.reg), size, flags);1510operand.reg1511}1512RmwOp::Sub => {1513let operand = context.pop_to_reg(self, None)?;1514self.asm.neg(operand.reg, writable!(operand.reg), size);1515self.asm1516.lock_xadd(addr, writable!(operand.reg), size, flags);1517operand.reg1518}1519RmwOp::Xchg => {1520let operand = context.pop_to_reg(self, None)?;1521self.asm.xchg(addr, writable!(operand.reg), size, flags);1522operand.reg1523}1524RmwOp::And | RmwOp::Or | RmwOp::Xor => {1525let op = match op {1526RmwOp::And => AtomicRmwSeqOp::And,1527RmwOp::Or => AtomicRmwSeqOp::Or,1528RmwOp::Xor => AtomicRmwSeqOp::Xor,1529_ => unreachable!(1530"invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"1531),1532};1533let dst = context.reg(regs::rax(), self)?;1534let operand = context.pop_to_reg(self, None)?;15351536self.with_scratch::<IntScratch, _>(|masm, scratch| {1537masm.asm.atomic_rmw_seq(1538addr,1539operand.reg,1540writable!(dst),1541scratch.writable(),1542size,1543flags,1544op,1545);1546});15471548context.free_reg(operand.reg);1549dst1550}1551};15521553let dst_ty = match extend {1554Some(ext) => {1555// We don't need to zero-extend from 32 to 64bits.1556if !(ext.from_bits() == 32 && ext.to_bits() == 64) {1557self.asm.movzx_rr(res, writable!(res), ext);1558}15591560WasmValType::int_from_bits(ext.to_bits())1561}1562None => WasmValType::int_from_bits(size.num_bits()),1563};15641565context.stack.push(TypedReg::new(dst_ty, res).into());15661567Ok(())1568}15691570fn extract_lane(1571&mut self,1572src: Reg,1573dst: WritableReg,1574lane: u8,1575kind: ExtractLaneKind,1576) -> Result<()> {1577self.ensure_has_avx()?;15781579match kind {1580ExtractLaneKind::I8x16S1581| ExtractLaneKind::I8x16U1582| ExtractLaneKind::I16x8S1583| ExtractLaneKind::I16x8U1584| ExtractLaneKind::I32x41585| ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, 
kind.lane_size()),1586ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {1587// If the `src` and `dst` registers are the same, then the1588// appropriate value is already in the correct position in1589// the register.1590assert!(src == dst.to_reg());1591}1592ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),1593ExtractLaneKind::F64x2 => {1594// `0b11_10` selects the high and low 32-bits of the second1595// 64-bit, so `0b11_10_11_10` splats the 64-bit value across1596// both lanes. Since we put an `f64` on the stack, we use1597// the splatted value.1598// Double-check `lane == 0` was handled in another branch.1599assert!(lane == 1);1600self.asm1601.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)1602}1603}16041605// Sign-extend to 32-bits for sign extended kinds.1606match kind {1607ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {1608self.asm.movsx_rr(dst.to_reg(), dst, kind.into())1609}1610_ => (),1611}16121613Ok(())1614}16151616fn replace_lane(1617&mut self,1618src: RegImm,1619dst: WritableReg,1620lane: u8,1621kind: ReplaceLaneKind,1622) -> Result<()> {1623self.ensure_has_avx()?;16241625match kind {1626ReplaceLaneKind::I8x161627| ReplaceLaneKind::I16x81628| ReplaceLaneKind::I32x41629| ReplaceLaneKind::I64x2 => match src {1630RegImm::Reg(reg) => {1631self.asm1632.xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());1633}1634RegImm::Imm(imm) => {1635let address = self.asm.add_constant(&imm.to_bytes());1636self.asm1637.xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());1638}1639},1640ReplaceLaneKind::F32x4 => {1641// Immediate for `vinsertps` uses first 3 bits to determine1642// which elements of the destination to set to 0. The next 21643// bits specify which element of the destination will be1644// overwritten.1645let imm = lane << 4;1646match src {1647RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),1648RegImm::Imm(val) => {1649let address = self.asm.add_constant(&val.to_bytes());1650self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);1651}1652}1653}1654ReplaceLaneKind::F64x2 => match src {1655RegImm::Reg(reg) => match lane {16560 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),16571 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),1658_ => unreachable!(),1659},1660RegImm::Imm(imm) => {1661let address = self.asm.add_constant(&imm.to_bytes());1662match lane {16630 => {1664// Memory load variant of `vmovsd` zeroes the upper1665// 64 bits of the register so need to load the1666// immediate to a register to use the register1667// variant of `vmovsd` to perform the merge.16681669self.with_scratch::<FloatScratch, _>(|masm, scratch| {1670masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);1671masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());1672});1673}16741 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),1675_ => unreachable!(),1676}1677}1678},1679}1680Ok(())1681}16821683fn atomic_cas(1684&mut self,1685context: &mut CodeGenContext<Emission>,1686addr: Self::Address,1687size: OperandSize,1688flags: MemFlags,1689extend: Option<Extend<Zero>>,1690) -> Result<()> {1691// `cmpxchg` expects `expected` to be in the `*a*` register.1692// reserve rax for the expected argument.1693let rax = context.reg(regs::rax(), self)?;16941695let replacement = context.pop_to_reg(self, None)?;16961697// mark `rax` as allocatable again.1698context.free_reg(rax);1699let expected = context.pop_to_reg(self, Some(regs::rax()))?;17001701self.asm1702.cmpxchg(addr, 
replacement.reg, writable!(expected.reg), size, flags);17031704if let Some(extend) = extend {1705// We don't need to zero-extend from 32 to 64bits.1706if !(extend.from_bits() == 32 && extend.to_bits() == 64) {1707self.asm1708.movzx_rr(expected.reg, writable!(expected.reg), extend);1709}1710}17111712context.stack.push(expected.into());1713context.free_reg(replacement);17141715Ok(())1716}17171718fn v128_eq(1719&mut self,1720dst: WritableReg,1721lhs: Reg,1722rhs: Reg,1723kind: VectorEqualityKind,1724) -> Result<()> {1725self.ensure_has_avx()?;17261727match kind {1728VectorEqualityKind::I8x161729| VectorEqualityKind::I16x81730| VectorEqualityKind::I32x41731| VectorEqualityKind::I64x2 => {1732self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())1733}1734VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {1735self.asm1736.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)1737}1738}1739Ok(())1740}17411742fn v128_ne(1743&mut self,1744dst: WritableReg,1745lhs: Reg,1746rhs: Reg,1747kind: VectorEqualityKind,1748) -> Result<()> {1749self.ensure_has_avx()?;17501751match kind {1752VectorEqualityKind::I8x161753| VectorEqualityKind::I16x81754| VectorEqualityKind::I32x41755| VectorEqualityKind::I64x2 => {1756// Check for equality and invert the results.1757self.asm1758.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1759self.asm1760.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1761self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1762}1763VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {1764self.asm1765.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)1766}1767}1768Ok(())1769}17701771fn v128_lt(1772&mut self,1773dst: WritableReg,1774lhs: Reg,1775rhs: Reg,1776kind: VectorCompareKind,1777) -> Result<()> {1778self.ensure_has_avx()?;17791780match kind {1781VectorCompareKind::I8x16S1782| VectorCompareKind::I16x8S1783| VectorCompareKind::I32x4S1784| VectorCompareKind::I64x2S => {1785// Perform a greater than check with reversed parameters.1786self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())1787}1788VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1789// Set `lhs` to min values, check for equality, then invert the1790// result.1791// If `lhs` is smaller, then equality check will fail and result1792// will be inverted to true. 
Otherwise the equality check will1793// pass and be inverted to false.1794self.asm1795.xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1796self.asm1797.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1798self.asm1799.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1800self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1801}1802VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1803self.asm1804.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)1805}1806}1807Ok(())1808}18091810fn v128_le(1811&mut self,1812dst: WritableReg,1813lhs: Reg,1814rhs: Reg,1815kind: VectorCompareKind,1816) -> Result<()> {1817self.ensure_has_avx()?;18181819match kind {1820VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {1821// Set the `rhs` vector to the signed minimum values and then1822// compare them with `lhs` for equality.1823self.asm1824.xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1825self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1826}1827VectorCompareKind::I64x2S => {1828// Do a greater than check and invert the results.1829self.asm1830.xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1831self.asm1832.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1833self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1834}1835VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1836// Set the `rhs` vector to the signed minimum values and then1837// compare them with `lhs` for equality.1838self.asm1839.xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1840self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1841}1842VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1843self.asm1844.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)1845}1846}1847Ok(())1848}18491850fn v128_gt(1851&mut self,1852dst: WritableReg,1853lhs: Reg,1854rhs: Reg,1855kind: VectorCompareKind,1856) -> Result<()> {1857self.ensure_has_avx()?;18581859match kind {1860VectorCompareKind::I8x16S1861| VectorCompareKind::I16x8S1862| VectorCompareKind::I32x4S1863| VectorCompareKind::I64x2S => {1864self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())1865}1866VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1867// Set `lhs` to max values, check for equality, then invert the1868// result.1869// If `lhs` is larger, then equality check will fail and result1870// will be inverted to true. 
Otherwise the equality check will1871// pass and be inverted to false.1872self.asm1873.xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1874self.asm1875.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1876self.asm1877.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1878self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1879}1880VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1881// Do a less than comparison with the operands swapped.1882self.asm1883.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)1884}1885}1886Ok(())1887}18881889fn v128_ge(1890&mut self,1891dst: WritableReg,1892lhs: Reg,1893rhs: Reg,1894kind: VectorCompareKind,1895) -> Result<()> {1896self.ensure_has_avx()?;18971898match kind {1899VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {1900// Set each lane to maximum value and then compare for equality.1901self.asm1902.xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1903self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1904}1905VectorCompareKind::I64x2S => {1906// Perform a greater than comparison with operands swapped,1907// then invert the results.1908self.asm1909.xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());1910self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());1911self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);1912}1913VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1914// Set lanes to maximum values and compare them for equality.1915self.asm1916.xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1917self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1918}1919VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1920// Perform a less than or equal comparison on swapped operands.1921self.asm1922.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)1923}1924}19251926Ok(())1927}19281929fn fence(&mut self) -> Result<()> {1930self.asm.mfence();1931Ok(())1932}19331934fn v128_not(&mut self, dst: WritableReg) -> Result<()> {1935self.ensure_has_avx()?;19361937self.with_scratch::<FloatScratch, _>(|masm, tmp| {1938// First, we initialize `tmp` with all ones by comparing it with1939// itself.1940masm.asm1941.xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);1942// Then we `xor` tmp and `dst` together, yielding `!dst`.1943masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);1944Ok(())1945})1946}19471948fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1949self.ensure_has_avx()?;1950self.asm.xmm_vpand_rrr(src1, src2, dst);1951Ok(())1952}19531954fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1955self.ensure_has_avx()?;1956self.asm.xmm_vpandn_rrr(src1, src2, dst);1957Ok(())1958}19591960fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1961self.ensure_has_avx()?;1962self.asm.xmm_vpor_rrr(dst, src1, src2);1963Ok(())1964}19651966fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1967self.ensure_has_avx()?;1968self.asm.xmm_vpxor_rrr(src1, src2, dst);1969Ok(())1970}19711972fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {1973self.ensure_has_avx()?;19741975self.with_scratch::<FloatScratch, _>(|masm, tmp| {1976masm.v128_and(src1, mask, tmp.writable())?;1977masm.v128_and_not(mask, src2, dst)?;1978masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;1979Ok(())1980})1981}19821983fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> 
Result<()> {1984self.ensure_has_avx()?;1985self.asm.xmm_vptest(src, src);1986self.asm.setcc(IntCmpKind::Ne, dst);1987Ok(())1988}19891990fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {1991self.ensure_has_avx()?;1992match kind {1993V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),1994V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),1995V128ConvertKind::I32x4U => {1996self.with_scratch::<FloatScratch, _>(|masm, scratch| {1997// Split each 32-bit integer into 16-bit parts.1998// `scratch` will contain the low bits and `dst` will contain1999// the high bits.2000masm.asm2001.xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());2002masm.asm.xmm_vpsrl_rri(2003scratch.inner(),2004scratch.writable(),20050x10,2006kind.src_lane_size(),2007);2008masm.asm2009.xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());20102011// Convert the low bits in `scratch` to floating point numbers.2012masm.asm2013.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);20142015// Prevent overflow by right shifting high bits.2016masm.asm2017.xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());2018// Convert high bits in `dst` to floating point numbers.2019masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);2020// Double high bits in `dst` to reverse right shift.2021masm.asm2022.xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());2023// Add high bits in `dst` to low bits in `scratch`.2024masm.asm.xmm_vaddp_rrr(2025dst.to_reg(),2026scratch.inner(),2027dst,2028kind.src_lane_size(),2029);2030});2031}2032V128ConvertKind::I32x4LowU => {2033// See2034// https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L36682035// for details on the Cranelift AVX implementation.2036// Use `vunpcklp` to create doubles from the integers.2037// Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers2038// creates a byte array for a double that sets the mantissa2039// bits to the original integer value.2040let conversion_constant = self2041.asm2042.add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);2043self.asm2044.xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());2045// Subtract the 0x1.0p52 added above.2046let conversion_constant = self.asm.add_constant(&[20470x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,20480x00, 0x30, 0x43,2049]);2050self.asm.xmm_vsub_rrm(2051dst.to_reg(),2052&conversion_constant,2053dst,2054kind.dst_lane_size(),2055);2056}2057}2058Ok(())2059}20602061fn v128_narrow(2062&mut self,2063src1: Reg,2064src2: Reg,2065dst: WritableReg,2066kind: V128NarrowKind,2067) -> Result<()> {2068self.ensure_has_avx()?;2069match kind {2070V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {2071self.asm2072.xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())2073}2074V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {2075self.asm2076.xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())2077}2078}2079Ok(())2080}20812082fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {2083self.ensure_has_avx()?;2084self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);2085Ok(())2086}20872088fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {2089self.ensure_has_avx()?;2090self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);2091Ok(())2092}20932094fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: 
V128ExtendKind) -> Result<()> {2095self.ensure_has_avx()?;2096match kind {2097V128ExtendKind::LowI8x16S2098| V128ExtendKind::LowI8x16U2099| V128ExtendKind::LowI16x8S2100| V128ExtendKind::LowI16x8U2101| V128ExtendKind::LowI32x4S2102| V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),2103V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {2104self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);2105self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());2106}2107V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {2108self.with_scratch::<FloatScratch, _>(|masm, scratch| {2109masm.asm2110.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());2111masm.asm2112.xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());2113});2114}2115V128ExtendKind::HighI32x4S => {2116// Move the 3rd element (i.e., 0b10) to the 1st (rightmost)2117// position and the 4th element (i.e., 0b11) to the 2nd (second2118// from the right) position and then perform the extend.2119self.asm2120.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());2121self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());2122}2123V128ExtendKind::HighI32x4U => {2124self.with_scratch::<FloatScratch, _>(|masm, scratch| {2125// Set `scratch` to a vector 0s.2126masm.asm.xmm_vxorp_rrr(2127scratch.inner(),2128scratch.inner(),2129scratch.writable(),2130kind.src_lane_size(),2131);2132// Interleave the 0 bits into the two 32-bit integers to zero extend them.2133masm.asm2134.xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());2135});2136}2137}2138Ok(())2139}21402141fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {2142self.ensure_has_avx()?;2143match kind {2144V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),2145V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),2146V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),2147V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),2148V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),2149V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),2150V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),2151V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),2152V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),2153V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),2154};2155Ok(())2156}21572158fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {2159self.ensure_has_avx()?;2160match kind {2161V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),2162V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),2163V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),2164V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),2165V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),2166V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),2167V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),2168V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),2169V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),2170V128SubKind::I64x2 => 
                self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
        };
        Ok(())
    }

    fn v128_mul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128MulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let rhs = context.pop_to_reg(self, None)?;
        let lhs = context.pop_to_reg(self, None)?;

        let mul_i64x2_avx512 = |this: &mut Self| {
            this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
        };

        let mul_i64x2_fallback = |this: &mut Self,
                                  context: &mut CodeGenContext<Emission>|
         -> Result<()> {
            // Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we
            // have to fall back to an instruction sequence using 32-bit multiplications (taken
            // from the Cranelift implementation, in `isa/x64/lower.isle`):
            //
            // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
            // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
            // > multiplication can then be written as:
            //
            // >      Ah Al
            // >    * Bh Bl
            // >      -----
            // >      Al * Bl
            // >    + (Ah * Bl) << 32
            // >    + (Al * Bh) << 32
            //
            // > So for each lane we will compute:
            //
            // > A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
            //
            // > Note, the algorithm will use `pmuludq` which operates directly on the lower
            // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
            // > the lane of the destination. For this reason we don't need shifts to isolate
            // > the lower 32-bits, however, we will need to use shifts to isolate the high
            // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.

            let tmp2 = context.any_fpr(this)?;
            this.with_scratch::<FloatScratch, _>(|this, tmp1| {
                // tmp1 = lhs_hi = (lhs >> 32)
                this.asm
                    .xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
                this.asm
                    .xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));

                // tmp1 = rhs_hi = rhs >> 32
                this.asm
                    .xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp1 = lhs_low * rhs_high = tmp1 * lhs
                this.asm
                    .xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());

                // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);

                // tmp1 = tmp1 << 32
                this.asm
                    .xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_lo * rhs_lo
                this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));

                // Finally, with `lhs` as destination:
                // lhs = (lhs_low * rhs_low) + ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
            });

            context.free_reg(tmp2);

            Ok(())
        };

        match kind {
            V128MulKind::F32x4 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            V128MulKind::F64x2 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
            }
            V128MulKind::I16x8 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
            }
            V128MulKind::I32x4 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            // This is the fast path when AVX512 is available.
            V128MulKind::I64x2
                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
            {
                mul_i64x2_avx512(self)
            }
            //
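            // As an illustrative worked example of the long-hand formula used by the fallback
            // below (values chosen here for the example, not taken from the lowering): for a
            // lane A = 0x0000_0001_0000_0002 (Ah = 1, Al = 2) and B = 3 (Bh = 0, Bl = 3),
            // Al * Bl = 6 and ((Ah * Bl) + (Al * Bh)) << 32 = 3 << 32, so the sum is
            // 0x0000_0003_0000_0006, which is exactly A * B (4_294_967_298 * 3 = 12_884_901_894).
            //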
Otherwise, we emit AVX fallback sequence.2281V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,2282}22832284context.stack.push(lhs.into());2285context.free_reg(rhs);22862287Ok(())2288}22892290fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {2291self.ensure_has_avx()?;22922293match kind {2294V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {2295self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())2296}2297V128AbsKind::I64x2 => {2298self.with_scratch::<FloatScratch, _>(|masm, scratch| {2299// Perform an arithmetic right shift of 31 bits. If the number2300// is positive, this will result in all zeroes in the upper2301// 32-bits. If the number is negative, this will result in all2302// ones in the upper 32-bits.2303masm.asm2304.xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);2305// Copy the ones and zeroes in the high bits of each 64-bit2306// lane to the low bits of each 64-bit lane.2307masm.asm.xmm_vpshuf_rr(2308scratch.inner(),2309scratch.writable(),23100b11_11_01_01,2311OperandSize::S32,2312);2313// Flip the bits in lanes that were negative in `src` and leave2314// the positive lanes as they are. Positive lanes will have a2315// zero mask in `scratch` so xor doesn't affect them.2316masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);2317// Subtract the mask from the results of xor which will2318// complete the two's complement for lanes which were negative.2319masm.asm2320.xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2321});2322}2323V128AbsKind::F32x4 | V128AbsKind::F64x2 => {2324self.with_scratch::<FloatScratch, _>(|masm, scratch| {2325// Create a mask of all ones.2326masm.asm.xmm_vpcmpeq_rrr(2327scratch.writable(),2328scratch.inner(),2329scratch.inner(),2330kind.lane_size(),2331);2332// Right shift the mask so each lane is a single zero followed2333// by all ones.2334masm.asm.xmm_vpsrl_rri(2335scratch.inner(),2336scratch.writable(),23370x1,2338kind.lane_size(),2339);2340// Use the mask to zero the sign bit in each lane which will2341// make the float value positive.2342masm.asm2343.xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());2344});2345}2346}2347Ok(())2348}23492350fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {2351self.ensure_has_avx()?;23522353match kind {2354V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {2355self.with_scratch::<FloatScratch, _>(|masm, tmp| {2356masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;2357masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;2358anyhow::Ok(())2359})?;2360}2361V128NegKind::F32x4 | V128NegKind::F64x2 => {2362self.with_scratch::<FloatScratch, _>(|masm, tmp| {2363// Create a mask of all 1s.2364masm.asm.xmm_vpcmpeq_rrr(2365tmp.writable(),2366tmp.inner(),2367tmp.inner(),2368kind.lane_size(),2369);2370// Left shift the lanes in the mask so only the sign bit in the2371// mask is set to 1.2372masm.asm.xmm_vpsll_rri(2373tmp.inner(),2374tmp.writable(),2375(kind.lane_size().num_bits() - 1) as u32,2376kind.lane_size(),2377);2378// Use the mask to flip the sign bit.2379masm.asm2380.xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());2381});2382}2383}2384Ok(())2385}23862387fn v128_shift(2388&mut self,2389context: &mut CodeGenContext<Emission>,2390lane_width: OperandSize,2391kind: ShiftKind,2392) -> Result<()> {2393self.ensure_has_avx()?;2394let shift_amount = context.pop_to_reg(self, None)?.reg;2395let operand = context.pop_to_reg(self, None)?.reg;2396let amount_mask = 
lane_width.num_bits() - 1;23972398self.and(2399writable!(shift_amount),2400shift_amount,2401RegImm::i32(amount_mask as i32),2402OperandSize::S32,2403)?;24042405self.with_scratch::<IntScratch, _>(|masm, tmp| {2406masm.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2407let move_to_tmp_xmm = |this: &mut Self| {2408this.asm2409.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);2410};24112412// A helper for deciding between `vpsllw` and `vpsrlw` in2413// `shift_i8x16`.2414enum Direction {2415Left,2416Right,2417}24182419let shift_i8x16 = |this: &mut Self, masks: &'static [u8], direction: Direction| {2420// The case for i8x16 is a little bit trickier because x64 doesn't provide a 8bit2421// shift instruction. Instead, we shift as 16bits, and then mask the bits in the2422// 8bits lane, for example (with 2 8bits lanes):2423// - Before shifting:2424// 01001101 111011102425// - shifting by 2 left:2426// 00110111 101110002427// ^^_ these bits come from the previous byte, and need to be masked.2428// - The mask:2429// 11111100 111111112430// - After masking:2431// 00110100 101110002432//2433// The mask is loaded from a well known memory, depending on the shift amount.24342435this.asm2436.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);24372438// Perform the 16-bit shift.2439match direction {2440Direction::Left => this.asm.xmm_vpsll_rrr(2441operand,2442tmp_xmm.inner(),2443writable!(operand),2444OperandSize::S16,2445),2446Direction::Right => this.asm.xmm_vpsrl_rrr(2447operand,2448tmp_xmm.inner(),2449writable!(operand),2450OperandSize::S16,2451),2452}24532454// Get a handle to the masks array constant.2455let masks_addr = this.asm.add_constant(masks);24562457// Load the masks array effective address into the tmp register.2458this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);24592460// Compute the offset of the mask that we need to use. This is shift_amount * 16 ==2461// shift_amount << 4.2462this.asm2463.shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);24642465// Load the mask to tmp_xmm.2466this.asm.xmm_vmovdqu_mr(2467&Address::ImmRegRegShift {2468simm32: 0,2469base: tmp.inner(),2470index: shift_amount,2471shift: 0,2472},2473tmp_xmm.writable(),2474MemFlags::trusted(),2475);24762477// Mask unwanted bits from operand.2478this.asm2479.xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));2480};24812482let i64x2_shr_s = |this: &mut Self,2483context: &mut CodeGenContext<Emission>|2484-> Result<()> {2485const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;24862487// AVX doesn't have an instruction for i64x2 signed right shift. 
Instead we use the2488// following formula (from hacker's delight 2-7), where x is the value and n the shift2489// amount, for each lane:2490// t = (1 << 63) >> n; ((x >> n) ^ t) - t24912492// We need an extra scratch register:2493let tmp_xmm2 = context.any_fpr(this)?;24942495this.asm2496.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);24972498let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());24992500this.asm2501.xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());2502this.asm.xmm_vpsrl_rrr(2503tmp_xmm2,2504tmp_xmm.inner(),2505writable!(tmp_xmm2),2506OperandSize::S64,2507);2508this.asm.xmm_vpsrl_rrr(2509operand,2510tmp_xmm.inner(),2511writable!(operand),2512OperandSize::S64,2513);2514this.asm2515.xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));2516this.asm2517.xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);25182519context.free_reg(tmp_xmm2);25202521Ok(())2522};25232524let i8x16_shr_s = |this: &mut Self,2525context: &mut CodeGenContext<Emission>|2526-> Result<()> {2527// Since the x86 instruction set does not have an 8x16 shift instruction and the2528// approach used for `ishl` and `ushr` cannot be easily used (the masks do not2529// preserve the sign), we use a different approach here: separate the low and2530// high lanes, shift them separately, and merge them into the final result.2531//2532// Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,2533// s15]:2534//2535// lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]2536// shifted_lo.i16x8 = shift each lane of `low`2537// hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]2538// shifted_hi.i16x8 = shift each lane of `high`2539// result = [s0'', s1'', ..., s15'']25402541// In order for `packsswb` later to only use the high byte of each2542// 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to2543// fill in the upper bits appropriately.2544this.asm2545.add_ir(8, writable!(shift_amount), OperandSize::S32);2546this.asm2547.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);25482549let tmp_lo = context.any_fpr(this)?;2550let tmp_hi = context.any_fpr(this)?;25512552// Extract lower and upper bytes.2553this.asm2554.xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);2555this.asm2556.xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);25572558// Perform 16bit right shift of upper and lower bytes.2559this.asm.xmm_vpsra_rrr(2560tmp_lo,2561tmp_xmm.inner(),2562writable!(tmp_lo),2563OperandSize::S16,2564);2565this.asm.xmm_vpsra_rrr(2566tmp_hi,2567tmp_xmm.inner(),2568writable!(tmp_hi),2569OperandSize::S16,2570);25712572// Merge lower and upper bytes back.2573this.asm2574.xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);25752576context.free_reg(tmp_lo);2577context.free_reg(tmp_hi);25782579Ok(())2580};25812582match (lane_width, kind) {2583// shl2584(OperandSize::S8, ShiftKind::Shl) => {2585shift_i8x16(masm, &I8X16_ISHL_MASKS, Direction::Left)2586}2587(OperandSize::S16, ShiftKind::Shl) => {2588move_to_tmp_xmm(masm);2589masm.asm.xmm_vpsll_rrr(2590operand,2591tmp_xmm.inner(),2592writable!(operand),2593OperandSize::S16,2594);2595}2596(OperandSize::S32, ShiftKind::Shl) => {2597move_to_tmp_xmm(masm);2598masm.asm.xmm_vpsll_rrr(2599operand,2600tmp_xmm.inner(),2601writable!(operand),2602OperandSize::S32,2603);2604}2605(OperandSize::S64, ShiftKind::Shl) => 
{2606move_to_tmp_xmm(masm);2607masm.asm.xmm_vpsll_rrr(2608operand,2609tmp_xmm.inner(),2610writable!(operand),2611OperandSize::S64,2612);2613}2614// shr_u2615(OperandSize::S8, ShiftKind::ShrU) => {2616shift_i8x16(masm, &I8X16_USHR_MASKS, Direction::Right)2617}2618(OperandSize::S16, ShiftKind::ShrU) => {2619move_to_tmp_xmm(masm);2620masm.asm.xmm_vpsrl_rrr(2621operand,2622tmp_xmm.inner(),2623writable!(operand),2624OperandSize::S16,2625);2626}2627(OperandSize::S32, ShiftKind::ShrU) => {2628move_to_tmp_xmm(masm);2629masm.asm.xmm_vpsrl_rrr(2630operand,2631tmp_xmm.inner(),2632writable!(operand),2633OperandSize::S32,2634);2635}2636(OperandSize::S64, ShiftKind::ShrU) => {2637move_to_tmp_xmm(masm);2638masm.asm.xmm_vpsrl_rrr(2639operand,2640tmp_xmm.inner(),2641writable!(operand),2642OperandSize::S64,2643);2644}2645// shr_s2646(OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(masm, context)?,2647(OperandSize::S16, ShiftKind::ShrS) => {2648move_to_tmp_xmm(masm);2649masm.asm.xmm_vpsra_rrr(2650operand,2651tmp_xmm.inner(),2652writable!(operand),2653OperandSize::S16,2654);2655}2656(OperandSize::S32, ShiftKind::ShrS) => {2657move_to_tmp_xmm(masm);2658masm.asm.xmm_vpsra_rrr(2659operand,2660tmp_xmm.inner(),2661writable!(operand),2662OperandSize::S32,2663);2664}2665(OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(masm, context)?,26662667_ => bail!(CodeGenError::invalid_operand_combination()),2668}26692670Ok(())2671})2672})?;26732674context.free_reg(shift_amount);2675context2676.stack2677.push(TypedReg::new(WasmValType::V128, operand).into());2678Ok(())2679}26802681fn v128_q15mulr_sat_s(2682&mut self,2683lhs: Reg,2684rhs: Reg,2685dst: WritableReg,2686size: OperandSize,2687) -> Result<()> {2688self.ensure_has_avx()?;26892690self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);26912692// Need to handle edge case of multiplying -1 by -1 (0x8000 in Q152693// format) because of how `vpmulhrs` handles rounding. 
`vpmulhrs`2694// produces 0x8000 in that case when the correct result is 0x7FFF (that2695// is, +1) so need to check if the result is 0x8000 and flip the bits2696// of the result if it is.2697let address = self.asm.add_constant(&[26980x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,26990x00, 0x80,2700]);2701self.asm2702.xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);2703self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);2704Ok(())2705}27062707fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2708self.ensure_has_avx()?;27092710self.with_scratch::<FloatScratch, _>(|masm, scratch| {2711// Create a mask of all 0s.2712masm.asm2713.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());2714// Sets lane in `dst` to not zero if `src` lane was zero, and lane in2715// `dst` to zero if `src` lane was not zero.2716masm.asm2717.xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);2718// Sets ZF if all values are zero (i.e., if all original values were not zero).2719masm.asm.xmm_vptest(src, src);2720// Set byte if ZF=1.2721});2722self.asm.setcc(IntCmpKind::Eq, dst);2723Ok(())2724}27252726fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2727self.ensure_has_avx()?;27282729match size {2730OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),2731OperandSize::S16 => {2732// Signed conversion of 16-bit integers to 8-bit integers.2733self.asm2734.xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);2735// Creates a mask from each byte in `src`.2736self.asm2737.xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);2738// Removes 8 bits added as a result of the `vpackss` step.2739self.asm2740.shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);2741}2742OperandSize::S32 | OperandSize::S64 => {2743self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)2744}2745_ => unimplemented!(),2746}27472748Ok(())2749}27502751fn v128_trunc(2752&mut self,2753context: &mut CodeGenContext<Emission>,2754kind: V128TruncKind,2755) -> Result<()> {2756self.ensure_has_avx()?;27572758let reg = writable!(context.pop_to_reg(self, None)?.reg);2759match kind {2760V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(2761reg.to_reg(),2762reg,2763VroundMode::TowardZero,2764kind.dst_lane_size(),2765),2766V128TruncKind::I32x4FromF32x4S => {2767self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;2768}2769V128TruncKind::I32x4FromF32x4U => {2770let temp_reg = writable!(context.any_fpr(self)?);2771self.v128_trunc_sat_f32x4_u(2772reg,2773temp_reg,2774kind.src_lane_size(),2775kind.dst_lane_size(),2776)?;2777context.free_reg(temp_reg.to_reg());2778}2779V128TruncKind::I32x4FromF64x2SZero => {2780self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;2781}2782V128TruncKind::I32x4FromF64x2UZero => {2783self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;2784}2785}27862787context.stack.push(TypedReg::v128(reg.to_reg()).into());2788Ok(())2789}27902791fn v128_min(2792&mut self,2793src1: Reg,2794src2: Reg,2795dst: WritableReg,2796kind: V128MinKind,2797) -> Result<()> {2798self.ensure_has_avx()?;27992800match kind {2801V128MinKind::I8x16S2802| V128MinKind::I8x16U2803| V128MinKind::I16x8S2804| V128MinKind::I16x8U2805| V128MinKind::I32x4S2806| V128MinKind::I32x4U => {2807match kind {2808V128MinKind::I8x16S => {2809self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)2810}2811V128MinKind::I8x16U 
=> {2812self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)2813}2814V128MinKind::I16x8S => {2815self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)2816}2817V128MinKind::I16x8U => {2818self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)2819}2820V128MinKind::I32x4S => {2821self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)2822}2823V128MinKind::I32x4U => {2824self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)2825}2826_ => unreachable!(),2827};2828}2829V128MinKind::F32x4 | V128MinKind::F64x2 => {2830self.with_scratch::<FloatScratch, _>(|masm, scratch| {2831// Handling +0 and -0 as well as NaN values are not commutative2832// when using `vminp` so we have to compensate.2833// Perform two comparison operations with the operands swapped2834// and OR the result to propagate 0 (positive and negative) and2835// NaN.2836masm.asm2837.xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());2838masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());2839// Use a single OR instruction to set the sign bit if either2840// result has the sign bit set to correctly propagate -0.2841masm.asm2842.xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2843});2844// Set lanes with NaN to all 1s.2845self.asm.xmm_vcmpp_rrr(2846writable!(src2),2847src2,2848dst.to_reg(),2849kind.lane_size(),2850VcmpKind::Unord,2851);2852// Doesn't change non-NaN values. For NaN values, sets all bits.2853self.asm2854.xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());2855self.canonicalize_nans(writable!(src2), dst, kind.lane_size());2856}2857}28582859Ok(())2860}28612862fn v128_max(2863&mut self,2864src1: Reg,2865src2: Reg,2866dst: WritableReg,2867kind: V128MaxKind,2868) -> Result<()> {2869self.ensure_has_avx()?;28702871match kind {2872V128MaxKind::I8x16S2873| V128MaxKind::I8x16U2874| V128MaxKind::I16x8S2875| V128MaxKind::I16x8U2876| V128MaxKind::I32x4S2877| V128MaxKind::I32x4U => {2878match kind {2879V128MaxKind::I8x16S => {2880self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)2881}2882V128MaxKind::I8x16U => {2883self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)2884}2885V128MaxKind::I16x8S => {2886self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)2887}2888V128MaxKind::I16x8U => {2889self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)2890}2891V128MaxKind::I32x4S => {2892self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)2893}2894V128MaxKind::I32x4U => {2895self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)2896}2897_ => unreachable!(),2898};2899}2900V128MaxKind::F32x4 | V128MaxKind::F64x2 => {2901self.with_scratch::<FloatScratch, _>(|masm, scratch| {2902// Handling +0 and -0 as well as NaN values are not commutative2903// when using `vmaxp` so we have to compensate.2904// Perform two comparison operations with the operands swapped2905// so we can propagate 0 (positive and negative) and NaNs2906// correctly.29072908masm.asm2909.xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());2910masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());2911// This combination of XOR, OR, and SUB will set the sign bit2912// on a 0 result to the correct value for a max operation.2913masm.asm2914.xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2915masm.asm.xmm_vorp_rrr(2916dst.to_reg(),2917scratch.inner(),2918writable!(src2),2919kind.lane_size(),2920);2921});2922self.asm2923.xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());2924// Set lanes of NaN values to 
                // all 1s.
                self.asm.xmm_vcmpp_rrr(
                    writable!(src2),
                    src2,
                    src2,
                    kind.lane_size(),
                    VcmpKind::Unord,
                );
                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
            }
        }
        Ok(())
    }

    fn v128_extmul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128ExtMulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        // The implementation for extmul is not optimized; for simplicity's sake, we simply perform
        // an extension followed by a multiplication using already implemented primitives.

        let src1 = context.pop_to_reg(self, None)?;
        let src2 = context.pop_to_reg(self, None)?;

        let ext_kind = kind.into();
        self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
        self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;

        context.stack.push(src2.into());
        context.stack.push(src1.into());

        self.v128_mul(context, kind.into())
    }

    fn v128_extadd_pairwise(
        &mut self,
        src: Reg,
        dst: WritableReg,
        kind: V128ExtAddKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128ExtAddKind::I8x16S => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
                    // sign extend `src` to 16 bits and add adjacent words.
                    // Need to supply constant as first operand since first operand
                    // is treated as unsigned and the second operand is signed.
                    let mask = masm.asm.add_constant(&[1; 16]);
                    masm.asm.xmm_mov_mr(
                        &mask,
                        scratch.writable(),
                        OperandSize::S128,
                        MemFlags::trusted(),
                    );
                    masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);
                });
            }
            V128ExtAddKind::I8x16U => {
                // Same approach as the signed variant but treat `src` as
                // unsigned instead of signed by passing it as the first
                // operand.
                let mask = self.asm.add_constant(&[1; 16]);
                self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);
            }
            V128ExtAddKind::I16x8S => {
                // Similar approach to the two variants above. The vector is 8
                // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
                // signed.
                let mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
            }
            V128ExtAddKind::I16x8U => {
                // Similar approach to the signed variant.
                // `vpmaddwd` operates on signed integers and the operand is
                // unsigned so the operand needs to be converted to a signed
                // format and then that process needs to be reversed after
                // `vpmaddwd`.
                // Flip the sign bit for 8 16-bit lanes.
                let xor_mask = self.asm.add_constant(&[
                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
                    0x80, 0x00, 0x80,
                ]);
                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);

                let madd_mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);

                // Reverse the XOR.
The XOR effectively subtracts 32,768 from3021// both pairs that are added together so 65,536 (0x10000)3022// needs to be added to 4 lanes of 32-bit values.3023let add_mask = self3024.asm3025.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);3026self.asm3027.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);3028}3029}3030Ok(())3031}30323033fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {3034self.ensure_has_avx()?;3035self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);3036Ok(())3037}30383039fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {3040self.ensure_has_avx()?;30413042let reg = writable!(context.pop_to_reg(self, None)?.reg);30433044// This works by using a lookup table to determine the count of bits3045// set in the upper 4 bits and lower 4 bits separately and then adding3046// the counts.30473048// A mask to zero out the upper 4 bits in each lane.3049let address = self.asm.add_constant(&[30500x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,30510x0F, 0x0F,3052]);30533054self.with_scratch::<FloatScratch, _>(|masm, scratch| {3055// Zero out the upper 4 bits of each lane.3056masm.asm3057.xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());3058// Right shift bytes in input by 4 bits to put the upper 4 bits in the3059// lower 4 bits.3060masm.asm3061.xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);3062// Zero out the upper 4 bits of each shifted lane.3063masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);30643065// Write a lookup table of 4 bit values to number of bits set to a3066// register so we only perform the memory read once.3067// Index (hex) | Value (binary) | Population Count3068// 0x0 | 0000 | 03069// 0x1 | 0001 | 13070// 0x2 | 0010 | 13071// 0x3 | 0011 | 23072// 0x4 | 0100 | 13073// 0x5 | 0101 | 23074// 0x6 | 0110 | 23075// 0x7 | 0111 | 33076// 0x8 | 1000 | 13077// 0x9 | 1001 | 23078// 0xA | 1010 | 23079// 0xB | 1011 | 33080// 0xC | 1100 | 23081// 0xD | 1101 | 33082// 0xE | 1110 | 33083// 0xF | 1111 | 43084let address = masm.asm.add_constant(&[30850x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,3086]);3087let reg2 = writable!(context.any_fpr(masm)?);3088masm.asm3089.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());3090// Use the upper 4 bits as an index into the lookup table.3091masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());3092// Use the lower 4 bits as an index into the lookup table.3093masm.asm3094.xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());3095context.free_reg(reg2.to_reg());30963097// Add the counts of the upper 4 bits and the lower 4 bits to get the3098// total number of bits set.3099masm.asm3100.xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);3101anyhow::Ok(())3102})?;31033104context.stack.push(TypedReg::v128(reg.to_reg()).into());3105Ok(())3106}31073108fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3109self.ensure_has_avx()?;3110self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);3111Ok(())3112}31133114fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3115self.ensure_has_avx()?;3116self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);3117Ok(())3118}31193120fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3121self.ensure_has_avx()?;3122self.asm.xmm_vsqrtp_rr(src, dst, size);3123Ok(())3124}31253126fn v128_ceil(&mut self, src: Reg, dst: 
WritableReg, size: OperandSize) -> Result<()> {3127self.ensure_has_avx()?;3128self.asm3129.xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);3130Ok(())3131}31323133fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3134self.ensure_has_avx()?;3135self.asm3136.xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);3137Ok(())3138}31393140fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3141self.ensure_has_avx()?;3142self.asm3143.xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);3144Ok(())3145}31463147fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3148self.ensure_has_avx()?;3149// Reverse operands since Wasm specifies returning the first operand if3150// either operand is NaN while x86 returns the second operand.3151self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);3152Ok(())3153}31543155fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3156self.ensure_has_avx()?;3157// Reverse operands since Wasm specifies returning the first operand if3158// either operand is NaN while x86 returns the second operand.3159self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);3160Ok(())3161}3162}31633164impl MacroAssembler {3165/// Create an x64 MacroAssembler.3166pub fn new(3167ptr_size: impl PtrSize,3168shared_flags: settings::Flags,3169isa_flags: x64_settings::Flags,3170) -> Result<Self> {3171let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());31723173Ok(Self {3174sp_offset: 0,3175sp_max: 0,3176stack_max_use_add: None,3177asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),3178flags: isa_flags,3179shared_flags,3180ptr_size: ptr_type.try_into()?,3181scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),3182})3183}31843185/// Add the maximum stack used to a register, recording an obligation to update the3186/// add-with-immediate instruction emitted to use the real stack max when the masm is being3187/// finalized.3188fn add_stack_max(&mut self, reg: Reg) {3189assert!(self.stack_max_use_add.is_none());3190let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);3191self.stack_max_use_add.replace(patch);3192}31933194fn ensure_has_avx(&self) -> Result<()> {3195anyhow::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);3196Ok(())3197}31983199fn ensure_has_avx2(&self) -> Result<()> {3200anyhow::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);3201Ok(())3202}32033204fn ensure_has_avx512vl(&self) -> Result<()> {3205anyhow::ensure!(3206self.flags.has_avx512vl(),3207CodeGenError::UnimplementedForNoAvx512VL3208);3209Ok(())3210}32113212fn ensure_has_avx512dq(&self) -> Result<()> {3213anyhow::ensure!(3214self.flags.has_avx512dq(),3215CodeGenError::UnimplementedForNoAvx512DQ3216);3217Ok(())3218}32193220fn increment_sp(&mut self, bytes: u32) {3221self.sp_offset += bytes;32223223// NOTE: we use `max` here to track the largest stack allocation in `sp_max`. 
Once we have3224// seen the entire function, this value will represent the maximum size for the stack3225// frame.3226self.sp_max = self.sp_max.max(self.sp_offset);3227}32283229fn decrement_sp(&mut self, bytes: u32) {3230assert!(3231self.sp_offset >= bytes,3232"sp offset = {}; bytes = {}",3233self.sp_offset,3234bytes3235);3236self.sp_offset -= bytes;3237}32383239fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {3240match constant {3241I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),3242I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),3243I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3244I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3245I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3246}3247}32483249/// A common implementation for zero-extend stack loads.3250fn load_impl(3251&mut self,3252src: Address,3253dst: WritableReg,3254size: OperandSize,3255flags: MemFlags,3256) -> Result<()> {3257if dst.to_reg().is_int() {3258let ext = size.extend_to::<Zero>(OperandSize::S64);3259self.asm.movzx_mr(&src, dst, ext, flags);3260} else {3261self.asm.xmm_mov_mr(&src, dst, size, flags);3262}32633264Ok(())3265}32663267/// A common implementation for stack stores.3268fn store_impl(3269&mut self,3270src: RegImm,3271dst: Address,3272size: OperandSize,3273flags: MemFlags,3274) -> Result<()> {3275let _ = match src {3276RegImm::Imm(imm) => match imm {3277I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),3278I::I64(v) => match v.try_into() {3279Ok(v) => self.asm.mov_im(v, &dst, size, flags),3280Err(_) => {3281// If the immediate doesn't sign extend, use a scratch3282// register.3283self.with_scratch::<IntScratch, _>(|masm, scratch| {3284masm.asm.mov_ir(v, scratch.writable(), size);3285masm.asm.mov_rm(scratch.inner(), &dst, size, flags);3286});3287}3288},3289I::F32(v) => {3290let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3291self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3292// Always trusted, since we are loading the constant from3293// the constant pool.3294masm.asm.xmm_mov_mr(3295&addr,3296float_scratch.writable(),3297size,3298MemFlags::trusted(),3299);3300masm.asm3301.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3302});3303}3304I::F64(v) => {3305let addr = self.asm.add_constant(v.to_le_bytes().as_slice());33063307self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3308// Similar to above, always trusted since we are loading the3309// constant from the constant pool.3310masm.asm.xmm_mov_mr(3311&addr,3312float_scratch.writable(),3313size,3314MemFlags::trusted(),3315);3316masm.asm3317.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3318});3319}3320I::V128(v) => {3321let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3322self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {3323// Always trusted, since we are loading the constant from3324// the constant pool.3325masm.asm.xmm_mov_mr(3326&addr,3327vector_scratch.writable(),3328size,3329MemFlags::trusted(),3330);3331masm.asm3332.xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);3333});3334}3335},3336RegImm::Reg(reg) => {3337if reg.is_int() {3338self.asm.mov_rm(reg, &dst, size, flags);3339} else {3340self.asm.xmm_mov_rm(reg, &dst, size, flags);3341}3342}3343};3344Ok(())3345}33463347fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {3348if dst != lhs {3349Err(anyhow!(CodeGenError::invalid_two_arg_form()))3350} else {3351Ok(())3352}3353}33543355/// 
    /// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
    fn vpshuf_mask_for_64_bit_splats() -> u8 {
        // Results in the first 4 bytes and second 4 bytes being
        // swapped and then the swapped bytes being copied.
        // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
        // [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
        0b01_00_01_00
    }

    fn v128_trunc_sat_f32x4_s(
        &mut self,
        reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Create a mask to handle NaN values (1 for not NaN, 0 for
            // NaN).
            masm.asm.xmm_vcmpp_rrr(
                scratch.writable(),
                reg.to_reg(),
                reg.to_reg(),
                src_lane_size,
                VcmpKind::Eq,
            );
            // Zero out any NaN values.
            masm.asm
                .xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a mask for the sign bits.
            masm.asm
                .xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
            // Convert floats to integers.
            masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
            // Apply sign mask to the converted integers.
            masm.asm
                .xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
            // Create a saturation mask of all 1s for negative numbers,
            // all 0s for positive numbers. The arithmetic shift will copy
            // the sign bit.
            masm.asm
                .xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
            // Combine converted integers with saturation mask.
            masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
            Ok(())
        })
    }

    fn v128_trunc_sat_f32x4_u(
        &mut self,
        reg: WritableReg,
        temp_reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Set scratch to all zeros.
            masm.asm.xmm_vxorp_rrr(
                reg.to_reg(),
                reg.to_reg(),
                scratch.writable(),
                src_lane_size,
            );
            // Clamp negative numbers to 0.
            masm.asm
                .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a vector of all 1s.
            masm.asm.xmm_vpcmpeq_rrr(
                scratch.writable(),
                scratch.inner(),
                scratch.inner(),
                src_lane_size,
            );
            // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
            // performing a logical shift right.
            masm.asm
                .xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
            // Convert max signed int to float as a reference point for saturation.
            masm.asm
                .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
            // Convert the floats to integers and put the results in `temp_reg`.
            // This is signed and not unsigned so we need to handle the
            // value for the high bit in each lane.
            masm.asm
                .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
            // Set `reg` lanes to the amount that the value in the lane
            // exceeds the maximum signed 32-bit integer.
            masm.asm
                .xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
            // Create mask in `scratch` for numbers that are larger than
            // the maximum signed 32-bit integer.
Lanes that don't fit3445// in 32-bits ints will be 1.3446masm.asm.xmm_vcmpp_rrr(3447scratch.writable(),3448scratch.inner(),3449reg.to_reg(),3450dst_lane_size,3451VcmpKind::Le,3452);3453// Convert the excess over signed 32-bits from floats to integers.3454masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);3455// Apply large number mask to excess values which will flip the3456// bits in any lanes that exceed signed 32-bits. Adding this3457// flipped value to the signed value will set the high bit and3458// the carry behavior will update the other bits correctly.3459masm.asm3460.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());3461// Set `reg` to all 0s.3462masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);3463// Ensure excess values are not negative by taking max b/w3464// excess values and zero.3465masm.asm3466.xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);3467});3468// Perform the addition between the signed conversion value (in3469// `reg2`) and the flipped excess value (in `reg`) to get the3470// unsigned value.3471self.asm3472.xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);3473Ok(())3474}34753476fn v128_trunc_sat_f64x2_s_zero(3477&mut self,3478reg: WritableReg,3479src_lane_size: OperandSize,3480) -> Result<()> {3481self.with_scratch::<FloatScratch, _>(|masm, scratch| {3482// Create a NaN mask (1s for non-NaN, 0s for NaN).3483masm.asm.xmm_vcmpp_rrr(3484scratch.writable(),3485reg.to_reg(),3486reg.to_reg(),3487src_lane_size,3488VcmpKind::Eq,3489);3490// Clamp NaN values to maximum 64-bit float that can be3491// converted to an i32.3492let address = masm.asm.add_constant(&[34930x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,34940xDF, 0x41,3495]);3496masm.asm3497.xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);3498// Handle the saturation for values too large to fit in an i32.3499masm.asm3500.xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3501// Convert the floats to integers.3502masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);35033504Ok(())3505})3506}35073508fn v128_trunc_sat_f64x2_u_zero(3509&mut self,3510reg: WritableReg,3511src_lane_size: OperandSize,3512dst_lane_size: OperandSize,3513) -> Result<()> {3514self.with_scratch::<FloatScratch, _>(|masm, scratch| {3515// Zero out the scratch register.3516masm.asm.xmm_vxorp_rrr(3517scratch.inner(),3518scratch.inner(),3519scratch.writable(),3520src_lane_size,3521);3522// Clamp negative values to zero.3523masm.asm3524.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3525// Clamp value to maximum unsigned 32-bit integer value3526// (0x41F0000000000000).3527let address = masm.asm.add_constant(&[35280x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,35290xEF, 0x41,3530]);3531masm.asm3532.xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);3533// Truncate floating point values.3534masm.asm3535.xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);3536// Add 2^52 (doubles store 52 bits in their mantissa) to each3537// lane causing values in the lower bits to be shifted into3538// position for integer conversion.3539let address = masm.asm.add_constant(&[35400x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,35410x30, 0x43,3542]);3543masm.asm3544.xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);3545// Takes lanes 0 and 2 from `reg` (converted values) and lanes3546// 0 and 2 from 
            // `scratch` (zeroes) to put the converted ints in
            // the lower lanes and zeroes in the upper lanes.
            masm.asm.xmm_vshufp_rrri(
                reg.to_reg(),
                scratch.inner(),
                reg,
                0b10_00_10_00,
                dst_lane_size,
            );
            Ok(())
        })
    }

    /// Given a vector of floats where lanes with NaN values are set to all 1s
    /// in `mask` and a vector register `dst` with a mix of non-NaN values and
    /// possibly non-canonical NaN values, this canonicalizes any NaNs in `dst`.
    fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
        // Canonical NaNs do not preserve the sign bit, have the exponent bits
        // all set, and have only the high bit of the mantissa set, so shift by
        // that number.
        // The mask we're producing in this step will be inverted in the next
        // step.
        let amount_to_shift = 1 + size.mantissa_bits() + 1;
        self.asm
            .xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
        // The mask will be inverted by the ANDN so non-NaN values will be all
        // 1s and NaN values will set the sign bit, exponent bits, and zero out
        // almost all of the mantissa.
        self.asm
            .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
    }
}
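
// The scalar checks below are illustrative sketches of the bit-level identities the vector
// lowerings above rely on: the Hacker's Delight shift formula used by the i64x2 `shr_s`
// fallback, the 0x1.0p52 mantissa trick used by the `I32x4LowU` conversion, the nibble
// popcount table used by `v128_popcnt`, and the Q15 rounding corner case patched by
// `v128_q15mulr_sat_s`. They assume only the standard library; the module and test names are
// illustrative and the tests are not exercised by the assembler itself.
#[cfg(test)]
mod lowering_identity_sketches {
    #[test]
    fn shr_s_formula_matches_arithmetic_shift() {
        // t = (1 << 63) >> n; ((x >> n) ^ t) - t, using only logical shifts.
        for &x in &[0i64, 1, -1, 42, -42, i64::MIN, i64::MAX] {
            for n in 0..64u32 {
                let t = (1u64 << 63) >> n;
                let logical = (x as u64) >> n;
                let got = (logical ^ t).wrapping_sub(t) as i64;
                assert_eq!(got, x >> n);
            }
        }
    }

    #[test]
    fn mantissa_trick_converts_u32_to_f64() {
        // Interleaving 0x43300000 with a u32 builds the double 0x1.0p52 + n;
        // subtracting 0x1.0p52 leaves the integer value exactly.
        let p52 = f64::from_bits(0x4330_0000u64 << 32);
        for &n in &[0u32, 1, 42, 0x8000_0000, u32::MAX] {
            let bits = (0x4330_0000u64 << 32) | u64::from(n);
            assert_eq!(f64::from_bits(bits) - p52, f64::from(n));
        }
    }

    #[test]
    fn nibble_table_counts_bits() {
        // Same table as the constant loaded in `v128_popcnt`.
        const LUT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
        for b in 0u8..=255 {
            let count = LUT[usize::from(b & 0x0F)] + LUT[usize::from(b >> 4)];
            assert_eq!(u32::from(count), b.count_ones());
        }
    }

    #[test]
    fn q15_rounding_multiply_wraps_at_minimum() {
        // `vpmulhrsw`-style rounding multiply: (a * b * 2 + 0x8000) >> 16.
        fn q15_mulr(a: i16, b: i16) -> i16 {
            ((i64::from(a) * i64::from(b) * 2 + 0x8000) >> 16) as i16
        }
        // -1.0 * -1.0 in Q15 wraps back to 0x8000 instead of saturating to
        // 0x7FFF, which is the lane `v128_q15mulr_sat_s` patches afterwards.
        assert_eq!(q15_mulr(i16::MIN, i16::MIN), i16::MIN);
        // Ordinary values round as expected: 0.5 * 0.5 == 0.25.
        assert_eq!(q15_mulr(0x4000, 0x4000), 0x2000);
    }
}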