Path: blob/main/winch/codegen/src/isa/x64/asm.rs
//! Assembler library implementation for x64.

use crate::{
    constant_pool::ConstantPool,
    isa::{CallingConvention, reg::Reg},
    masm::{
        DivKind, Extend, ExtendKind, ExtendType, IntCmpKind, MulWideKind, OperandSize, RemKind,
        RoundingMode, ShiftKind, Signed, V128ExtendKind, V128LoadExtendKind, Zero,
    },
    reg::writable,
};
use cranelift_codegen::{
    CallInfo, Final, MachBuffer, MachBufferFinalized, MachInst, MachInstEmit, MachInstEmitState,
    MachLabel, PatchRegion, Writable,
    ir::{ExternalName, MemFlags, SourceLoc, TrapCode, Type, UserExternalNameRef, types},
    isa::{
        unwind::UnwindInst,
        x64::{
            AtomicRmwSeqOp, EmitInfo, EmitState, Inst,
            args::{
                self, Amode, CC, ExtMode, FromWritableReg, Gpr, GprMem, GprMemImm, RegMem,
                RegMemImm, SyntheticAmode, WritableGpr, WritableXmm, Xmm, XmmMem, XmmMemImm,
            },
            external::{PairedGpr, PairedXmm},
            settings as x64_settings,
        },
    },
    settings,
};

use crate::reg::WritableReg;
use cranelift_assembler_x64 as asm;

use super::address::Address;
use smallvec::SmallVec;

// Conversions between winch-codegen x64 types and cranelift-codegen x64 types.

impl From<Reg> for RegMemImm {
    fn from(reg: Reg) -> Self {
        RegMemImm::reg(reg.into())
    }
}

impl From<Reg> for RegMem {
    fn from(value: Reg) -> Self {
        RegMem::Reg { reg: value.into() }
    }
}

impl From<Reg> for WritableGpr {
    fn from(reg: Reg) -> Self {
        let writable = Writable::from_reg(reg.into());
        WritableGpr::from_writable_reg(writable).expect("valid writable gpr")
    }
}

impl From<Reg> for WritableXmm {
    fn from(reg: Reg) -> Self {
        let writable = Writable::from_reg(reg.into());
        WritableXmm::from_writable_reg(writable).expect("valid writable xmm")
    }
}

/// Convert a writable GPR register to the read-write pair expected by
/// `cranelift-codegen`.
fn pair_gpr(reg: WritableReg) -> PairedGpr {
    assert!(reg.to_reg().is_int());
    let read = Gpr::unwrap_new(reg.to_reg().into());
    let write = WritableGpr::from_reg(reg.to_reg().into());
    PairedGpr { read, write }
}

impl From<Reg> for asm::Gpr<Gpr> {
    fn from(reg: Reg) -> Self {
        asm::Gpr::new(reg.into())
    }
}

impl From<Reg> for asm::GprMem<Gpr, Gpr> {
    fn from(reg: Reg) -> Self {
        asm::GprMem::Gpr(reg.into())
    }
}

/// Convert a writable XMM register to the read-write pair expected by
/// `cranelift-codegen`.
fn pair_xmm(reg: WritableReg) -> PairedXmm {
    assert!(reg.to_reg().is_float());
    let read = Xmm::unwrap_new(reg.to_reg().into());
    let write = WritableXmm::from_reg(reg.to_reg().into());
    PairedXmm { read, write }
}

impl From<Reg> for asm::Xmm<Xmm> {
    fn from(reg: Reg) -> Self {
        asm::Xmm::new(reg.into())
    }
}

impl From<Reg> for asm::XmmMem<Xmm, Gpr> {
    fn from(reg: Reg) -> Self {
        asm::XmmMem::Xmm(reg.into())
    }
}

impl From<Reg> for Gpr {
    fn from(reg: Reg) -> Self {
        Gpr::unwrap_new(reg.into())
    }
}

impl From<Reg> for GprMem {
    fn from(value: Reg) -> Self {
        GprMem::unwrap_new(value.into())
    }
}

impl From<Reg> for GprMemImm {
    fn from(reg: Reg) -> Self {
        GprMemImm::unwrap_new(reg.into())
    }
}

impl From<Reg> for Xmm {
    fn from(reg: Reg) -> Self {
        Xmm::unwrap_new(reg.into())
    }
}

impl From<Reg> for XmmMem {
    fn from(value: Reg) -> Self {
        XmmMem::unwrap_new(value.into())
    }
}

impl From<Reg> for XmmMemImm {
    fn from(value: Reg) -> Self {
        XmmMemImm::unwrap_new(value.into())
    }
}
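
// NOTE: The example below is an editorial sketch and not part of the original
// source. It illustrates what `pair_gpr`/`pair_xmm` above model: x64
// instructions whose destination operand is both read and written, so the
// `read` and `write` halves of the pair name the same underlying register.
// `some_gpr` is a hypothetical winch `Reg` holding an integer register:
//
//     let dst = writable!(some_gpr);
//     let pair = pair_gpr(dst);
//     // `pair.read` and `pair.write` both refer to `some_gpr`; an emitter
//     // such as `asm::inst::addq_rm::new(pair, src)` reads the old value
//     // and writes the sum back into the same register.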

impl From<OperandSize> for args::OperandSize {
    fn from(size: OperandSize) -> Self {
        match size {
            OperandSize::S8 => Self::Size8,
            OperandSize::S16 => Self::Size16,
            OperandSize::S32 => Self::Size32,
            OperandSize::S64 => Self::Size64,
            s => panic!("Invalid operand size {s:?}"),
        }
    }
}

impl From<IntCmpKind> for CC {
    fn from(value: IntCmpKind) -> Self {
        match value {
            IntCmpKind::Eq => CC::Z,
            IntCmpKind::Ne => CC::NZ,
            IntCmpKind::LtS => CC::L,
            IntCmpKind::LtU => CC::B,
            IntCmpKind::GtS => CC::NLE,
            IntCmpKind::GtU => CC::NBE,
            IntCmpKind::LeS => CC::LE,
            IntCmpKind::LeU => CC::BE,
            IntCmpKind::GeS => CC::NL,
            IntCmpKind::GeU => CC::NB,
        }
    }
}

impl<T: ExtendType> From<Extend<T>> for ExtMode {
    fn from(value: Extend<T>) -> Self {
        match value {
            Extend::I32Extend8 => ExtMode::BL,
            Extend::I32Extend16 => ExtMode::WL,
            Extend::I64Extend8 => ExtMode::BQ,
            Extend::I64Extend16 => ExtMode::WQ,
            Extend::I64Extend32 => ExtMode::LQ,
            Extend::__Kind(_) => unreachable!(),
        }
    }
}

impl From<ExtendKind> for ExtMode {
    fn from(value: ExtendKind) -> Self {
        match value {
            ExtendKind::Signed(s) => s.into(),
            ExtendKind::Unsigned(u) => u.into(),
        }
    }
}

/// Kinds of extends supported by `vpmov`.
pub(super) enum VpmovKind {
    /// Sign extends 8 lanes of 8-bit integers to 8 lanes of 16-bit integers.
    E8x8S,
    /// Zero extends 8 lanes of 8-bit integers to 8 lanes of 16-bit integers.
    E8x8U,
    /// Sign extends 4 lanes of 16-bit integers to 4 lanes of 32-bit integers.
    E16x4S,
    /// Zero extends 4 lanes of 16-bit integers to 4 lanes of 32-bit integers.
    E16x4U,
    /// Sign extends 2 lanes of 32-bit integers to 2 lanes of 64-bit integers.
    E32x2S,
    /// Zero extends 2 lanes of 32-bit integers to 2 lanes of 64-bit integers.
    E32x2U,
}

impl From<V128LoadExtendKind> for VpmovKind {
    fn from(value: V128LoadExtendKind) -> Self {
        match value {
            V128LoadExtendKind::E8x8S => Self::E8x8S,
            V128LoadExtendKind::E8x8U => Self::E8x8U,
            V128LoadExtendKind::E16x4S => Self::E16x4S,
            V128LoadExtendKind::E16x4U => Self::E16x4U,
            V128LoadExtendKind::E32x2S => Self::E32x2S,
            V128LoadExtendKind::E32x2U => Self::E32x2U,
        }
    }
}

impl From<V128ExtendKind> for VpmovKind {
    fn from(value: V128ExtendKind) -> Self {
        match value {
            V128ExtendKind::LowI8x16S | V128ExtendKind::HighI8x16S => Self::E8x8S,
            V128ExtendKind::LowI8x16U => Self::E8x8U,
            V128ExtendKind::LowI16x8S | V128ExtendKind::HighI16x8S => Self::E16x4S,
            V128ExtendKind::LowI16x8U => Self::E16x4U,
            V128ExtendKind::LowI32x4S | V128ExtendKind::HighI32x4S => Self::E32x2S,
            V128ExtendKind::LowI32x4U => Self::E32x2U,
            _ => unimplemented!(),
        }
    }
}
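
// NOTE: The example below is an editorial sketch and not part of the original
// source. The `IntCmpKind` -> `CC` mapping above pairs signed comparisons with
// the signed condition codes (L/LE/NL/NLE) and unsigned comparisons with the
// carry-based ones (B/BE/NB/NBE). For instance, a Wasm `i32.lt_u` over two
// values already in registers could be lowered with the `cmp_rr` and `setcc`
// helpers defined later in this file (`asm`, `lhs`, `rhs`, and `result` are
// hypothetical):
//
//     asm.cmp_rr(lhs, rhs, OperandSize::S32);        // compare lhs against rhs
//     asm.setcc(IntCmpKind::LtU, writable!(result)); // LtU -> CC::B -> setb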

/// Kinds of comparisons supported by `vcmp`.
pub(super) enum VcmpKind {
    /// Equal comparison.
    Eq,
    /// Not equal comparison.
    Ne,
    /// Less than comparison.
    Lt,
    /// Less than or equal comparison.
    Le,
    /// Unordered comparison. Sets result to all 1s if either source operand is
    /// NaN.
    Unord,
}

/// Kinds of conversions supported by `vcvt`.
pub(super) enum VcvtKind {
    /// Converts 32-bit integers to 32-bit floats.
    I32ToF32,
    /// Converts doubleword integers to double precision floats.
    I32ToF64,
    /// Converts double precision floats to single precision floats.
    F64ToF32,
    // Converts double precision floats to 32-bit integers.
    F64ToI32,
    /// Converts single precision floats to double precision floats.
    F32ToF64,
    /// Converts single precision floats to 32-bit integers.
    F32ToI32,
}

/// Modes supported by `vround`.
pub(crate) enum VroundMode {
    /// Rounds toward nearest (ties to even).
    TowardNearest,
    /// Rounds toward negative infinity.
    TowardNegativeInfinity,
    /// Rounds toward positive infinity.
    TowardPositiveInfinity,
    /// Rounds toward zero.
    TowardZero,
}

/// Low level assembler implementation for x64.
pub(crate) struct Assembler {
    /// The machine instruction buffer.
    buffer: MachBuffer<Inst>,
    /// Constant emission information.
    emit_info: EmitInfo,
    /// Emission state.
    emit_state: EmitState,
    /// x64 flags.
    isa_flags: x64_settings::Flags,
    /// Constant pool.
    pool: ConstantPool,
}

impl Assembler {
    /// Create a new x64 assembler.
    pub fn new(shared_flags: settings::Flags, isa_flags: x64_settings::Flags) -> Self {
        Self {
            buffer: MachBuffer::<Inst>::new(),
            emit_state: Default::default(),
            emit_info: EmitInfo::new(shared_flags, isa_flags.clone()),
            pool: ConstantPool::new(),
            isa_flags,
        }
    }

    /// Get a mutable reference to underlying
    /// machine buffer.
    pub fn buffer_mut(&mut self) -> &mut MachBuffer<Inst> {
        &mut self.buffer
    }

    /// Get a reference to the underlying machine buffer.
    pub fn buffer(&self) -> &MachBuffer<Inst> {
        &self.buffer
    }

    /// Adds a constant to the constant pool and returns its address.
    pub fn add_constant(&mut self, constant: &[u8]) -> Address {
        let handle = self.pool.register(constant, &mut self.buffer);
        Address::constant(handle)
    }

    /// Load a floating point constant, using the constant pool.
    pub fn load_fp_const(&mut self, dst: WritableReg, constant: &[u8], size: OperandSize) {
        let addr = self.add_constant(constant);
        self.xmm_mov_mr(&addr, dst, size, MemFlags::trusted());
    }

    /// Return the emitted code.
    pub fn finalize(mut self, loc: Option<SourceLoc>) -> MachBufferFinalized<Final> {
        let stencil = self
            .buffer
            .finish(&self.pool.constants(), self.emit_state.ctrl_plane_mut());
        stencil.apply_base_srcloc(loc.unwrap_or_default())
    }

    fn emit(&mut self, inst: Inst) {
        inst.emit(&mut self.buffer, &self.emit_info, &mut self.emit_state);
    }

    fn to_synthetic_amode(addr: &Address, memflags: MemFlags) -> SyntheticAmode {
        match *addr {
            Address::Offset { base, offset } => {
                let amode = Amode::imm_reg(offset as i32, base.into()).with_flags(memflags);
                SyntheticAmode::real(amode)
            }
            Address::Const(c) => SyntheticAmode::ConstantOffset(c),
            Address::ImmRegRegShift {
                simm32,
                base,
                index,
                shift,
            } => SyntheticAmode::Real(Amode::ImmRegRegShift {
                simm32,
                base: base.into(),
                index: index.into(),
                shift,
                flags: memflags,
            }),
        }
    }
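
    // NOTE: The sketch below is editorial and not part of the original source.
    // A typical lifecycle of this assembler, assuming flags built from the
    // default `cranelift_codegen` builders and hypothetical `Reg` values
    // `rax`/`rbp`, looks roughly like:
    //
    //     let shared = settings::Flags::new(settings::builder());
    //     let isa = x64_settings::Flags::new(&shared, x64_settings::builder());
    //     let mut asm = Assembler::new(shared, isa);
    //     asm.push_r(rbp);
    //     asm.mov_rr(rax, writable!(rbp), OperandSize::S64);
    //     asm.ret();
    //     let code = asm.finalize(None); // MachBufferFinalized<Final>
    //
    // Memory operands go through `to_synthetic_amode` above, e.g. an
    // `Address::Offset { base, offset }` becomes an `Amode::imm_reg(offset, base)`
    // carrying the provided `MemFlags`.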

    /// Emit an unwind instruction.
    pub fn unwind_inst(&mut self, inst: UnwindInst) {
        self.emit(Inst::Unwind { inst })
    }

    /// Push register.
    pub fn push_r(&mut self, reg: Reg) {
        let inst = asm::inst::pushq_o::new(reg).into();
        self.emit(Inst::External { inst });
    }

    /// Pop to register.
    pub fn pop_r(&mut self, dst: WritableReg) {
        let writable: WritableGpr = dst.map(Into::into);
        let inst = asm::inst::popq_o::new(writable).into();
        self.emit(Inst::External { inst });
    }

    /// Return instruction.
    pub fn ret(&mut self) {
        let inst = asm::inst::retq_zo::new().into();
        self.emit(Inst::External { inst });
    }

    /// Register-to-register move.
    pub fn mov_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableGpr = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::movb_mr::new(dst, src).into(),
            OperandSize::S16 => asm::inst::movw_mr::new(dst, src).into(),
            OperandSize::S32 => asm::inst::movl_mr::new(dst, src).into(),
            OperandSize::S64 => asm::inst::movq_mr::new(dst, src).into(),
            _ => unreachable!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Register-to-memory move.
    pub fn mov_rm(&mut self, src: Reg, addr: &Address, size: OperandSize, flags: MemFlags) {
        assert!(addr.is_offset());
        let dst = Self::to_synthetic_amode(addr, flags);
        let inst = match size {
            OperandSize::S8 => asm::inst::movb_mr::new(dst, src).into(),
            OperandSize::S16 => asm::inst::movw_mr::new(dst, src).into(),
            OperandSize::S32 => asm::inst::movl_mr::new(dst, src).into(),
            OperandSize::S64 => asm::inst::movq_mr::new(dst, src).into(),
            _ => unreachable!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Immediate-to-memory move.
    pub fn mov_im(&mut self, src: i32, addr: &Address, size: OperandSize, flags: MemFlags) {
        assert!(addr.is_offset());
        let dst = Self::to_synthetic_amode(addr, flags);
        let inst = match size {
            OperandSize::S8 => {
                let src = i8::try_from(src).unwrap();
                asm::inst::movb_mi::new(dst, src.cast_unsigned()).into()
            }
            OperandSize::S16 => {
                let src = i16::try_from(src).unwrap();
                asm::inst::movw_mi::new(dst, src.cast_unsigned()).into()
            }
            OperandSize::S32 => asm::inst::movl_mi::new(dst, src.cast_unsigned()).into(),
            OperandSize::S64 => asm::inst::movq_mi_sxl::new(dst, src).into(),
            _ => unreachable!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Immediate-to-register move.
    pub fn mov_ir(&mut self, imm: u64, dst: WritableReg, size: OperandSize) {
        self.emit(Inst::imm(size.into(), imm, dst.map(Into::into)));
    }

    /// Zero-extend memory-to-register load.
    pub fn movzx_mr(
        &mut self,
        addr: &Address,
        dst: WritableReg,
        ext: Option<Extend<Zero>>,
        memflags: MemFlags,
    ) {
        let src = Self::to_synthetic_amode(addr, memflags);

        if let Some(ext) = ext {
            let dst = WritableGpr::from_reg(dst.to_reg().into());
            let inst = match ext.into() {
                ExtMode::BL => asm::inst::movzbl_rm::new(dst, src).into(),
                ExtMode::BQ => asm::inst::movzbq_rm::new(dst, src).into(),
                ExtMode::WL => asm::inst::movzwl_rm::new(dst, src).into(),
                ExtMode::WQ => asm::inst::movzwq_rm::new(dst, src).into(),
                ExtMode::LQ => {
                    // This instruction selection may seem strange but is
                    // correct in 64-bit mode: section 3.4.1.1 of the Intel
                    // manual says that "32-bit operands generate a 32-bit
                    // result, zero-extended to a 64-bit result in the
                    // destination general-purpose register." This is applicable
                    // beyond `mov` but we use this fact to zero-extend `src`
                    // into `dst`.
                    asm::inst::movl_rm::new(dst, src).into()
                }
            };
            self.emit(Inst::External { inst });
        } else {
            let dst = WritableGpr::from_reg(dst.to_reg().into());
            let inst = asm::inst::movq_rm::new(dst, src).into();
            self.emit(Inst::External { inst });
        }
    }
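
    // NOTE: Editorial clarification, not part of the original source. The
    // `ExtMode::LQ` arm above leans on the architectural rule quoted from the
    // Intel manual: a plain 32-bit `movl` already clears bits 63:32 of the
    // destination register, so a 32-to-64-bit zero-extending load needs no
    // dedicated `movzx` form; emitting `asm::inst::movl_rm` is sufficient.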

    // Sign-extend memory-to-register load.
    pub fn movsx_mr(
        &mut self,
        addr: &Address,
        dst: WritableReg,
        ext: Extend<Signed>,
        memflags: MemFlags,
    ) {
        let src = Self::to_synthetic_amode(addr, memflags);
        let dst = WritableGpr::from_reg(dst.to_reg().into());
        let inst = match ext.into() {
            ExtMode::BL => asm::inst::movsbl_rm::new(dst, src).into(),
            ExtMode::BQ => asm::inst::movsbq_rm::new(dst, src).into(),
            ExtMode::WL => asm::inst::movswl_rm::new(dst, src).into(),
            ExtMode::WQ => asm::inst::movswq_rm::new(dst, src).into(),
            ExtMode::LQ => asm::inst::movslq_rm::new(dst, src).into(),
        };
        self.emit(Inst::External { inst });
    }

    /// Register-to-register move with zero extension.
    pub fn movzx_rr(&mut self, src: Reg, dst: WritableReg, kind: Extend<Zero>) {
        let dst = WritableGpr::from_reg(dst.to_reg().into());
        let inst = match kind.into() {
            ExtMode::BL => asm::inst::movzbl_rm::new(dst, src).into(),
            ExtMode::BQ => asm::inst::movzbq_rm::new(dst, src).into(),
            ExtMode::WL => asm::inst::movzwl_rm::new(dst, src).into(),
            ExtMode::WQ => asm::inst::movzwq_rm::new(dst, src).into(),
            ExtMode::LQ => {
                // This instruction selection may seem strange but is correct in
                // 64-bit mode: section 3.4.1.1 of the Intel manual says that
                // "32-bit operands generate a 32-bit result, zero-extended to a
                // 64-bit result in the destination general-purpose register."
                // This is applicable beyond `mov` but we use this fact to
                // zero-extend `src` into `dst`.
                asm::inst::movl_rm::new(dst, src).into()
            }
        };
        self.emit(Inst::External { inst });
    }

    /// Register-to-register move with sign extension.
    pub fn movsx_rr(&mut self, src: Reg, dst: WritableReg, kind: Extend<Signed>) {
        let dst = WritableGpr::from_reg(dst.to_reg().into());
        let inst = match kind.into() {
            ExtMode::BL => asm::inst::movsbl_rm::new(dst, src).into(),
            ExtMode::BQ => asm::inst::movsbq_rm::new(dst, src).into(),
            ExtMode::WL => asm::inst::movswl_rm::new(dst, src).into(),
            ExtMode::WQ => asm::inst::movswq_rm::new(dst, src).into(),
            ExtMode::LQ => asm::inst::movslq_rm::new(dst, src).into(),
        };
        self.emit(Inst::External { inst });
    }

    /// Integer register conditional move.
    pub fn cmov(&mut self, src: Reg, dst: WritableReg, cc: IntCmpKind, size: OperandSize) {
        use IntCmpKind::*;
        use OperandSize::*;

        let dst: WritableGpr = dst.map(Into::into);
        let inst = match size {
            S8 | S16 | S32 => match cc {
                Eq => asm::inst::cmovel_rm::new(dst, src).into(),
                Ne => asm::inst::cmovnel_rm::new(dst, src).into(),
                LtS => asm::inst::cmovll_rm::new(dst, src).into(),
                LtU => asm::inst::cmovbl_rm::new(dst, src).into(),
                GtS => asm::inst::cmovgl_rm::new(dst, src).into(),
                GtU => asm::inst::cmoval_rm::new(dst, src).into(),
                LeS => asm::inst::cmovlel_rm::new(dst, src).into(),
                LeU => asm::inst::cmovbel_rm::new(dst, src).into(),
                GeS => asm::inst::cmovgel_rm::new(dst, src).into(),
                GeU => asm::inst::cmovael_rm::new(dst, src).into(),
            },
            S64 => match cc {
                Eq => asm::inst::cmoveq_rm::new(dst, src).into(),
                Ne => asm::inst::cmovneq_rm::new(dst, src).into(),
                LtS => asm::inst::cmovlq_rm::new(dst, src).into(),
                LtU =>
asm::inst::cmovbq_rm::new(dst, src).into(),552GtS => asm::inst::cmovgq_rm::new(dst, src).into(),553GtU => asm::inst::cmovaq_rm::new(dst, src).into(),554LeS => asm::inst::cmovleq_rm::new(dst, src).into(),555LeU => asm::inst::cmovbeq_rm::new(dst, src).into(),556GeS => asm::inst::cmovgeq_rm::new(dst, src).into(),557GeU => asm::inst::cmovaeq_rm::new(dst, src).into(),558},559_ => unreachable!(),560};561self.emit(Inst::External { inst });562}563564/// Single and double precision floating point565/// register-to-register move.566pub fn xmm_mov_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {567let ty = match size {568OperandSize::S32 => types::F32,569OperandSize::S64 => types::F64,570OperandSize::S128 => types::I32X4,571OperandSize::S8 | OperandSize::S16 => unreachable!(),572};573self.emit(Inst::gen_move(dst.map(|r| r.into()), src.into(), ty));574}575576/// Single and double precision floating point load.577pub fn xmm_mov_mr(578&mut self,579src: &Address,580dst: WritableReg,581size: OperandSize,582flags: MemFlags,583) {584use OperandSize::*;585586assert!(dst.to_reg().is_float());587588let src = Self::to_synthetic_amode(src, flags);589let dst: WritableXmm = dst.map(|r| r.into());590let inst = match size {591S32 => asm::inst::movss_a_m::new(dst, src).into(),592S64 => asm::inst::movsd_a_m::new(dst, src).into(),593S128 => asm::inst::movdqu_a::new(dst, src).into(),594S8 | S16 => unreachable!(),595};596self.emit(Inst::External { inst });597}598599/// Vector load and extend.600pub fn xmm_vpmov_mr(601&mut self,602src: &Address,603dst: WritableReg,604kind: VpmovKind,605flags: MemFlags,606) {607assert!(dst.to_reg().is_float());608let src = Self::to_synthetic_amode(src, flags);609let dst: WritableXmm = dst.map(|r| r.into());610let inst = match kind {611VpmovKind::E8x8S => asm::inst::vpmovsxbw_a::new(dst, src).into(),612VpmovKind::E8x8U => asm::inst::vpmovzxbw_a::new(dst, src).into(),613VpmovKind::E16x4S => asm::inst::vpmovsxwd_a::new(dst, src).into(),614VpmovKind::E16x4U => asm::inst::vpmovzxwd_a::new(dst, src).into(),615VpmovKind::E32x2S => asm::inst::vpmovsxdq_a::new(dst, src).into(),616VpmovKind::E32x2U => asm::inst::vpmovzxdq_a::new(dst, src).into(),617};618self.emit(Inst::External { inst });619}620621/// Extends vector of integers in `src` and puts results in `dst`.622pub fn xmm_vpmov_rr(&mut self, src: Reg, dst: WritableReg, kind: VpmovKind) {623let dst: WritableXmm = dst.map(|r| r.into());624let inst = match kind {625VpmovKind::E8x8S => asm::inst::vpmovsxbw_a::new(dst, src).into(),626VpmovKind::E8x8U => asm::inst::vpmovzxbw_a::new(dst, src).into(),627VpmovKind::E16x4S => asm::inst::vpmovsxwd_a::new(dst, src).into(),628VpmovKind::E16x4U => asm::inst::vpmovzxwd_a::new(dst, src).into(),629VpmovKind::E32x2S => asm::inst::vpmovsxdq_a::new(dst, src).into(),630VpmovKind::E32x2U => asm::inst::vpmovzxdq_a::new(dst, src).into(),631};632self.emit(Inst::External { inst });633}634635/// Vector load and broadcast.636pub fn xmm_vpbroadcast_mr(637&mut self,638src: &Address,639dst: WritableReg,640size: OperandSize,641flags: MemFlags,642) {643assert!(dst.to_reg().is_float());644let src = Self::to_synthetic_amode(src, flags);645let dst: WritableXmm = dst.map(|r| r.into());646let inst = match size {647OperandSize::S8 => asm::inst::vpbroadcastb_a::new(dst, src).into(),648OperandSize::S16 => asm::inst::vpbroadcastw_a::new(dst, src).into(),649OperandSize::S32 => asm::inst::vpbroadcastd_a::new(dst, src).into(),650_ => unimplemented!(),651};652self.emit(Inst::External { inst });653}654655/// Value in `src` is 
broadcast into lanes of `size` in `dst`.656pub fn xmm_vpbroadcast_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {657assert!(src.is_float() && dst.to_reg().is_float());658let dst: WritableXmm = dst.map(|r| r.into());659let inst = match size {660OperandSize::S8 => asm::inst::vpbroadcastb_a::new(dst, src).into(),661OperandSize::S16 => asm::inst::vpbroadcastw_a::new(dst, src).into(),662OperandSize::S32 => asm::inst::vpbroadcastd_a::new(dst, src).into(),663_ => unimplemented!(),664};665self.emit(Inst::External { inst });666}667668/// Memory to register shuffle of bytes in vector.669pub fn xmm_vpshuf_mr(670&mut self,671src: &Address,672dst: WritableReg,673mask: u8,674size: OperandSize,675flags: MemFlags,676) {677let dst: WritableXmm = dst.map(|r| r.into());678let src = Self::to_synthetic_amode(src, flags);679let inst = match size {680OperandSize::S32 => asm::inst::vpshufd_a::new(dst, src, mask).into(),681_ => unimplemented!(),682};683self.emit(Inst::External { inst });684}685686/// Register to register shuffle of bytes in vector.687pub fn xmm_vpshuf_rr(&mut self, src: Reg, dst: WritableReg, mask: u8, size: OperandSize) {688let dst: WritableXmm = dst.map(|r| r.into());689690let inst = match size {691OperandSize::S16 => asm::inst::vpshuflw_a::new(dst, src, mask).into(),692OperandSize::S32 => asm::inst::vpshufd_a::new(dst, src, mask).into(),693_ => unimplemented!(),694};695696self.emit(Inst::External { inst });697}698699/// Single and double precision floating point store.700pub fn xmm_mov_rm(&mut self, src: Reg, dst: &Address, size: OperandSize, flags: MemFlags) {701use OperandSize::*;702703assert!(src.is_float());704705let dst = Self::to_synthetic_amode(dst, flags);706let src: Xmm = src.into();707let inst = match size {708S32 => asm::inst::movss_c_m::new(dst, src).into(),709S64 => asm::inst::movsd_c_m::new(dst, src).into(),710S128 => asm::inst::movdqu_b::new(dst, src).into(),711S16 | S8 => unreachable!(),712};713self.emit(Inst::External { inst })714}715716/// Floating point register conditional move.717pub fn xmm_cmov(&mut self, src: Reg, dst: WritableReg, cc: IntCmpKind, size: OperandSize) {718let dst: WritableXmm = dst.map(Into::into);719let ty = match size {720OperandSize::S32 => types::F32,721OperandSize::S64 => types::F64,722// Move the entire 128 bits via movdqa.723OperandSize::S128 => types::I32X4,724OperandSize::S8 | OperandSize::S16 => unreachable!(),725};726727self.emit(Inst::XmmCmove {728ty,729cc: cc.into(),730consequent: Xmm::unwrap_new(src.into()),731alternative: dst.to_reg(),732dst,733})734}735736/// Subtract register and register737pub fn sub_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {738let dst = pair_gpr(dst);739let inst = match size {740OperandSize::S8 => asm::inst::subb_rm::new(dst, src).into(),741OperandSize::S16 => asm::inst::subw_rm::new(dst, src).into(),742OperandSize::S32 => asm::inst::subl_rm::new(dst, src).into(),743OperandSize::S64 => asm::inst::subq_rm::new(dst, src).into(),744OperandSize::S128 => unimplemented!(),745};746self.emit(Inst::External { inst });747}748749/// Subtract immediate register.750pub fn sub_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {751let dst = pair_gpr(dst);752let inst = match size {753OperandSize::S8 => asm::inst::subb_mi::new(dst, u8::try_from(imm).unwrap()).into(),754OperandSize::S16 => asm::inst::subw_mi::new(dst, u16::try_from(imm).unwrap()).into(),755OperandSize::S32 => asm::inst::subl_mi::new(dst, imm as u32).into(),756OperandSize::S64 => asm::inst::subq_mi_sxl::new(dst, 
imm).into(),757OperandSize::S128 => unimplemented!(),758};759self.emit(Inst::External { inst });760}761762/// "and" two registers.763pub fn and_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {764let dst = pair_gpr(dst);765let inst = match size {766OperandSize::S8 => asm::inst::andb_rm::new(dst, src).into(),767OperandSize::S16 => asm::inst::andw_rm::new(dst, src).into(),768OperandSize::S32 => asm::inst::andl_rm::new(dst, src).into(),769OperandSize::S64 => asm::inst::andq_rm::new(dst, src).into(),770OperandSize::S128 => unimplemented!(),771};772self.emit(Inst::External { inst });773}774775pub fn and_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {776let dst = pair_gpr(dst);777let inst = match size {778OperandSize::S8 => asm::inst::andb_mi::new(dst, u8::try_from(imm).unwrap()).into(),779OperandSize::S16 => asm::inst::andw_mi::new(dst, u16::try_from(imm).unwrap()).into(),780OperandSize::S32 => asm::inst::andl_mi::new(dst, imm as u32).into(),781OperandSize::S64 => asm::inst::andq_mi_sxl::new(dst, imm).into(),782OperandSize::S128 => unimplemented!(),783};784self.emit(Inst::External { inst });785}786787/// "and" two float registers.788pub fn xmm_and_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {789let dst = pair_xmm(dst);790let inst = match size {791OperandSize::S32 => asm::inst::andps_a::new(dst, src).into(),792OperandSize::S64 => asm::inst::andpd_a::new(dst, src).into(),793OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),794};795self.emit(Inst::External { inst });796}797798/// "and not" two float registers.799pub fn xmm_andn_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {800let dst = pair_xmm(dst);801let inst = match size {802OperandSize::S32 => asm::inst::andnps_a::new(dst, src).into(),803OperandSize::S64 => asm::inst::andnpd_a::new(dst, src).into(),804OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),805};806self.emit(Inst::External { inst });807}808809pub fn gpr_to_xmm(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {810let dst: WritableXmm = dst.map(|r| r.into());811let inst = match size {812OperandSize::S32 => asm::inst::movd_a::new(dst, src).into(),813OperandSize::S64 => asm::inst::movq_a::new(dst, src).into(),814OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),815};816817self.emit(Inst::External { inst });818}819820pub fn xmm_to_gpr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {821let dst: WritableGpr = dst.map(Into::into);822let src: Xmm = src.into();823let inst = match size {824OperandSize::S32 => asm::inst::movd_b::new(dst, src).into(),825OperandSize::S64 => asm::inst::movq_b::new(dst, src).into(),826OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),827};828829self.emit(Inst::External { inst })830}831832/// Convert float to signed int.833pub fn cvt_float_to_sint_seq(834&mut self,835src: Reg,836dst: WritableReg,837tmp_gpr: Reg,838tmp_xmm: Reg,839src_size: OperandSize,840dst_size: OperandSize,841saturating: bool,842) {843self.emit(Inst::CvtFloatToSintSeq {844dst_size: dst_size.into(),845src_size: src_size.into(),846is_saturating: saturating,847src: src.into(),848dst: dst.map(Into::into),849tmp_gpr: tmp_gpr.into(),850tmp_xmm: tmp_xmm.into(),851});852}853854/// Convert float to unsigned int.855pub fn cvt_float_to_uint_seq(856&mut self,857src: Reg,858dst: WritableReg,859tmp_gpr: Reg,860tmp_xmm: Reg,861tmp_xmm2: Reg,862src_size: OperandSize,863dst_size: OperandSize,864saturating: bool,865) 
{866self.emit(Inst::CvtFloatToUintSeq {867dst_size: dst_size.into(),868src_size: src_size.into(),869is_saturating: saturating,870src: src.into(),871dst: dst.map(Into::into),872tmp_gpr: tmp_gpr.into(),873tmp_xmm: tmp_xmm.into(),874tmp_xmm2: tmp_xmm2.into(),875});876}877878/// Convert signed int to float.879pub fn cvt_sint_to_float(880&mut self,881src: Reg,882dst: WritableReg,883src_size: OperandSize,884dst_size: OperandSize,885) {886use OperandSize::*;887let dst = pair_xmm(dst);888let inst = match (src_size, dst_size) {889(S32, S32) => asm::inst::cvtsi2ssl_a::new(dst, src).into(),890(S32, S64) => asm::inst::cvtsi2sdl_a::new(dst, src).into(),891(S64, S32) => asm::inst::cvtsi2ssq_a::new(dst, src).into(),892(S64, S64) => asm::inst::cvtsi2sdq_a::new(dst, src).into(),893_ => unreachable!(),894};895self.emit(Inst::External { inst });896}897898/// Convert unsigned 64-bit int to float.899pub fn cvt_uint64_to_float_seq(900&mut self,901src: Reg,902dst: WritableReg,903tmp_gpr1: Reg,904tmp_gpr2: Reg,905dst_size: OperandSize,906) {907self.emit(Inst::CvtUint64ToFloatSeq {908dst_size: dst_size.into(),909src: src.into(),910dst: dst.map(Into::into),911tmp_gpr1: tmp_gpr1.into(),912tmp_gpr2: tmp_gpr2.into(),913});914}915916/// Change precision of float.917pub fn cvt_float_to_float(918&mut self,919src: Reg,920dst: WritableReg,921src_size: OperandSize,922dst_size: OperandSize,923) {924use OperandSize::*;925let dst = pair_xmm(dst);926let inst = match (src_size, dst_size) {927(S32, S64) => asm::inst::cvtss2sd_a::new(dst, src).into(),928(S64, S32) => asm::inst::cvtsd2ss_a::new(dst, src).into(),929_ => unimplemented!(),930};931self.emit(Inst::External { inst });932}933934pub fn or_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {935let dst = pair_gpr(dst);936let inst = match size {937OperandSize::S8 => asm::inst::orb_rm::new(dst, src).into(),938OperandSize::S16 => asm::inst::orw_rm::new(dst, src).into(),939OperandSize::S32 => asm::inst::orl_rm::new(dst, src).into(),940OperandSize::S64 => asm::inst::orq_rm::new(dst, src).into(),941OperandSize::S128 => unimplemented!(),942};943self.emit(Inst::External { inst });944}945946pub fn or_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {947let dst = pair_gpr(dst);948let inst = match size {949OperandSize::S8 => asm::inst::orb_mi::new(dst, u8::try_from(imm).unwrap()).into(),950OperandSize::S16 => asm::inst::orw_mi::new(dst, u16::try_from(imm).unwrap()).into(),951OperandSize::S32 => asm::inst::orl_mi::new(dst, imm as u32).into(),952OperandSize::S64 => asm::inst::orq_mi_sxl::new(dst, imm).into(),953OperandSize::S128 => unimplemented!(),954};955self.emit(Inst::External { inst });956}957958pub fn xmm_or_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {959let dst = pair_xmm(dst);960let inst = match size {961OperandSize::S32 => asm::inst::orps_a::new(dst, src).into(),962OperandSize::S64 => asm::inst::orpd_a::new(dst, src).into(),963OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),964};965self.emit(Inst::External { inst });966}967968/// Logical exclusive or with registers.969pub fn xor_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {970let dst = pair_gpr(dst);971let inst = match size {972OperandSize::S8 => asm::inst::xorb_rm::new(dst, src).into(),973OperandSize::S16 => asm::inst::xorw_rm::new(dst, src).into(),974OperandSize::S32 => asm::inst::xorl_rm::new(dst, src).into(),975OperandSize::S64 => asm::inst::xorq_rm::new(dst, src).into(),976OperandSize::S128 => unimplemented!(),977};978self.emit(Inst::External 
{ inst });979}980981pub fn xor_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {982let dst = pair_gpr(dst);983let inst = match size {984OperandSize::S8 => asm::inst::xorb_mi::new(dst, u8::try_from(imm).unwrap()).into(),985OperandSize::S16 => asm::inst::xorw_mi::new(dst, u16::try_from(imm).unwrap()).into(),986OperandSize::S32 => asm::inst::xorl_mi::new(dst, imm as u32).into(),987OperandSize::S64 => asm::inst::xorq_mi_sxl::new(dst, imm).into(),988OperandSize::S128 => unimplemented!(),989};990self.emit(Inst::External { inst });991}992993/// Logical exclusive or with float registers.994pub fn xmm_xor_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {995let dst = pair_xmm(dst);996let inst = match size {997OperandSize::S32 => asm::inst::xorps_a::new(dst, src).into(),998OperandSize::S64 => asm::inst::xorpd_a::new(dst, src).into(),999OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1000};1001self.emit(Inst::External { inst });1002}10031004/// Shift with register and register.1005pub fn shift_rr(&mut self, src: Reg, dst: WritableReg, kind: ShiftKind, size: OperandSize) {1006let dst = pair_gpr(dst);1007let src: Gpr = src.into();1008let inst = match (kind, size) {1009(ShiftKind::Shl, OperandSize::S32) => asm::inst::shll_mc::new(dst, src).into(),1010(ShiftKind::Shl, OperandSize::S64) => asm::inst::shlq_mc::new(dst, src).into(),1011(ShiftKind::Shl, _) => todo!(),1012(ShiftKind::ShrS, OperandSize::S32) => asm::inst::sarl_mc::new(dst, src).into(),1013(ShiftKind::ShrS, OperandSize::S64) => asm::inst::sarq_mc::new(dst, src).into(),1014(ShiftKind::ShrS, _) => todo!(),1015(ShiftKind::ShrU, OperandSize::S32) => asm::inst::shrl_mc::new(dst, src).into(),1016(ShiftKind::ShrU, OperandSize::S64) => asm::inst::shrq_mc::new(dst, src).into(),1017(ShiftKind::ShrU, _) => todo!(),1018(ShiftKind::Rotl, OperandSize::S32) => asm::inst::roll_mc::new(dst, src).into(),1019(ShiftKind::Rotl, OperandSize::S64) => asm::inst::rolq_mc::new(dst, src).into(),1020(ShiftKind::Rotl, _) => todo!(),1021(ShiftKind::Rotr, OperandSize::S32) => asm::inst::rorl_mc::new(dst, src).into(),1022(ShiftKind::Rotr, OperandSize::S64) => asm::inst::rorq_mc::new(dst, src).into(),1023(ShiftKind::Rotr, _) => todo!(),1024};1025self.emit(Inst::External { inst });1026}10271028/// Shift with immediate and register.1029pub fn shift_ir(&mut self, imm: u8, dst: WritableReg, kind: ShiftKind, size: OperandSize) {1030let dst = pair_gpr(dst);1031let inst = match (kind, size) {1032(ShiftKind::Shl, OperandSize::S32) => asm::inst::shll_mi::new(dst, imm).into(),1033(ShiftKind::Shl, OperandSize::S64) => asm::inst::shlq_mi::new(dst, imm).into(),1034(ShiftKind::Shl, _) => todo!(),1035(ShiftKind::ShrS, OperandSize::S32) => asm::inst::sarl_mi::new(dst, imm).into(),1036(ShiftKind::ShrS, OperandSize::S64) => asm::inst::sarq_mi::new(dst, imm).into(),1037(ShiftKind::ShrS, _) => todo!(),1038(ShiftKind::ShrU, OperandSize::S32) => asm::inst::shrl_mi::new(dst, imm).into(),1039(ShiftKind::ShrU, OperandSize::S64) => asm::inst::shrq_mi::new(dst, imm).into(),1040(ShiftKind::ShrU, _) => todo!(),1041(ShiftKind::Rotl, OperandSize::S32) => asm::inst::roll_mi::new(dst, imm).into(),1042(ShiftKind::Rotl, OperandSize::S64) => asm::inst::rolq_mi::new(dst, imm).into(),1043(ShiftKind::Rotl, _) => todo!(),1044(ShiftKind::Rotr, OperandSize::S32) => asm::inst::rorl_mi::new(dst, imm).into(),1045(ShiftKind::Rotr, OperandSize::S64) => asm::inst::rorq_mi::new(dst, imm).into(),1046(ShiftKind::Rotr, _) => todo!(),1047};1048self.emit(Inst::External { inst 
        });
    }

    /// Signed/unsigned division.
    ///
    /// Emits a sequence of instructions to ensure the correctness of
    /// the division invariants. This function assumes that the
    /// caller has correctly allocated the dividend as `(rdx:rax)` and
    /// accounted for the quotient to be stored in `rax`.
    pub fn div(&mut self, divisor: Reg, dst: (Reg, Reg), kind: DivKind, size: OperandSize) {
        let trap = match kind {
            // Signed division has two trapping conditions, integer overflow and
            // divide-by-zero. Check for divide-by-zero explicitly and let the
            // hardware detect overflow.
            DivKind::Signed => {
                self.cmp_ir(divisor, 0, size);
                self.emit(Inst::TrapIf {
                    cc: CC::Z,
                    trap_code: TrapCode::INTEGER_DIVISION_BY_ZERO,
                });

                // Sign-extend the dividend with tailor-made instructions for
                // just this operation.
                let ext_dst: WritableGpr = dst.1.into();
                let ext_src: Gpr = dst.0.into();
                let inst = match size {
                    OperandSize::S32 => asm::inst::cltd_zo::new(ext_dst, ext_src).into(),
                    OperandSize::S64 => asm::inst::cqto_zo::new(ext_dst, ext_src).into(),
                    _ => unimplemented!(),
                };
                self.emit(Inst::External { inst });
                TrapCode::INTEGER_OVERFLOW
            }

            // Unsigned division only traps in one case, on divide-by-zero, so
            // defer that to the trap opcode.
            //
            // The divisor_hi reg is initialized with zero through an
            // xor-against-itself op.
            DivKind::Unsigned => {
                self.xor_rr(dst.1, writable!(dst.1), size);
                TrapCode::INTEGER_DIVISION_BY_ZERO
            }
        };
        let dst0 = pair_gpr(writable!(dst.0));
        let dst1 = pair_gpr(writable!(dst.1));
        let inst = match (kind, size) {
            (DivKind::Signed, OperandSize::S32) => {
                asm::inst::idivl_m::new(dst0, dst1, divisor, trap).into()
            }
            (DivKind::Unsigned, OperandSize::S32) => {
                asm::inst::divl_m::new(dst0, dst1, divisor, trap).into()
            }
            (DivKind::Signed, OperandSize::S64) => {
                asm::inst::idivq_m::new(dst0, dst1, divisor, trap).into()
            }
            (DivKind::Unsigned, OperandSize::S64) => {
                asm::inst::divq_m::new(dst0, dst1, divisor, trap).into()
            }
            _ => todo!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Signed/unsigned remainder.
    ///
    /// Emits a sequence of instructions to ensure the correctness of the
    /// division invariants and ultimately calculate the remainder.
    /// This function assumes that the
    /// caller has correctly allocated the dividend as `(rdx:rax)` and
    /// accounted for the remainder to be stored in `rdx`.
    pub fn rem(&mut self, divisor: Reg, dst: (Reg, Reg), kind: RemKind, size: OperandSize) {
        match kind {
            // Signed remainder goes through a pseudo-instruction which has
            // some internal branching. 
The `dividend_hi`, or `rdx`, is1123// initialized here with a `SignExtendData` instruction.1124RemKind::Signed => {1125let ext_dst: WritableGpr = dst.1.into();11261127// Initialize `dividend_hi`, or `rdx`, with a tailor-made1128// instruction for this operation.1129let ext_src: Gpr = dst.0.into();1130let inst = match size {1131OperandSize::S32 => asm::inst::cltd_zo::new(ext_dst, ext_src).into(),1132OperandSize::S64 => asm::inst::cqto_zo::new(ext_dst, ext_src).into(),1133_ => unimplemented!(),1134};1135self.emit(Inst::External { inst });1136self.emit(Inst::CheckedSRemSeq {1137size: size.into(),1138divisor: divisor.into(),1139dividend_lo: dst.0.into(),1140dividend_hi: dst.1.into(),1141dst_quotient: dst.0.into(),1142dst_remainder: dst.1.into(),1143});1144}11451146// Unsigned remainder initializes `dividend_hi` with zero and1147// then executes a normal `div` instruction.1148RemKind::Unsigned => {1149self.xor_rr(dst.1, writable!(dst.1), size);1150let dst0 = pair_gpr(writable!(dst.0));1151let dst1 = pair_gpr(writable!(dst.1));1152let trap = TrapCode::INTEGER_DIVISION_BY_ZERO;1153let inst = match size {1154OperandSize::S32 => asm::inst::divl_m::new(dst0, dst1, divisor, trap).into(),1155OperandSize::S64 => asm::inst::divq_m::new(dst0, dst1, divisor, trap).into(),1156_ => todo!(),1157};1158self.emit(Inst::External { inst });1159}1160}1161}11621163/// Multiply immediate and register.1164pub fn mul_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {1165use OperandSize::*;1166let src = dst.to_reg();1167let dst: WritableGpr = dst.to_reg().into();1168let inst = match size {1169S16 => asm::inst::imulw_rmi::new(dst, src, u16::try_from(imm).unwrap()).into(),1170S32 => asm::inst::imull_rmi::new(dst, src, imm as u32).into(),1171S64 => asm::inst::imulq_rmi_sxl::new(dst, src, imm).into(),1172S8 | S128 => unimplemented!(),1173};1174self.emit(Inst::External { inst });1175}11761177/// Multiply register and register.1178pub fn mul_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1179use OperandSize::*;1180let dst = pair_gpr(dst);1181let inst = match size {1182S16 => asm::inst::imulw_rm::new(dst, src).into(),1183S32 => asm::inst::imull_rm::new(dst, src).into(),1184S64 => asm::inst::imulq_rm::new(dst, src).into(),1185S8 | S128 => unimplemented!(),1186};1187self.emit(Inst::External { inst });1188}11891190/// Add immediate and register.1191pub fn add_ir(&mut self, imm: i32, dst: WritableReg, size: OperandSize) {1192let dst = pair_gpr(dst);1193let inst = match size {1194OperandSize::S8 => asm::inst::addb_mi::new(dst, u8::try_from(imm).unwrap()).into(),1195OperandSize::S16 => asm::inst::addw_mi::new(dst, u16::try_from(imm).unwrap()).into(),1196OperandSize::S32 => asm::inst::addl_mi::new(dst, imm as u32).into(),1197OperandSize::S64 => asm::inst::addq_mi_sxl::new(dst, imm).into(),1198OperandSize::S128 => unimplemented!(),1199};1200self.emit(Inst::External { inst });1201}12021203/// Add register and register.1204pub fn add_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1205let dst = pair_gpr(dst);1206let inst = match size {1207OperandSize::S8 => asm::inst::addb_rm::new(dst, src).into(),1208OperandSize::S16 => asm::inst::addw_rm::new(dst, src).into(),1209OperandSize::S32 => asm::inst::addl_rm::new(dst, src).into(),1210OperandSize::S64 => asm::inst::addq_rm::new(dst, src).into(),1211OperandSize::S128 => unimplemented!(),1212};1213self.emit(Inst::External { inst });1214}12151216pub fn lock_xadd(1217&mut self,1218addr: Address,1219dst: WritableReg,1220size: OperandSize,1221flags: 
MemFlags,1222) {1223assert!(addr.is_offset());1224let mem = Self::to_synthetic_amode(&addr, flags);1225let dst = pair_gpr(dst);1226let inst = match size {1227OperandSize::S8 => asm::inst::lock_xaddb_mr::new(mem, dst).into(),1228OperandSize::S16 => asm::inst::lock_xaddw_mr::new(mem, dst).into(),1229OperandSize::S32 => asm::inst::lock_xaddl_mr::new(mem, dst).into(),1230OperandSize::S64 => asm::inst::lock_xaddq_mr::new(mem, dst).into(),1231OperandSize::S128 => unimplemented!(),1232};12331234self.emit(Inst::External { inst });1235}12361237pub fn atomic_rmw_seq(1238&mut self,1239addr: Address,1240operand: Reg,1241dst: WritableReg,1242temp: WritableReg,1243size: OperandSize,1244flags: MemFlags,1245op: AtomicRmwSeqOp,1246) {1247assert!(addr.is_offset());1248let mem = Self::to_synthetic_amode(&addr, flags);1249self.emit(Inst::AtomicRmwSeq {1250ty: Type::int_with_byte_size(size.bytes() as _).unwrap(),1251mem,1252operand: operand.into(),1253temp: temp.map(Into::into),1254dst_old: dst.map(Into::into),1255op,1256});1257}12581259pub fn xchg(&mut self, addr: Address, dst: WritableReg, size: OperandSize, flags: MemFlags) {1260assert!(addr.is_offset());1261let mem = Self::to_synthetic_amode(&addr, flags);1262let dst = pair_gpr(dst);1263let inst = match size {1264OperandSize::S8 => asm::inst::xchgb_rm::new(dst, mem).into(),1265OperandSize::S16 => asm::inst::xchgw_rm::new(dst, mem).into(),1266OperandSize::S32 => asm::inst::xchgl_rm::new(dst, mem).into(),1267OperandSize::S64 => asm::inst::xchgq_rm::new(dst, mem).into(),1268OperandSize::S128 => unimplemented!(),1269};12701271self.emit(Inst::External { inst });1272}1273pub fn cmpxchg(1274&mut self,1275addr: Address,1276replacement: Reg,1277dst: WritableReg,1278size: OperandSize,1279flags: MemFlags,1280) {1281assert!(addr.is_offset());1282let mem = Self::to_synthetic_amode(&addr, flags);1283let dst = pair_gpr(dst);1284let inst = match size {1285OperandSize::S8 => asm::inst::lock_cmpxchgb_mr::new(mem, replacement, dst).into(),1286OperandSize::S16 => asm::inst::lock_cmpxchgw_mr::new(mem, replacement, dst).into(),1287OperandSize::S32 => asm::inst::lock_cmpxchgl_mr::new(mem, replacement, dst).into(),1288OperandSize::S64 => asm::inst::lock_cmpxchgq_mr::new(mem, replacement, dst).into(),1289OperandSize::S128 => unimplemented!(),1290};12911292self.emit(Inst::External { inst });1293}12941295pub fn cmp_ir(&mut self, src1: Reg, imm: i32, size: OperandSize) {1296let inst = match size {1297OperandSize::S8 => {1298let imm = i8::try_from(imm).unwrap();1299asm::inst::cmpb_mi::new(src1, imm.cast_unsigned()).into()1300}1301OperandSize::S16 => match i8::try_from(imm) {1302Ok(imm8) => asm::inst::cmpw_mi_sxb::new(src1, imm8).into(),1303Err(_) => {1304asm::inst::cmpw_mi::new(src1, i16::try_from(imm).unwrap().cast_unsigned())1305.into()1306}1307},1308OperandSize::S32 => match i8::try_from(imm) {1309Ok(imm8) => asm::inst::cmpl_mi_sxb::new(src1, imm8).into(),1310Err(_) => asm::inst::cmpl_mi::new(src1, imm.cast_unsigned()).into(),1311},1312OperandSize::S64 => match i8::try_from(imm) {1313Ok(imm8) => asm::inst::cmpq_mi_sxb::new(src1, imm8).into(),1314Err(_) => asm::inst::cmpq_mi::new(src1, imm).into(),1315},1316OperandSize::S128 => unimplemented!(),1317};13181319self.emit(Inst::External { inst });1320}13211322pub fn cmp_rr(&mut self, src1: Reg, src2: Reg, size: OperandSize) {1323let inst = match size {1324OperandSize::S8 => asm::inst::cmpb_rm::new(src1, src2).into(),1325OperandSize::S16 => asm::inst::cmpw_rm::new(src1, src2).into(),1326OperandSize::S32 => asm::inst::cmpl_rm::new(src1, 
src2).into(),1327OperandSize::S64 => asm::inst::cmpq_rm::new(src1, src2).into(),1328OperandSize::S128 => unimplemented!(),1329};13301331self.emit(Inst::External { inst });1332}13331334/// Compares values in src1 and src2 and sets ZF, PF, and CF flags in EFLAGS1335/// register.1336pub fn ucomis(&mut self, src1: Reg, src2: Reg, size: OperandSize) {1337let inst = match size {1338OperandSize::S32 => asm::inst::ucomiss_a::new(src1, src2).into(),1339OperandSize::S64 => asm::inst::ucomisd_a::new(src1, src2).into(),1340OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1341};1342self.emit(Inst::External { inst });1343}13441345pub fn popcnt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1346assert!(1347self.isa_flags.has_popcnt() && self.isa_flags.has_sse42(),1348"Requires has_popcnt and has_sse42 flags"1349);1350let dst = WritableGpr::from_reg(dst.to_reg().into());1351let inst = match size {1352OperandSize::S16 => asm::inst::popcntw_rm::new(dst, src).into(),1353OperandSize::S32 => asm::inst::popcntl_rm::new(dst, src).into(),1354OperandSize::S64 => asm::inst::popcntq_rm::new(dst, src).into(),1355OperandSize::S8 | OperandSize::S128 => unreachable!(),1356};1357self.emit(Inst::External { inst });1358}13591360/// Emit a test instruction with two register operands.1361pub fn test_rr(&mut self, src1: Reg, src2: Reg, size: OperandSize) {1362let inst = match size {1363OperandSize::S8 => asm::inst::testb_mr::new(src1, src2).into(),1364OperandSize::S16 => asm::inst::testw_mr::new(src1, src2).into(),1365OperandSize::S32 => asm::inst::testl_mr::new(src1, src2).into(),1366OperandSize::S64 => asm::inst::testq_mr::new(src1, src2).into(),1367OperandSize::S128 => unimplemented!(),1368};13691370self.emit(Inst::External { inst });1371}13721373/// Set value in dst to `0` or `1` based on flags in status register and1374/// [`CmpKind`].1375pub fn setcc(&mut self, kind: IntCmpKind, dst: WritableReg) {1376self.setcc_impl(kind.into(), dst);1377}13781379/// Set value in dst to `1` if parity flag in status register is set, `0`1380/// otherwise.1381pub fn setp(&mut self, dst: WritableReg) {1382self.setcc_impl(CC::P, dst);1383}13841385/// Set value in dst to `1` if parity flag in status register is not set,1386/// `0` otherwise.1387pub fn setnp(&mut self, dst: WritableReg) {1388self.setcc_impl(CC::NP, dst);1389}13901391fn setcc_impl(&mut self, cc: CC, dst: WritableReg) {1392// Clear the dst register or bits 1 to 31 may be incorrectly set.1393// Don't use xor since it updates the status register.1394let dst: WritableGpr = dst.map(Into::into);1395let inst = asm::inst::movl_oi::new(dst, 0).into();1396self.emit(Inst::External { inst });13971398// Copy correct bit from status register into dst register.1399//1400// Note that some of these mnemonics don't match exactly and that's1401// intentional as there are multiple mnemonics for the same encoding in1402// some cases and the assembler picked ones that match Capstone rather1403// than Cranelift.1404let inst = match cc {1405CC::O => asm::inst::seto_m::new(dst).into(),1406CC::NO => asm::inst::setno_m::new(dst).into(),1407CC::B => asm::inst::setb_m::new(dst).into(),1408CC::NB => asm::inst::setae_m::new(dst).into(), // nb == ae1409CC::Z => asm::inst::sete_m::new(dst).into(), // z == e1410CC::NZ => asm::inst::setne_m::new(dst).into(), // nz == ne1411CC::BE => asm::inst::setbe_m::new(dst).into(),1412CC::NBE => asm::inst::seta_m::new(dst).into(), // nbe == a1413CC::S => asm::inst::sets_m::new(dst).into(),1414CC::NS => 
asm::inst::setns_m::new(dst).into(),1415CC::L => asm::inst::setl_m::new(dst).into(),1416CC::NL => asm::inst::setge_m::new(dst).into(), // nl == ge1417CC::LE => asm::inst::setle_m::new(dst).into(),1418CC::NLE => asm::inst::setg_m::new(dst).into(), // nle == g1419CC::P => asm::inst::setp_m::new(dst).into(),1420CC::NP => asm::inst::setnp_m::new(dst).into(),1421};1422self.emit(Inst::External { inst });1423}14241425/// Store the count of leading zeroes in src in dst.1426/// Requires `has_lzcnt` flag.1427pub fn lzcnt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1428assert!(self.isa_flags.has_lzcnt(), "Requires has_lzcnt flag");1429let dst = WritableGpr::from_reg(dst.to_reg().into());1430let inst = match size {1431OperandSize::S16 => asm::inst::lzcntw_rm::new(dst, src).into(),1432OperandSize::S32 => asm::inst::lzcntl_rm::new(dst, src).into(),1433OperandSize::S64 => asm::inst::lzcntq_rm::new(dst, src).into(),1434OperandSize::S8 | OperandSize::S128 => unreachable!(),1435};1436self.emit(Inst::External { inst });1437}14381439/// Store the count of trailing zeroes in src in dst.1440/// Requires `has_bmi1` flag.1441pub fn tzcnt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1442assert!(self.isa_flags.has_bmi1(), "Requires has_bmi1 flag");1443let dst = WritableGpr::from_reg(dst.to_reg().into());1444let inst = match size {1445OperandSize::S16 => asm::inst::tzcntw_a::new(dst, src).into(),1446OperandSize::S32 => asm::inst::tzcntl_a::new(dst, src).into(),1447OperandSize::S64 => asm::inst::tzcntq_a::new(dst, src).into(),1448OperandSize::S8 | OperandSize::S128 => unreachable!(),1449};1450self.emit(Inst::External { inst });1451}14521453/// Stores position of the most significant bit set in src in dst.1454/// Zero flag is set if src is equal to 0.1455pub fn bsr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1456let dst: WritableGpr = WritableGpr::from_reg(dst.to_reg().into());1457let inst = match size {1458OperandSize::S16 => asm::inst::bsrw_rm::new(dst, src).into(),1459OperandSize::S32 => asm::inst::bsrl_rm::new(dst, src).into(),1460OperandSize::S64 => asm::inst::bsrq_rm::new(dst, src).into(),1461OperandSize::S8 | OperandSize::S128 => unreachable!(),1462};1463self.emit(Inst::External { inst });1464}14651466/// Performs integer negation on `src` and places result in `dst`.1467pub fn neg(&mut self, read: Reg, write: WritableReg, size: OperandSize) {1468let gpr = PairedGpr {1469read: read.into(),1470write: WritableGpr::from_reg(write.to_reg().into()),1471};1472let inst = match size {1473OperandSize::S8 => asm::inst::negb_m::new(gpr).into(),1474OperandSize::S16 => asm::inst::negw_m::new(gpr).into(),1475OperandSize::S32 => asm::inst::negl_m::new(gpr).into(),1476OperandSize::S64 => asm::inst::negq_m::new(gpr).into(),1477OperandSize::S128 => unreachable!(),1478};1479self.emit(Inst::External { inst });1480}14811482/// Stores position of the least significant bit set in src in dst.1483/// Zero flag is set if src is equal to 0.1484pub fn bsf(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1485let dst: WritableGpr = WritableGpr::from_reg(dst.to_reg().into());1486let inst = match size {1487OperandSize::S16 => asm::inst::bsfw_rm::new(dst, src).into(),1488OperandSize::S32 => asm::inst::bsfl_rm::new(dst, src).into(),1489OperandSize::S64 => asm::inst::bsfq_rm::new(dst, src).into(),1490OperandSize::S8 | OperandSize::S128 => unreachable!(),1491};1492self.emit(Inst::External { inst });1493}14941495/// Performs float addition on src and dst and places result in dst.1496pub fn 
xmm_add_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1497let dst = pair_xmm(dst);1498let inst = match size {1499OperandSize::S32 => asm::inst::addss_a::new(dst, src).into(),1500OperandSize::S64 => asm::inst::addsd_a::new(dst, src).into(),1501OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1502};1503self.emit(Inst::External { inst });1504}15051506/// Performs float subtraction on src and dst and places result in dst.1507pub fn xmm_sub_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1508let dst = pair_xmm(dst);1509let inst = match size {1510OperandSize::S32 => asm::inst::subss_a::new(dst, src).into(),1511OperandSize::S64 => asm::inst::subsd_a::new(dst, src).into(),1512OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1513};1514self.emit(Inst::External { inst });1515}15161517/// Performs float multiplication on src and dst and places result in dst.1518pub fn xmm_mul_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1519use OperandSize::*;1520let dst = pair_xmm(dst);1521let inst = match size {1522S32 => asm::inst::mulss_a::new(dst, src).into(),1523S64 => asm::inst::mulsd_a::new(dst, src).into(),1524S8 | S16 | S128 => unreachable!(),1525};1526self.emit(Inst::External { inst });1527}15281529/// Performs float division on src and dst and places result in dst.1530pub fn xmm_div_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1531let dst = pair_xmm(dst);1532let inst = match size {1533OperandSize::S32 => asm::inst::divss_a::new(dst, src).into(),1534OperandSize::S64 => asm::inst::divsd_a::new(dst, src).into(),1535OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1536};1537self.emit(Inst::External { inst });1538}15391540/// Minimum for src and dst XMM registers with results put in dst.1541pub fn xmm_min_seq(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1542self.emit(Inst::XmmMinMaxSeq {1543size: size.into(),1544is_min: true,1545lhs: src.into(),1546rhs: dst.to_reg().into(),1547dst: dst.map(Into::into),1548});1549}15501551/// Maximum for src and dst XMM registers with results put in dst.1552pub fn xmm_max_seq(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1553self.emit(Inst::XmmMinMaxSeq {1554size: size.into(),1555is_min: false,1556lhs: src.into(),1557rhs: dst.to_reg().into(),1558dst: dst.map(Into::into),1559});1560}15611562/// Perform rounding operation on float register src and place results in1563/// float register dst.1564pub fn xmm_rounds_rr(1565&mut self,1566src: Reg,1567dst: WritableReg,1568mode: RoundingMode,1569size: OperandSize,1570) {1571let dst = dst.map(|r| r.into());15721573let imm: u8 = match mode {1574RoundingMode::Nearest => 0x00,1575RoundingMode::Down => 0x01,1576RoundingMode::Up => 0x02,1577RoundingMode::Zero => 0x03,1578};15791580let inst = match size {1581OperandSize::S32 => asm::inst::roundss_rmi::new(dst, src, imm).into(),1582OperandSize::S64 => asm::inst::roundsd_rmi::new(dst, src, imm).into(),1583OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => unreachable!(),1584};15851586self.emit(Inst::External { inst });1587}15881589pub fn sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {1590use OperandSize::*;1591let dst = pair_xmm(dst);1592let inst = match size {1593S32 => asm::inst::sqrtss_a::new(dst, src).into(),1594S64 => asm::inst::sqrtsd_a::new(dst, src).into(),1595S8 | S16 | S128 => unimplemented!(),1596};1597self.emit(Inst::External { inst });1598}15991600/// Emit a call to an unknown location through a 

    /// Emit a call to an unknown location through a register.
    pub fn call_with_reg(&mut self, cc: CallingConvention, callee: Reg) {
        self.emit(Inst::CallUnknown {
            info: Box::new(CallInfo::empty(RegMem::reg(callee.into()), cc.into())),
        });
    }

    /// Emit a call to a locally defined function through an index.
    pub fn call_with_name(&mut self, cc: CallingConvention, name: UserExternalNameRef) {
        self.emit(Inst::CallKnown {
            info: Box::new(CallInfo::empty(ExternalName::user(name), cc.into())),
        });
    }

    /// Emits a conditional jump to the given label.
    pub fn jmp_if(&mut self, cc: impl Into<CC>, taken: MachLabel) {
        self.emit(Inst::WinchJmpIf {
            cc: cc.into(),
            taken,
        });
    }

    /// Performs an unconditional jump to the given label.
    pub fn jmp(&mut self, target: MachLabel) {
        self.emit(Inst::JmpKnown { dst: target });
    }

    /// Emits a jump table sequence.
    pub fn jmp_table(
        &mut self,
        targets: SmallVec<[MachLabel; 4]>,
        default: MachLabel,
        index: Reg,
        tmp1: Reg,
        tmp2: Reg,
    ) {
        self.emit(Inst::JmpTableSeq {
            idx: index.into(),
            tmp1: Writable::from_reg(tmp1.into()),
            tmp2: Writable::from_reg(tmp2.into()),
            default_target: default,
            targets: Box::new(targets.to_vec()),
        })
    }

    /// Emit a trap instruction.
    pub fn trap(&mut self, code: TrapCode) {
        let inst = asm::inst::ud2_zo::new(code).into();
        self.emit(Inst::External { inst });
    }

    /// Conditional trap.
    pub fn trapif(&mut self, cc: impl Into<CC>, trap_code: TrapCode) {
        self.emit(Inst::TrapIf {
            cc: cc.into(),
            trap_code,
        });
    }

    /// Load effective address.
    pub fn lea(&mut self, addr: &Address, dst: WritableReg, size: OperandSize) {
        let addr = Self::to_synthetic_amode(addr, MemFlags::trusted());
        let dst: WritableGpr = dst.map(Into::into);
        let inst = match size {
            OperandSize::S16 => asm::inst::leaw_rm::new(dst, addr).into(),
            OperandSize::S32 => asm::inst::leal_rm::new(dst, addr).into(),
            OperandSize::S64 => asm::inst::leaq_rm::new(dst, addr).into(),
            OperandSize::S8 | OperandSize::S128 => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Add with carry: computes `dst + src + CF` and places the result in `dst`.
    pub fn adc_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst = pair_gpr(dst);
        let inst = match size {
            OperandSize::S8 => asm::inst::adcb_rm::new(dst, src).into(),
            OperandSize::S16 => asm::inst::adcw_rm::new(dst, src).into(),
            OperandSize::S32 => asm::inst::adcl_rm::new(dst, src).into(),
            OperandSize::S64 => asm::inst::adcq_rm::new(dst, src).into(),
            OperandSize::S128 => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Subtract with borrow: computes `dst - (src + CF)` and places the
    /// result in `dst`.
    pub fn sbb_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst = pair_gpr(dst);
        let inst = match size {
            OperandSize::S8 => asm::inst::sbbb_rm::new(dst, src).into(),
            OperandSize::S16 => asm::inst::sbbw_rm::new(dst, src).into(),
            OperandSize::S32 => asm::inst::sbbl_rm::new(dst, src).into(),
            OperandSize::S64 => asm::inst::sbbq_rm::new(dst, src).into(),
            OperandSize::S128 => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Widening multiply: multiplies `lhs` by `rhs`, placing the low half of
    /// the result in `dst_lo` and the high half in `dst_hi`.
    pub fn mul_wide(
        &mut self,
        dst_lo: WritableReg,
        dst_hi: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: MulWideKind,
        size: OperandSize,
    ) {
        use MulWideKind::*;
        use OperandSize::*;
        let rax = asm::Fixed(PairedGpr {
            read: lhs.into(),
            write: WritableGpr::from_reg(dst_lo.to_reg().into()),
        });
        let rdx = asm::Fixed(dst_hi.to_reg().into());
        if size == S8 {
            // For `mulb` and `imulb`, both the high and low halves of the
            // result are written to RAX.
            assert_eq!(dst_lo, dst_hi);
        }
        let inst = match (size, kind) {
            (S8, Unsigned) => asm::inst::mulb_m::new(rax, rhs).into(),
            (S8, Signed) => asm::inst::imulb_m::new(rax, rhs).into(),
            (S16, Unsigned) => asm::inst::mulw_m::new(rax, rdx, rhs).into(),
            (S16, Signed) => asm::inst::imulw_m::new(rax, rdx, rhs).into(),
            (S32, Unsigned) => asm::inst::mull_m::new(rax, rdx, rhs).into(),
            (S32, Signed) => asm::inst::imull_m::new(rax, rdx, rhs).into(),
            (S64, Unsigned) => asm::inst::mulq_m::new(rax, rdx, rhs).into(),
            (S64, Signed) => asm::inst::imulq_m::new(rax, rdx, rhs).into(),
            (S128, _) => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }
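
    // Illustrative usage sketch (hypothetical values, not from the original
    // source): with `dst_lo`/`dst_hi` as writable handles for RAX/RDX and
    // `lhs`/`rhs` as the operand registers (with `lhs` allocated to RAX), a
    // full 64x64->128-bit unsigned multiply could be requested as:
    //
    //     asm.mul_wide(dst_lo, dst_hi, lhs, rhs, MulWideKind::Unsigned, OperandSize::S64);
    //
    // which emits `mul` with RAX/RDX as the implicit result pair.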

    /// Shuffles bytes in `src` according to contents of `mask` and puts
    /// result in `dst`.
    pub fn xmm_vpshufb_rrm(&mut self, dst: WritableReg, src: Reg, mask: &Address) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let mask = Self::to_synthetic_amode(mask, MemFlags::trusted());
        let inst = asm::inst::vpshufb_b::new(dst, src, mask).into();
        self.emit(Inst::External { inst });
    }

    /// Shuffles bytes in `src` according to contents of `mask` and puts
    /// result in `dst`.
    pub fn xmm_vpshufb_rrr(&mut self, dst: WritableReg, src: Reg, mask: Reg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpshufb_b::new(dst, src, mask).into();
        self.emit(Inst::External { inst });
    }

    /// Add unsigned integers with unsigned saturation.
    ///
    /// Adds the src operands, but when an individual lane result exceeds the
    /// maximum unsigned value for the lane width, the saturated maximum
    /// (e.g. 0xFF for bytes) is written instead.
    pub fn xmm_vpaddus_rrm(
        &mut self,
        dst: WritableReg,
        src1: Reg,
        src2: &Address,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let src2 = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpaddusb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpaddusw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Add unsigned integers with unsigned saturation.
    ///
    /// Adds the src operands, but when an individual lane result exceeds the
    /// maximum unsigned value for the lane width, the saturated maximum
    /// (e.g. 0xFF for bytes) is written instead.
    pub fn xmm_vpaddus_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpaddusb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpaddusw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Add signed integers with signed saturation.
    pub fn xmm_vpadds_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpaddsb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpaddsw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Adds vectors of integers in `src1` and the memory operand `src2` and
    /// puts the results in `dst`.
    pub fn xmm_vpadd_rmr(
        &mut self,
        src1: Reg,
        src2: &Address,
        dst: WritableReg,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpaddb_b::new(dst, src1, address).into(),
            OperandSize::S16 => asm::inst::vpaddw_b::new(dst, src1, address).into(),
            OperandSize::S32 => asm::inst::vpaddd_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Adds vectors of integers in `src1` and `src2` and puts the results in
    /// `dst`.
    pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpaddb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpaddw_b::new(dst, src1, src2).into(),
            OperandSize::S32 => asm::inst::vpaddd_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vpaddq_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Emit an `mfence` memory fence.
    pub fn mfence(&mut self) {
        self.emit(Inst::External {
            inst: asm::inst::mfence_zo::new().into(),
        });
    }

    /// Extract the value at `lane` from `src` into the memory at `addr`.
    pub(crate) fn xmm_vpextr_rm(
        &mut self,
        addr: &Address,
        src: Reg,
        lane: u8,
        size: OperandSize,
        flags: MemFlags,
    ) {
        assert!(addr.is_offset());
        let dst = Self::to_synthetic_amode(addr, flags);
        let inst = match size {
            OperandSize::S8 => asm::inst::vpextrb_a::new(dst, src, lane).into(),
            OperandSize::S16 => asm::inst::vpextrw_b::new(dst, src, lane).into(),
            OperandSize::S32 => asm::inst::vpextrd_a::new(dst, src, lane).into(),
            OperandSize::S64 => asm::inst::vpextrq_a::new(dst, src, lane).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Extract the value at `lane` from `src` into `dst` (zero extended).
    pub fn xmm_vpextr_rr(&mut self, dst: WritableReg, src: Reg, lane: u8, size: OperandSize) {
        let dst: WritableGpr = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpextrb_a::new(dst, src, lane).into(),
            OperandSize::S16 => asm::inst::vpextrw_a::new(dst, src, lane).into(),
            OperandSize::S32 => asm::inst::vpextrd_a::new(dst, src, lane).into(),
            OperandSize::S64 => asm::inst::vpextrq_a::new(dst, src, lane).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Copy a value from `src2`, merge it into `src1`, and put the result in
    /// `dst` at the lane specified by `count`.
    pub fn xmm_vpinsr_rrm(
        &mut self,
        dst: WritableReg,
        src1: Reg,
        src2: &Address,
        count: u8,
        size: OperandSize,
    ) {
        let src2 = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let dst: WritableXmm = dst.map(|r| r.into());

        let inst = match size {
            OperandSize::S8 => asm::inst::vpinsrb_b::new(dst, src1, src2, count).into(),
            OperandSize::S16 => asm::inst::vpinsrw_b::new(dst, src1, src2, count).into(),
            OperandSize::S32 => asm::inst::vpinsrd_b::new(dst, src1, src2, count).into(),
            OperandSize::S64 => asm::inst::vpinsrq_b::new(dst, src1, src2, count).into(),
            OperandSize::S128 => unreachable!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Copy a value from `src2`, merge it into `src1`, and put the result in
    /// `dst` at the lane specified by `count`.
    pub fn xmm_vpinsr_rrr(
        &mut self,
        dst: WritableReg,
        src1: Reg,
        src2: Reg,
        count: u8,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpinsrb_b::new(dst, src1, src2, count).into(),
            OperandSize::S16 => asm::inst::vpinsrw_b::new(dst, src1, src2, count).into(),
            OperandSize::S32 => asm::inst::vpinsrd_b::new(dst, src1, src2, count).into(),
            OperandSize::S64 => asm::inst::vpinsrq_b::new(dst, src1, src2, count).into(),
            OperandSize::S128 => unreachable!(),
        };
        self.emit(Inst::External { inst });
    }
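
    // Lane-numbering sketch for the extract/insert helpers above
    // (hypothetical values): if an XMM register `v` holds the i32x4 value
    // `[10, 20, 30, 40]`, then extracting with `lane = 2` and
    // `OperandSize::S32` yields `30`, and inserting a GPR value `g` with
    // `count = 1` produces `[10, g, 30, 40]`; lane 0 is the least
    // significant element.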

    /// Copy a 32-bit float from `address`, merge it into `src1`, and put the
    /// result in `dst`.
    pub fn xmm_vinsertps_rrm(&mut self, dst: WritableReg, src1: Reg, address: &Address, imm: u8) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(address, MemFlags::trusted());
        let inst = asm::inst::vinsertps_b::new(dst, src1, address, imm).into();
        self.emit(Inst::External { inst });
    }

    /// Copy a 32-bit float in `src2`, merge it into `src1`, and put the
    /// result in `dst`.
    pub fn xmm_vinsertps_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg, imm: u8) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vinsertps_b::new(dst, src1, src2, imm).into();
        self.emit(Inst::External { inst });
    }

    /// Moves the lower 64-bit float in `src2` into the lower 64 bits of `dst`
    /// and the upper 64 bits of `src1` into the upper 64 bits of `dst`.
    pub fn xmm_vmovsd_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vmovsd_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Moves a 64-bit float from `src` into the lower 64 bits of `dst`.
    /// Zeroes out the upper 64 bits of `dst`.
    pub fn xmm_vmovsd_rm(&mut self, dst: WritableReg, src: &Address) {
        let src = Self::to_synthetic_amode(src, MemFlags::trusted());
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vmovsd_d::new(dst, src).into();
        self.emit(Inst::External { inst });
    }

    /// Moves two 32-bit floats from `src2` to the upper 64 bits of `dst`.
    /// Copies two 32-bit floats from the lower 64 bits of `src1` to the
    /// lower 64 bits of `dst`.
    pub fn xmm_vmovlhps_rrm(&mut self, dst: WritableReg, src1: Reg, src2: &Address) {
        let src2 = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vmovhps_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Moves two 32-bit floats from the lower 64 bits of `src2` to the upper
    /// 64 bits of `dst`. Copies two 32-bit floats from the lower 64 bits of
    /// `src1` to the lower 64 bits of `dst`.
    pub fn xmm_vmovlhps_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vmovlhps_rvm::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Move unaligned packed integer values from address `src` to `dst`.
    pub fn xmm_vmovdqu_mr(&mut self, src: &Address, dst: WritableReg, flags: MemFlags) {
        let src = Self::to_synthetic_amode(src, flags);
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vmovdqu_a::new(dst, src).into();
        self.emit(Inst::External { inst });
    }

    /// Move integer from `src` to xmm register `dst` using an AVX instruction.
    pub fn avx_gpr_to_xmm(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vmovd_a::new(dst, src).into(),
            OperandSize::S64 => asm::inst::vmovq_a::new(dst, src).into(),
            _ => unreachable!(),
        };

        self.emit(Inst::External { inst });
    }

    /// Logically compares `src1` and `src2` (`vptest`), setting the CPU flags
    /// for a subsequent conditional instruction.
    pub fn xmm_vptest(&mut self, src1: Reg, src2: Reg) {
        let inst = asm::inst::vptest_rm::new(src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Converts the vector in `src` between integer and floating-point
    /// formats according to `kind` and puts the results in `dst`.
    pub fn xmm_vcvt_rr(&mut self, src: Reg, dst: WritableReg, kind: VcvtKind) {
        let dst: WritableXmm = dst.map(|x| x.into());
        let inst = match kind {
            VcvtKind::I32ToF32 => asm::inst::vcvtdq2ps_a::new(dst, src).into(),
            VcvtKind::I32ToF64 => asm::inst::vcvtdq2pd_a::new(dst, src).into(),
            VcvtKind::F64ToF32 => asm::inst::vcvtpd2ps_a::new(dst, src).into(),
            VcvtKind::F64ToI32 => asm::inst::vcvttpd2dq_a::new(dst, src).into(),
            VcvtKind::F32ToF64 => asm::inst::vcvtps2pd_a::new(dst, src).into(),
            VcvtKind::F32ToI32 => asm::inst::vcvttps2dq_a::new(dst, src).into(),
        };
        self.emit(Inst::External { inst });
    }

    /// Subtracts the floats in vector `src2` from the floats in vector `src1`
    /// and puts the results in `dst`.
    pub fn xmm_vsubp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vsubps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vsubpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Subtracts the integers in vector `src2` from the integers in vector
    /// `src1` and puts the results in `dst`.
    pub fn xmm_vpsub_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpsubb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpsubw_b::new(dst, src1, src2).into(),
            OperandSize::S32 => asm::inst::vpsubd_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vpsubq_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Subtract unsigned integers with unsigned saturation.
    pub fn xmm_vpsubus_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpsubusb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpsubusw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Subtract signed integers with signed saturation.
    pub fn xmm_vpsubs_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpsubsb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpsubsw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Add floats in vector `src1` to floats in the memory operand `src2` and
    /// put the results in `dst`.
    pub fn xmm_vaddp_rrm(
        &mut self,
        src1: Reg,
        src2: &Address,
        dst: WritableReg,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S32 => asm::inst::vaddps_b::new(dst, src1, address).into(),
            OperandSize::S64 => asm::inst::vaddpd_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Add floats in vector `src1` to floats in vector `src2` and put the
    /// results in `dst`.
    pub fn xmm_vaddp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vaddps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vaddpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Compare vector register `lhs` with a vector of integers in memory at
    /// `address` for equality between packed integers and write the resulting
    /// vector into `dst`.
    pub fn xmm_vpcmpeq_rrm(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        address: &Address,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(address, MemFlags::trusted());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpcmpeqb_b::new(dst, lhs, address).into(),
            OperandSize::S16 => asm::inst::vpcmpeqw_b::new(dst, lhs, address).into(),
            OperandSize::S32 => asm::inst::vpcmpeqd_b::new(dst, lhs, address).into(),
            OperandSize::S64 => asm::inst::vpcmpeqq_b::new(dst, lhs, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Compare vector registers `lhs` and `rhs` for equality between packed
    /// integers and write the resulting vector into `dst`.
    pub fn xmm_vpcmpeq_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpcmpeqb_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpcmpeqw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpcmpeqd_b::new(dst, lhs, rhs).into(),
            OperandSize::S64 => asm::inst::vpcmpeqq_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }
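
    // Note on the packed integer compares (`vpcmpeq*` above and `vpcmpgt*`
    // below): each lane of `dst` is written as an all-ones or all-zeros mask
    // rather than a boolean. For example (hypothetical values), comparing the
    // i32x4 vectors `[1, 2, 3, 4]` and `[1, 0, 3, 0]` for equality produces
    // `[0xFFFF_FFFF, 0, 0xFFFF_FFFF, 0]`.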

    /// Performs a greater than comparison with vectors of signed integers in
    /// `lhs` and `rhs` and puts the results in `dst`.
    pub fn xmm_vpcmpgt_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpcmpgtb_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpcmpgtw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpcmpgtd_b::new(dst, lhs, rhs).into(),
            OperandSize::S64 => asm::inst::vpcmpgtq_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a max operation with vectors of signed integers in `lhs` and
    /// `rhs` and puts the results in `dst`.
    pub fn xmm_vpmaxs_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpmaxsb_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpmaxsw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpmaxsd_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a max operation with vectors of unsigned integers in `lhs` and
    /// `rhs` and puts the results in `dst`.
    pub fn xmm_vpmaxu_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpmaxub_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpmaxuw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpmaxud_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a min operation with vectors of signed integers in `lhs` and
    /// `rhs` and puts the results in `dst`.
    pub fn xmm_vpmins_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpminsb_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpminsw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpminsd_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a min operation with vectors of unsigned integers in `lhs` and
    /// `rhs` and puts the results in `dst`.
    pub fn xmm_vpminu_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpminub_b::new(dst, lhs, rhs).into(),
            OperandSize::S16 => asm::inst::vpminuw_b::new(dst, lhs, rhs).into(),
            OperandSize::S32 => asm::inst::vpminud_b::new(dst, lhs, rhs).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a comparison operation between vectors of floats in `lhs` and
    /// `rhs` and puts the results in `dst`.
    pub fn xmm_vcmpp_rrr(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        size: OperandSize,
        kind: VcmpKind,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let imm = match kind {
            VcmpKind::Eq => 0,
            VcmpKind::Lt => 1,
            VcmpKind::Le => 2,
            VcmpKind::Unord => 3,
            VcmpKind::Ne => 4,
        };
        let inst = match size {
            OperandSize::S32 => asm::inst::vcmpps_b::new(dst, lhs, rhs, imm).into(),
            OperandSize::S64 => asm::inst::vcmppd_b::new(dst, lhs, rhs, imm).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a subtraction on two vectors of floats and puts the results in
    /// `dst`.
    pub fn xmm_vsub_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S64 => asm::inst::vsubpd_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a subtraction on two vectors of floats and puts the results in
    /// `dst`.
    pub fn xmm_vsub_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vsubps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vsubpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Converts a vector of signed integers into a vector of narrower integers
    /// using saturation to handle overflow.
    pub fn xmm_vpackss_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpacksswb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpackssdw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Converts a vector of unsigned integers into a vector of narrower
    /// integers using saturation to handle overflow.
    pub fn xmm_vpackus_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpackuswb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpackusdw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Concatenates `src1` and `src2`, shifts the composite right by `imm`
    /// bytes, and puts the result in `dst`.
    pub fn xmm_vpalignr_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, imm: u8) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpalignr_b::new(dst, src1, src2, imm).into();
        self.emit(Inst::External { inst });
    }

    /// Takes the lower lanes of vectors of floats in `src1` and `src2` and
    /// interleaves them in `dst`.
    pub fn xmm_vunpcklp_rrm(
        &mut self,
        src1: Reg,
        src2: &Address,
        dst: WritableReg,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S32 => asm::inst::vunpcklps_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Unpacks and interleaves high order data of floats in `src1` and `src2`
    /// and puts the results in `dst`.
    pub fn xmm_vunpckhp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vunpckhps_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }
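
    // Interleaving sketch for the unpack helpers above and below
    // (hypothetical values): for byte vectors `a = [a0, a1, ..., a15]` and
    // `b = [b0, b1, ..., b15]`, the "low" variants produce
    // `[a0, b0, a1, b1, ..., a7, b7]`, while the "high" variants do the same
    // with the upper eight lanes of each source.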

    /// Unpacks and interleaves the lower lanes of vectors of integers in `src1`
    /// and `src2` and puts the results in `dst`.
    pub fn xmm_vpunpckl_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpunpcklbw_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpunpcklwd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Unpacks and interleaves the higher lanes of vectors of integers in
    /// `src1` and `src2` and puts the results in `dst`.
    pub fn xmm_vpunpckh_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpunpckhbw_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpunpckhwd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Multiplies vectors of 64-bit integers in `src1` and `src2` and puts the
    /// low 64 bits of each product in `dst`.
    pub(crate) fn vpmullq(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpmullq_c::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Creates a mask made up of the most significant bit of each byte of
    /// `src` and stores the result in `dst`.
    pub fn xmm_vpmovmsk_rr(
        &mut self,
        src: Reg,
        dst: WritableReg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) {
        assert_eq!(dst_size, OperandSize::S32);
        let dst: WritableGpr = dst.map(|r| r.into());
        let inst = match src_size {
            OperandSize::S8 => asm::inst::vpmovmskb_rm::new(dst, src).into(),
            _ => unimplemented!(),
        };

        self.emit(Inst::External { inst });
    }

    /// Creates a mask made up of the most significant bit of each float lane
    /// in `src` and stores the result in `dst`.
    pub fn xmm_vmovskp_rr(
        &mut self,
        src: Reg,
        dst: WritableReg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) {
        assert_eq!(dst_size, OperandSize::S32);
        let dst: WritableGpr = dst.map(|r| r.into());
        let inst = match src_size {
            OperandSize::S32 => asm::inst::vmovmskps_rm::new(dst, src).into(),
            OperandSize::S64 => asm::inst::vmovmskpd_rm::new(dst, src).into(),
            _ => unimplemented!(),
        };

        self.emit(Inst::External { inst });
    }

    /// Compute the absolute value of elements in vector `src` and put the
    /// results in `dst`.
    pub fn xmm_vpabs_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpabsb_a::new(dst, src).into(),
            OperandSize::S16 => asm::inst::vpabsw_a::new(dst, src).into(),
            OperandSize::S32 => asm::inst::vpabsd_a::new(dst, src).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Arithmetically (sign preserving) right shift on vector in `src` by
    /// `amount` with result written to `dst`.
    pub fn xmm_vpsra_rrr(&mut self, src: Reg, amount: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S16 => asm::inst::vpsraw_c::new(dst, src, amount).into(),
            OperandSize::S32 => asm::inst::vpsrad_c::new(dst, src, amount).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Arithmetically (sign preserving) right shift on vector in `src` by
    /// `imm` with result written to `dst`.
    pub fn xmm_vpsra_rri(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let imm = u8::try_from(imm).expect("immediate must fit in 8 bits");
        let inst = match size {
            OperandSize::S32 => asm::inst::vpsrad_d::new(dst, src, imm).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Shift vector data left by `imm`.
    pub fn xmm_vpsll_rri(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let imm = u8::try_from(imm).expect("immediate must fit in 8 bits");
        let inst = match size {
            OperandSize::S32 => asm::inst::vpslld_d::new(dst, src, imm).into(),
            OperandSize::S64 => asm::inst::vpsllq_d::new(dst, src, imm).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Shift vector data left by `amount`.
    pub fn xmm_vpsll_rrr(&mut self, src: Reg, amount: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S16 => asm::inst::vpsllw_c::new(dst, src, amount).into(),
            OperandSize::S32 => asm::inst::vpslld_c::new(dst, src, amount).into(),
            OperandSize::S64 => asm::inst::vpsllq_c::new(dst, src, amount).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Shift vector data right by `imm`.
    pub fn xmm_vpsrl_rri(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let imm = u8::try_from(imm).expect("immediate must fit in 8 bits");
        let inst = match size {
            OperandSize::S16 => asm::inst::vpsrlw_d::new(dst, src, imm).into(),
            OperandSize::S32 => asm::inst::vpsrld_d::new(dst, src, imm).into(),
            OperandSize::S64 => asm::inst::vpsrlq_d::new(dst, src, imm).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Shift vector data right by `amount`.
    pub fn xmm_vpsrl_rrr(&mut self, src: Reg, amount: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S16 => asm::inst::vpsrlw_c::new(dst, src, amount).into(),
            OperandSize::S32 => asm::inst::vpsrld_c::new(dst, src, amount).into(),
            OperandSize::S64 => asm::inst::vpsrlq_c::new(dst, src, amount).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform an `and` operation on vectors of floats in `src1` and `src2`
    /// and put the results in `dst`.
    pub fn xmm_vandp_rrm(
        &mut self,
        src1: Reg,
        src2: &Address,
        dst: WritableReg,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S32 => asm::inst::vandps_b::new(dst, src1, address).into(),
            OperandSize::S64 => asm::inst::vandpd_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform an `and` operation on vectors of floats in `src1` and `src2`
    /// and put the results in `dst`.
    pub fn xmm_vandp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vandps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vandpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Performs a bitwise `and` operation on the vectors in `src1` and `src2`
    /// and stores the results in `dst`.
    pub fn xmm_vpand_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = asm::inst::vpand_b::new(dst, src1, address).into();
        self.emit(Inst::External { inst });
    }

    /// Performs a bitwise `and` operation on the vectors in `src1` and `src2`
    /// and stores the results in `dst`.
    pub fn xmm_vpand_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpand_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Perform an `and not` operation on vectors of floats in `src1` and
    /// `src2` and put the results in `dst`.
    pub fn xmm_vandnp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vandnps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vandnpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform an `and not` operation on vectors in `src1` and `src2` and put
    /// the results in `dst`.
    pub fn xmm_vpandn_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpandn_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Perform an `or` operation on the vectors of floats in `src1` and `src2`
    /// and put the results in `dst`.
    pub fn xmm_vorp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vorps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vorpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Bitwise OR of `src1` and `src2`.
    pub fn xmm_vpor_rrr(&mut self, dst: WritableReg, src1: Reg, src2: Reg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpor_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Bitwise logical xor of vectors of floats in `src1` and `src2` and puts
    /// the results in `dst`.
    pub fn xmm_vxorp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vxorps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vxorpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform a logical xor on the vector in `src` and the vector at
    /// `address` and put the results in `dst`.
    pub fn xmm_vpxor_rmr(&mut self, src: Reg, address: &Address, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(address, MemFlags::trusted());
        let inst = asm::inst::vpxor_b::new(dst, src, address).into();
        self.emit(Inst::External { inst });
    }

    /// Perform a logical xor on the vectors in `src1` and `src2` and put the
    /// results in `dst`.
    pub fn xmm_vpxor_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpxor_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Perform a max operation across two vectors of floats and put the
    /// results in `dst`.
    pub fn xmm_vmaxp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vmaxps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vmaxpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform a min operation across two vectors of floats and put the
    /// results in `dst`.
    pub fn xmm_vminp_rrm(
        &mut self,
        src1: Reg,
        src2: &Address,
        dst: WritableReg,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(src2, MemFlags::trusted());
        let inst = match size {
            OperandSize::S32 => asm::inst::vminps_b::new(dst, src1, address).into(),
            OperandSize::S64 => asm::inst::vminpd_b::new(dst, src1, address).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform a min operation across two vectors of floats and put the
    /// results in `dst`.
    pub fn xmm_vminp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vminps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vminpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Round a vector of floats.
    pub fn xmm_vroundp_rri(
        &mut self,
        src: Reg,
        dst: WritableReg,
        mode: VroundMode,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let imm = match mode {
            VroundMode::TowardNearest => 0,
            VroundMode::TowardNegativeInfinity => 1,
            VroundMode::TowardPositiveInfinity => 2,
            VroundMode::TowardZero => 3,
        };

        let inst = match size {
            OperandSize::S32 => asm::inst::vroundps_rmi::new(dst, src, imm).into(),
            OperandSize::S64 => asm::inst::vroundpd_rmi::new(dst, src, imm).into(),
            _ => unimplemented!(),
        };

        self.emit(Inst::External { inst });
    }

    /// Shuffle of vectors of floats.
    pub fn xmm_vshufp_rrri(
        &mut self,
        src1: Reg,
        src2: Reg,
        dst: WritableReg,
        imm: u8,
        size: OperandSize,
    ) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vshufps_b::new(dst, src1, src2, imm).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Each lane in `src1` is multiplied by the corresponding lane in `src2`,
    /// producing intermediate 32-bit operands. Each intermediate 32-bit
    /// operand is truncated to the 18 most significant bits. Rounding is
    /// performed by adding 1 to the least significant bit of the 18-bit
    /// intermediate result. The 16 bits immediately to the right of the most
    /// significant bit of each 18-bit intermediate result are placed in each
    /// lane of `dst`.
    pub fn xmm_vpmulhrs_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S16 => asm::inst::vpmulhrsw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }
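
    // Scalar model of the `vpmulhrsw` lane computation described above (an
    // illustrative sketch, not used by the assembler): for 16-bit lanes `a`
    // and `b`, the destination lane is
    //
    //     fn pmulhrsw_lane(a: i16, b: i16) -> i16 {
    //         let product = i32::from(a) * i32::from(b); // 32-bit intermediate
    //         (((product >> 14) + 1) >> 1) as i16        // round, keep middle 16 bits
    //     }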

    /// Multiplies the low signed 32-bit integers from each packed 64-bit lane
    /// in `src1` and `src2` and puts the 64-bit results in `dst`.
    pub fn xmm_vpmuldq_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpmuldq_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
    /// lane in `src1` and `src2` and puts the 64-bit results in `dst`.
    pub fn xmm_vpmuludq_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpmuludq_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Multiplies the integer lanes in `src1` and `src2` and puts the low half
    /// of each product in `dst`.
    pub fn xmm_vpmull_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S16 => asm::inst::vpmullw_b::new(dst, src1, src2).into(),
            OperandSize::S32 => asm::inst::vpmulld_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Multiplies the vectors of floats in `src1` and `src2` and puts the
    /// results in `dst`.
    pub fn xmm_vmulp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vmulps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vmulpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Perform an average operation on the vectors of unsigned integers in
    /// `src1` and `src2` and put the results in `dst`.
    pub fn xmm_vpavg_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S8 => asm::inst::vpavgb_b::new(dst, src1, src2).into(),
            OperandSize::S16 => asm::inst::vpavgw_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Divide the vector of floats in `src1` by the vector of floats in `src2`
    /// and put the results in `dst`.
    pub fn xmm_vdivp_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vdivps_b::new(dst, src1, src2).into(),
            OperandSize::S64 => asm::inst::vdivpd_b::new(dst, src1, src2).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Compute square roots of the vector of floats in `src` and put the
    /// results in `dst`.
    pub fn xmm_vsqrtp_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = match size {
            OperandSize::S32 => asm::inst::vsqrtps_b::new(dst, src).into(),
            OperandSize::S64 => asm::inst::vsqrtpd_b::new(dst, src).into(),
            _ => unimplemented!(),
        };
        self.emit(Inst::External { inst });
    }

    /// Multiply and add packed signed and unsigned bytes.
    pub fn xmm_vpmaddubsw_rmr(&mut self, src: Reg, address: &Address, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(address, MemFlags::trusted());
        let inst = asm::inst::vpmaddubsw_b::new(dst, src, address).into();
        self.emit(Inst::External { inst });
    }

    /// Multiply and add packed signed and unsigned bytes.
    pub fn xmm_vpmaddubsw_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpmaddubsw_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }

    /// Multiply and add packed integers.
    pub fn xmm_vpmaddwd_rmr(&mut self, src: Reg, address: &Address, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let address = Self::to_synthetic_amode(address, MemFlags::trusted());
        let inst = asm::inst::vpmaddwd_b::new(dst, src, address).into();
        self.emit(Inst::External { inst });
    }

    /// Multiply and add packed integers.
    pub fn xmm_vpmaddwd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg) {
        let dst: WritableXmm = dst.map(|r| r.into());
        let inst = asm::inst::vpmaddwd_b::new(dst, src1, src2).into();
        self.emit(Inst::External { inst });
    }
}

/// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted,
/// but the immediate is not yet known. Currently, this implementation expects a 32-bit immediate,
/// so 8- and 16-bit operand sizes are not supported.
pub(crate) struct PatchableAddToReg {
    /// The region to be patched in the [`MachBuffer`]. It must contain a valid add instruction
    /// sequence, accepting a 32-bit immediate.
    region: PatchRegion,

    /// The offset into the patchable region where the patchable constant begins.
    constant_offset: usize,
}

impl PatchableAddToReg {
    /// Create a new [`PatchableAddToReg`] by capturing a region in the output buffer where the
    /// add-with-immediate occurs. The [`MachBuffer`] will have an add-with-immediate instruction
    /// present in that region, though it will add `0` until the `::finalize` method is called.
    ///
    /// Currently this implementation expects to be able to patch a 32-bit immediate, which means
    /// that 8- and 16-bit addition cannot be supported.
    pub(crate) fn new(reg: Reg, size: OperandSize, asm: &mut Assembler) -> Self {
        let open = asm.buffer_mut().start_patchable();
        let start = asm.buffer().cur_offset();

        // Emit the opcode and register use for the add instruction.
        let reg = pair_gpr(Writable::from_reg(reg));
        let inst = match size {
            OperandSize::S32 => asm::inst::addl_mi::new(reg, 0_u32).into(),
            OperandSize::S64 => asm::inst::addq_mi_sxl::new(reg, 0_i32).into(),
            _ => {
                panic!(
                    "{}-bit addition is not supported, please see the comment on PatchableAddToReg::new",
                    size.num_bits(),
                )
            }
        };
        asm.emit(Inst::External { inst });

        // The offset to the constant is the width of what was just emitted
        // minus 4, the width of the 32-bit immediate.
        let constant_offset = usize::try_from(asm.buffer().cur_offset() - start - 4).unwrap();

        let region = asm.buffer_mut().end_patchable(open);

        Self {
            region,
            constant_offset,
        }
    }

    /// Patch the [`MachBuffer`] with the known constant to be added to the register. The final
    /// value is passed in as an i32, but the instruction encoding is fixed when
    /// [`PatchableAddToReg::new`] is called.
    pub(crate) fn finalize(self, val: i32, buffer: &mut MachBuffer<Inst>) {
        let slice = self.region.patch(buffer);
        debug_assert_eq!(slice.len(), self.constant_offset + 4);
        slice[self.constant_offset..].copy_from_slice(val.to_le_bytes().as_slice());
    }
}
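
// Typical usage of `PatchableAddToReg` (an illustrative sketch; `asm`, `reg`,
// and `final_value` are hypothetical):
//
//     // Reserve a patchable `add reg, imm32` while the value is still unknown.
//     let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut asm);
//     // ... emit more code; compute the value that should have been added ...
//     // Overwrite the placeholder immediate in place.
//     patch.finalize(final_value, asm.buffer_mut());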