CoCalc -- emit.rs

GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/cranelift/codegen/src/isa/x64/inst/emit.rs
¹⁶⁹³ views
1
use crate::ir::KnownSymbol;
2
use crate::ir::immediates::{Ieee32, Ieee64};
3
use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};
4
use crate::isa::x64::inst::args::*;
5
use crate::isa::x64::inst::*;
6
use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
7
use cranelift_assembler_x64 as asm;
8

9
/// A small helper to generate a signed conversion instruction.
10
fn emit_signed_cvt(
11
    sink: &mut MachBuffer<Inst>,
12
    info: &EmitInfo,
13
    state: &mut EmitState,
14
    src: Reg,
15
    dst: Writable<Reg>,
16
    to_f64: bool,
17
) {
18
    assert!(src.is_real());
19
    assert!(dst.to_reg().is_real());
20

21
    // Handle an unsigned int, which is the "easy" case: a signed conversion
22
    // will do the right thing.
23
    let dst = WritableXmm::from_writable_reg(dst).unwrap();
24
    if to_f64 {
25
        asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);
26
    } else {
27
        asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);
28
    }
29
}
30

31
/// Emits a one way conditional jump if CC is set (true).
32
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
33
    let cond_start = sink.cur_offset();
34
    let cond_disp_off = cond_start + 2;
35
    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
36
    emit_jcc_no_offset(sink, cc);
37
    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
38
}
39

40
/// Like `one_way_jmp` above emitting a conditional jump, but also using
41
/// `MachBuffer::add_cond_branch`.
42
fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
43
    let cond_start = sink.cur_offset();
44
    let cond_disp_off = cond_start + 2;
45
    let cond_end = cond_start + 6;
46

47
    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
48
    // FIXME: ideally this `inverted` calculation would go through the external
49
    // assembler, but for now it's left done manually.
50
    let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
51
    sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);
52

53
    emit_jcc_no_offset(sink, cc);
54

55
    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
56
    debug_assert_eq!(sink.cur_offset(), cond_end);
57
}
58

59
fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
60
    // Note that the disassembler matches Capstone which doesn't match the `CC`
61
    // enum directly as Intel has multiple mnemonics use the same encoding.
62
    let inst: AsmInst = match cc {
63
        CC::Z => asm::inst::je_d32::new(0).into(),   // jz == je
64
        CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne
65
        CC::B => asm::inst::jb_d32::new(0).into(),
66
        CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae
67
        CC::BE => asm::inst::jbe_d32::new(0).into(),
68
        CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja
69
        CC::L => asm::inst::jl_d32::new(0).into(),
70
        CC::LE => asm::inst::jle_d32::new(0).into(),
71
        CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge
72
        CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg
73
        CC::O => asm::inst::jo_d32::new(0).into(),
74
        CC::NO => asm::inst::jno_d32::new(0).into(),
75
        CC::P => asm::inst::jp_d32::new(0).into(),
76
        CC::NP => asm::inst::jnp_d32::new(0).into(),
77
        CC::S => asm::inst::js_d32::new(0).into(),
78
        CC::NS => asm::inst::jns_d32::new(0).into(),
79
    };
80
    inst.encode(&mut external::AsmCodeSink {
81
        sink,
82
        incoming_arg_offset: 0,
83
        slot_offset: 0,
84
    });
85
}
86

87
/// Emits an unconditional branch.
88
fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
89
    let uncond_start = sink.cur_offset();
90
    let uncond_disp_off = uncond_start + 1;
91
    let uncond_end = uncond_start + 5;
92

93
    sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);
94
    sink.add_uncond_branch(uncond_start, uncond_end, label);
95

96
    asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {
97
        sink,
98
        incoming_arg_offset: 0,
99
        slot_offset: 0,
100
    });
101
    debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);
102
    debug_assert_eq!(sink.cur_offset(), uncond_end);
103
}
104

105
/// Emits a relocation, attaching the current source location as well.
106
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
107
    sink.add_reloc(kind, name, addend);
108
}
109

110
/// The top-level emit function.
111
///
112
/// Important!  Do not add improved (shortened) encoding cases to existing
113
/// instructions without also adding tests for those improved encodings.  That
114
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
115
/// code.
116
///
117
/// For all instructions, make sure to have test coverage for all of the
118
/// following situations.  Do this by creating the cross product resulting from
119
/// applying the following rules to each operand:
120
///
121
/// (1) for any insn that mentions a register: one test using a register from
122
///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
123
///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
124
///     This helps detect incorrect REX prefix construction.
125
///
126
/// (2) for any insn that mentions a byte register: one test for each of the
127
///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
128
///     [r8b .. r11b] and [r12b .. r15b].  This checks that
129
///     apparently-redundant REX prefixes are retained when required.
130
///
131
/// (3) for any insn that contains an immediate field, check the following
132
///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
133
///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF).  This is because some
134
///     instructions that require a 32-bit immediate have a short-form encoding
135
///     when the imm is in simm8 range.
136
///
137
/// Rules (1), (2) and (3) don't apply for registers within address expressions
138
/// (`Addr`s).  Those are already pretty well tested, and the registers in them
139
/// don't have any effect on the containing instruction (apart from possibly
140
/// require REX prefix bits).
141
///
142
/// When choosing registers for a test, avoid using registers with the same
143
/// offset within a given group.  For example, don't use rax and r8, since they
144
/// both have the lowest 3 bits as 000, and so the test won't detect errors
145
/// where those 3-bit register sub-fields are confused by the emitter.  Instead
146
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001).  Similarly, don't use (eg) cl
147
/// and bpl since they have the same offset in their group; use instead (eg) cl
148
/// and sil.
149
///
150
/// For all instructions, also add a test that uses only low-half registers
151
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
152
/// prefixes are correctly omitted.  This low-half restriction must apply to
153
/// _all_ registers in the insn, even those in address expressions.
154
///
155
/// Following these rules creates large numbers of test cases, but it's the
156
/// only way to make the emitter reliable.
157
///
158
/// Known possible improvements:
159
///
160
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
161
///   care?)
162
pub(crate) fn emit(
163
    inst: &Inst,
164
    sink: &mut MachBuffer<Inst>,
165
    info: &EmitInfo,
166
    state: &mut EmitState,
167
) {
168
    if !inst.is_available(&info) {
169
        let features = if let Inst::External { inst } = inst {
170
            inst.features().to_string()
171
        } else {
172
            "see `is_available` source for feature term".to_string()
173
        };
174
        panic!(
175
            "Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"
176
        );
177
    }
178

179
    match inst {
180
        Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
181
            // Validate that the register constraints of the dividend and the
182
            // destination are all as expected.
183
            let (dst, size) = match inst {
184
                Inst::CheckedSRemSeq {
185
                    dividend_lo,
186
                    dividend_hi,
187
                    dst_quotient,
188
                    dst_remainder,
189
                    size,
190
                    ..
191
                } => {
192
                    let dividend_lo = dividend_lo.to_reg();
193
                    let dividend_hi = dividend_hi.to_reg();
194
                    let dst_quotient = dst_quotient.to_reg().to_reg();
195
                    let dst_remainder = dst_remainder.to_reg().to_reg();
196
                    debug_assert_eq!(dividend_lo, regs::rax());
197
                    debug_assert_eq!(dividend_hi, regs::rdx());
198
                    debug_assert_eq!(dst_quotient, regs::rax());
199
                    debug_assert_eq!(dst_remainder, regs::rdx());
200
                    (regs::rdx(), *size)
201
                }
202
                Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
203
                    let dividend = dividend.to_reg();
204
                    let dst = dst.to_reg().to_reg();
205
                    debug_assert_eq!(dividend, regs::rax());
206
                    debug_assert_eq!(dst, regs::rax());
207
                    (regs::rax(), OperandSize::Size8)
208
                }
209
                _ => unreachable!(),
210
            };
211

212
            // Generates the following code sequence:
213
            //
214
            // cmp -1 %divisor
215
            // jnz $do_op
216
            //
217
            // ;; for srem, result is 0
218
            // mov #0, %dst
219
            // j $done
220
            //
221
            // $do_op:
222
            // idiv %divisor
223
            //
224
            // $done:
225

226
            let do_op = sink.get_label();
227
            let done_label = sink.get_label();
228

229
            // Check if the divisor is -1, and if it isn't then immediately
230
            // go to the `idiv`.
231
            let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
232
            inst.emit(sink, info, state);
233
            one_way_jmp(sink, CC::NZ, do_op);
234

235
            // ... otherwise the divisor is -1 and the result is always 0. This
236
            // is written to the destination register which will be %rax for
237
            // 8-bit srem and %rdx otherwise.
238
            //
239
            // Note that for 16-to-64-bit srem operations this leaves the
240
            // second destination, %rax, unchanged. This isn't semantically
241
            // correct if a lowering actually tries to use the `dst_quotient`
242
            // output but for srem only the `dst_remainder` output is used for
243
            // now.
244
            let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
245
            inst.emit(sink, info, state);
246
            let inst = Inst::jmp_known(done_label);
247
            inst.emit(sink, info, state);
248

249
            // Here the `idiv` is executed, which is different depending on the
250
            // size
251
            sink.bind_label(do_op, state.ctrl_plane_mut());
252
            let rax = Gpr::RAX;
253
            let rdx = Gpr::RDX;
254
            let writable_rax = Writable::from_reg(rax);
255
            let writable_rdx = Writable::from_reg(rdx);
256
            let inst: AsmInst = match size {
257
                OperandSize::Size8 => asm::inst::idivb_m::new(
258
                    PairedGpr::from(writable_rax),
259
                    *divisor,
260
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
261
                )
262
                .into(),
263

264
                OperandSize::Size16 => asm::inst::idivw_m::new(
265
                    PairedGpr::from(writable_rax),
266
                    PairedGpr::from(writable_rdx),
267
                    *divisor,
268
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
269
                )
270
                .into(),
271

272
                OperandSize::Size32 => asm::inst::idivl_m::new(
273
                    PairedGpr::from(writable_rax),
274
                    PairedGpr::from(writable_rdx),
275
                    *divisor,
276
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
277
                )
278
                .into(),
279

280
                OperandSize::Size64 => asm::inst::idivq_m::new(
281
                    PairedGpr::from(writable_rax),
282
                    PairedGpr::from(writable_rdx),
283
                    *divisor,
284
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
285
                )
286
                .into(),
287
            };
288
            inst.emit(sink, info, state);
289

290
            sink.bind_label(done_label, state.ctrl_plane_mut());
291
        }
292

293
        Inst::MovFromPReg { src, dst } => {
294
            let src: Reg = (*src).into();
295
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
296
            asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);
297
        }
298

299
        Inst::MovToPReg { src, dst } => {
300
            let dst: Reg = (*dst).into();
301
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
302
            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
303
            asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);
304
        }
305

306
        Inst::XmmCmove {
307
            ty,
308
            cc,
309
            consequent,
310
            alternative,
311
            dst,
312
        } => {
313
            let alternative = *alternative;
314
            let dst = *dst;
315
            debug_assert_eq!(alternative, dst.to_reg());
316
            let consequent = *consequent;
317

318
            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
319
            // this doesn't clobber flags. Make sure to not do so here.
320
            let next = sink.get_label();
321

322
            // Jump if cc is *not* set.
323
            one_way_jmp(sink, cc.invert(), next);
324
            Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
325
                .emit(sink, info, state);
326

327
            sink.bind_label(next, state.ctrl_plane_mut());
328
        }
329

330
        Inst::StackProbeLoop {
331
            tmp,
332
            frame_size,
333
            guard_size,
334
        } => {
335
            assert!(info.flags.enable_probestack());
336
            assert!(guard_size.is_power_of_two());
337

338
            let tmp = *tmp;
339

340
            // Number of probes that we need to perform
341
            let probe_count = align_to(*frame_size, *guard_size) / guard_size;
342

343
            // The inline stack probe loop has 3 phases:
344
            //
345
            // We generate the "guard area" register which is essentially the frame_size aligned to
346
            // guard_size. We copy the stack pointer and subtract the guard area from it. This
347
            // gets us a register that we can use to compare when looping.
348
            //
349
            // After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
350
            // distance at a time and then touch the stack by writing anything to it. We use the previously
351
            // created "guard area" register to know when to stop looping.
352
            //
353
            // When we have touched all the pages that we need, we have to restore the stack pointer
354
            // to where it was before.
355
            //
356
            // Generate the following code:
357
            //         mov  tmp_reg, rsp
358
            //         sub  tmp_reg, guard_size * probe_count
359
            // .loop_start:
360
            //         sub  rsp, guard_size
361
            //         mov  [rsp], rsp
362
            //         cmp  rsp, tmp_reg
363
            //         jne  .loop_start
364
            //         add  rsp, guard_size * probe_count
365

366
            // Create the guard bound register
367
            // mov  tmp_reg, rsp
368
            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
369
            inst.emit(sink, info, state);
370

371
            // sub  tmp_reg, GUARD_SIZE * probe_count
372
            let guard_plus_count = i32::try_from(guard_size * probe_count)
373
                .expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");
374
            Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);
375

376
            // Emit the main loop!
377
            let loop_start = sink.get_label();
378
            sink.bind_label(loop_start, state.ctrl_plane_mut());
379

380
            // sub  rsp, GUARD_SIZE
381
            let rsp = Writable::from_reg(regs::rsp());
382
            let guard_size_ = i32::try_from(*guard_size)
383
                .expect("`guard_size` is too large to fit in a 32-bit immediate");
384
            Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);
385

386
            // TODO: `mov [rsp], 0` would be better, but we don't have that instruction
387
            // Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
388
            // instruction size.
389
            // mov  [rsp], rsp
390
            asm::inst::movl_mr::new(Amode::imm_reg(0, regs::rsp()), Gpr::RSP)
391
                .emit(sink, info, state);
392

393
            // Compare and jump if we are not done yet
394
            // cmp  rsp, tmp_reg
395
            let tmp = Gpr::unwrap_new(tmp.to_reg());
396
            asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);
397

398
            // jne  .loop_start
399
            // TODO: Encoding the conditional jump as a short jump
400
            // could save us us 4 bytes here.
401
            one_way_jmp(sink, CC::NZ, loop_start);
402

403
            // The regular prologue code is going to emit a `sub` after this, so we need to
404
            // reset the stack pointer
405
            //
406
            // TODO: It would be better if we could avoid the `add` + `sub` that is generated here
407
            // and in the stack adj portion of the prologue
408
            //
409
            // add rsp, GUARD_SIZE * probe_count
410
            Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);
411
        }
412

413
        Inst::CallKnown { info: call_info } => {
414
            let stack_map = state.take_stack_map();
415

416
            asm::inst::callq_d::new(0).emit(sink, info, state);
417

418
            // The last 4 bytes of `callq` is the relative displacement to where
419
            // we're calling, so that's where the reloc is registered.
420
            //
421
            // The addend adjusts for the difference between the end of the
422
            // instruction and the beginning of the immediate field.
423
            let len = sink.cur_offset();
424
            sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
425

426
            if let Some(s) = stack_map {
427
                sink.push_user_stack_map(state, len, s);
428
            }
429

430
            if let Some(try_call) = call_info.try_call_info.as_ref() {
431
                sink.add_try_call_site(
432
                    Some(state.frame_layout().sp_to_fp()),
433
                    try_call.exception_handlers(&state.frame_layout()),
434
                );
435
            } else {
436
                sink.add_call_site();
437
            }
438

439
            // Reclaim the outgoing argument area that was released by the
440
            // callee, to ensure that StackAMode values are always computed from
441
            // a consistent SP.
442
            if call_info.callee_pop_size > 0 {
443
                let rsp = Writable::from_reg(regs::rsp());
444
                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
445
                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
446
                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
447
            }
448

449
            // Load any stack-carried return values.
450
            call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
451
                state.frame_layout().stackslots_size,
452
                |inst| inst.emit(sink, info, state),
453
                |_space_needed| None,
454
            );
455

456
            // If this is a try-call, jump to the continuation
457
            // (normal-return) block.
458
            if let Some(try_call) = call_info.try_call_info.as_ref() {
459
                let jmp = Inst::JmpKnown {
460
                    dst: try_call.continuation,
461
                };
462
                jmp.emit(sink, info, state);
463
            }
464
        }
465

466
        Inst::ReturnCallKnown { info: call_info } => {
467
            emit_return_call_common_sequence(sink, info, state, &call_info);
468

469
            // Finally, jump to the callee!
470
            //
471
            // Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
472
            // different metadata in this case: we don't have a label for the
473
            // target, but rather a function relocation.
474
            asm::inst::jmp_d32::new(0).emit(sink, info, state);
475
            let offset = sink.cur_offset();
476
            // The addend adjusts for the difference between the end of the instruction and the
477
            // beginning of the immediate field.
478
            sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
479
            sink.add_call_site();
480
        }
481

482
        Inst::ReturnCallUnknown { info: call_info } => {
483
            let callee = call_info.dest;
484

485
            emit_return_call_common_sequence(sink, info, state, &call_info);
486

487
            asm::inst::jmpq_m::new(callee).emit(sink, info, state);
488
            sink.add_call_site();
489
        }
490

491
        Inst::CallUnknown {
492
            info: call_info, ..
493
        } => {
494
            let stack_map = state.take_stack_map();
495

496
            let dest = match call_info.dest.clone() {
497
                RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),
498
                RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
499
            };
500

501
            asm::inst::callq_m::new(dest).emit(sink, info, state);
502

503
            if let Some(s) = stack_map {
504
                let offset = sink.cur_offset();
505
                sink.push_user_stack_map(state, offset, s);
506
            }
507

508
            if let Some(try_call) = call_info.try_call_info.as_ref() {
509
                sink.add_try_call_site(
510
                    Some(state.frame_layout().sp_to_fp()),
511
                    try_call.exception_handlers(&state.frame_layout()),
512
                );
513
            } else {
514
                sink.add_call_site();
515
            }
516

517
            // Reclaim the outgoing argument area that was released by the callee, to ensure that
518
            // StackAMode values are always computed from a consistent SP.
519
            if call_info.callee_pop_size > 0 {
520
                let rsp = Writable::from_reg(regs::rsp());
521
                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
522
                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
523
                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
524
            }
525

526
            // Load any stack-carried return values.
527
            call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
528
                state.frame_layout().stackslots_size,
529
                |inst| inst.emit(sink, info, state),
530
                |_space_needed| None,
531
            );
532

533
            if let Some(try_call) = call_info.try_call_info.as_ref() {
534
                let jmp = Inst::JmpKnown {
535
                    dst: try_call.continuation,
536
                };
537
                jmp.emit(sink, info, state);
538
            }
539
        }
540

541
        Inst::Args { .. } => {}
542
        Inst::Rets { .. } => {}
543

544
        Inst::StackSwitchBasic {
545
            store_context_ptr,
546
            load_context_ptr,
547
            in_payload0,
548
            out_payload0,
549
        } => {
550
            // Note that we do not emit anything for preserving and restoring
551
            // ordinary registers here: That's taken care of by regalloc for us,
552
            // since we marked this instruction as clobbering all registers.
553
            //
554
            // Also note that we do nothing about passing the single payload
555
            // value: We've informed regalloc that it is sent and received via
556
            // the fixed register given by [stack_switch::payload_register]
557

558
            let (tmp1, tmp2) = {
559
                // Ideally we would just ask regalloc for two temporary registers.
560
                // However, adding any early defs to the constraints on StackSwitch
561
                // causes TooManyLiveRegs. Fortunately, we can manually find tmp
562
                // registers without regalloc: Since our instruction clobbers all
563
                // registers, we can simply pick any register that is not assigned
564
                // to the operands.
565

566
                let all = crate::isa::x64::abi::ALL_CLOBBERS;
567

568
                let used_regs = [
569
                    **load_context_ptr,
570
                    **store_context_ptr,
571
                    **in_payload0,
572
                    *out_payload0.to_reg(),
573
                ];
574

575
                let mut tmps = all.into_iter().filter_map(|preg| {
576
                    let reg: Reg = preg.into();
577
                    if !used_regs.contains(&reg) {
578
                        WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
579
                    } else {
580
                        None
581
                    }
582
                });
583
                (tmps.next().unwrap(), tmps.next().unwrap())
584
            };
585

586
            let layout = stack_switch::control_context_layout();
587
            let rsp_offset = layout.stack_pointer_offset as i32;
588
            let pc_offset = layout.ip_offset as i32;
589
            let rbp_offset = layout.frame_pointer_offset as i32;
590

591
            // Location to which someone switch-ing back to this stack will jump
592
            // to: Right behind the `StackSwitch` instruction
593
            let resume = sink.get_label();
594

595
            //
596
            // For RBP and RSP we do the following:
597
            // - Load new value for register from `load_context_ptr` +
598
            // corresponding offset.
599
            // - Store previous (!) value of register at `store_context_ptr` +
600
            // corresponding offset.
601
            //
602
            // Since `load_context_ptr` and `store_context_ptr` are allowed to be
603
            // equal, we need to use a temporary register here.
604
            //
605

606
            let mut exchange = |offset, reg| {
607
                let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));
608
                asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
609

610
                asm::inst::movq_mr::new(
611
                    Amode::imm_reg(offset, **store_context_ptr),
612
                    Gpr::new(reg).unwrap(),
613
                )
614
                .emit(sink, info, state);
615

616
                let dst = Writable::from_reg(reg);
617
                asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())
618
                    .emit(sink, info, state);
619
            };
620

621
            exchange(rsp_offset, regs::rsp());
622
            exchange(rbp_offset, regs::rbp());
623

624
            //
625
            // Load target PC, store resume PC, jump to target PC
626
            //
627

628
            let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));
629
            asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
630

631
            let amode = Amode::RipRelative { target: resume };
632
            asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);
633

634
            asm::inst::movq_mr::new(
635
                Amode::imm_reg(pc_offset, **store_context_ptr),
636
                tmp2.to_reg(),
637
            )
638
            .emit(sink, info, state);
639

640
            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
641

642
            sink.bind_label(resume, state.ctrl_plane_mut());
643
        }
644

645
        Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),
646

647
        Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),
648

649
        Inst::JmpCond {
650
            cc,
651
            taken,
652
            not_taken,
653
        } => {
654
            cond_jmp(sink, *cc, *taken);
655
            uncond_jmp(sink, *not_taken);
656
        }
657

658
        Inst::JmpCondOr {
659
            cc1,
660
            cc2,
661
            taken,
662
            not_taken,
663
        } => {
664
            // Emit:
665
            //   jcc1 taken
666
            //   jcc2 taken
667
            //   jmp not_taken
668
            //
669
            // Note that we enroll both conditionals in the
670
            // branch-chomping mechanism because MachBuffer
671
            // simplification can continue upward as long as it keeps
672
            // chomping branches. In the best case, if taken ==
673
            // not_taken and that one block is the fallthrough block,
674
            // all three branches can disappear.
675

676
            cond_jmp(sink, *cc1, *taken);
677
            cond_jmp(sink, *cc2, *taken);
678
            uncond_jmp(sink, *not_taken);
679
        }
680

681
        &Inst::JmpTableSeq {
682
            idx,
683
            tmp1,
684
            tmp2,
685
            ref targets,
686
            ref default_target,
687
            ..
688
        } => {
689
            // This sequence is *one* instruction in the vcode, and is expanded only here at
690
            // emission time, because we cannot allow the regalloc to insert spills/reloads in
691
            // the middle; we depend on hardcoded PC-rel addressing below.
692
            //
693
            // We don't have to worry about emitting islands, because the only label-use type has a
694
            // maximum range of 2 GB. If we later consider using shorter-range label references,
695
            // this will need to be revisited.
696

697
            // We generate the following sequence. Note that the only read of %idx is before the
698
            // write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
699
            // if you change this.
700
            // lea start_of_jump_table_offset(%rip), %tmp1
701
            // movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
702
            // addq %tmp2, %tmp1
703
            // j *%tmp1
704
            // $start_of_jump_table:
705
            // -- jump table entries
706

707
            // Load base address of jump table.
708
            let start_of_jumptable = sink.get_label();
709
            asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))
710
                .emit(sink, info, state);
711

712
            // Load value out of the jump table. It's a relative offset to the target block, so it
713
            // might be negative; use a sign-extension.
714
            let inst = Inst::movsx_rm_r(
715
                ExtMode::LQ,
716
                RegMem::mem(Amode::imm_reg_reg_shift(
717
                    0,
718
                    Gpr::unwrap_new(tmp1.to_reg()),
719
                    Gpr::unwrap_new(idx),
720
                    2,
721
                )),
722
                tmp2,
723
            );
724
            inst.emit(sink, info, state);
725

726
            // Add base of jump table to jump-table-sourced block offset.
727
            asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);
728

729
            // Branch to computed address.
730
            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
731

732
            // Emit jump table (table of 32-bit offsets).
733
            sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
734
            let jt_off = sink.cur_offset();
735
            for &target in targets.iter().chain(std::iter::once(default_target)) {
736
                let word_off = sink.cur_offset();
737
                // off_into_table is an addend here embedded in the label to be later patched at
738
                // the end of codegen. The offset is initially relative to this jump table entry;
739
                // with the extra addend, it'll be relative to the jump table's start, after
740
                // patching.
741
                let off_into_table = word_off - jt_off;
742
                sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
743
                sink.put4(off_into_table);
744
            }
745
        }
746

747
        Inst::TrapIf { cc, trap_code } => {
748
            let trap_label = sink.defer_trap(*trap_code);
749
            one_way_jmp(sink, *cc, trap_label);
750
        }
751

752
        Inst::TrapIfAnd {
753
            cc1,
754
            cc2,
755
            trap_code,
756
        } => {
757
            let trap_label = sink.defer_trap(*trap_code);
758
            let else_label = sink.get_label();
759

760
            // Jump to the end if the first condition isn't true, and then if
761
            // the second condition is true go to the trap.
762
            one_way_jmp(sink, cc1.invert(), else_label);
763
            one_way_jmp(sink, *cc2, trap_label);
764

765
            sink.bind_label(else_label, state.ctrl_plane_mut());
766
        }
767

768
        Inst::TrapIfOr {
769
            cc1,
770
            cc2,
771
            trap_code,
772
        } => {
773
            let trap_label = sink.defer_trap(*trap_code);
774

775
            // Emit two jumps to the same trap if either condition code is true.
776
            one_way_jmp(sink, *cc1, trap_label);
777
            one_way_jmp(sink, *cc2, trap_label);
778
        }
779

780
        Inst::XmmMinMaxSeq {
781
            size,
782
            is_min,
783
            lhs,
784
            rhs,
785
            dst,
786
        } => {
787
            let rhs = rhs.to_reg();
788
            let lhs = lhs.to_reg();
789
            let dst = dst.to_writable_reg();
790
            debug_assert_eq!(rhs, dst.to_reg());
791

792
            // Generates the following sequence:
793
            // cmpss/cmpsd %lhs, %rhs_dst
794
            // jnz do_min_max
795
            // jp propagate_nan
796
            //
797
            // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
798
            // {and,or}{ss,sd} %lhs, %rhs_dst
799
            // j done
800
            //
801
            // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
802
            // ;; NaN value is returned), we add both inputs.
803
            // propagate_nan:
804
            // add{ss,sd} %lhs, %rhs_dst
805
            // j done
806
            //
807
            // do_min_max:
808
            // {min,max}{ss,sd} %lhs, %rhs_dst
809
            //
810
            // done:
811
            let done = sink.get_label();
812
            let propagate_nan = sink.get_label();
813
            let do_min_max = sink.get_label();
814

815
            let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
816
                OperandSize::Size32 => (
817
                    asm::inst::addss_a::new(dst, lhs).into(),
818
                    asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),
819
                    asm::inst::andps_a::new(dst, lhs).into(),
820
                    asm::inst::orps_a::new(dst, lhs).into(),
821
                    if *is_min {
822
                        asm::inst::minss_a::new(dst, lhs).into()
823
                    } else {
824
                        asm::inst::maxss_a::new(dst, lhs).into()
825
                    },
826
                ),
827
                OperandSize::Size64 => (
828
                    asm::inst::addsd_a::new(dst, lhs).into(),
829
                    asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),
830
                    asm::inst::andpd_a::new(dst, lhs).into(),
831
                    asm::inst::orpd_a::new(dst, lhs).into(),
832
                    if *is_min {
833
                        asm::inst::minsd_a::new(dst, lhs).into()
834
                    } else {
835
                        asm::inst::maxsd_a::new(dst, lhs).into()
836
                    },
837
                ),
838
                _ => unreachable!(),
839
            };
840
            let add_op: AsmInst = add_op;
841
            let or_op: AsmInst = or_op;
842
            let min_max_op: AsmInst = min_max_op;
843
            let cmp_op: AsmInst = cmp_op;
844

845
            cmp_op.emit(sink, info, state);
846

847
            one_way_jmp(sink, CC::NZ, do_min_max);
848
            one_way_jmp(sink, CC::P, propagate_nan);
849

850
            // Ordered and equal. The operands are bit-identical unless they are zero
851
            // and negative zero. These instructions merge the sign bits in that
852
            // case, and are no-ops otherwise.
853
            let inst: AsmInst = if *is_min { or_op } else { and_op };
854
            inst.emit(sink, info, state);
855

856
            let inst = Inst::jmp_known(done);
857
            inst.emit(sink, info, state);
858

859
            // x86's min/max are not symmetric; if either operand is a NaN, they return the
860
            // read-only operand: perform an addition between the two operands, which has the
861
            // desired NaN propagation effects.
862
            sink.bind_label(propagate_nan, state.ctrl_plane_mut());
863
            add_op.emit(sink, info, state);
864

865
            one_way_jmp(sink, CC::P, done);
866

867
            sink.bind_label(do_min_max, state.ctrl_plane_mut());
868
            min_max_op.emit(sink, info, state);
869

870
            sink.bind_label(done, state.ctrl_plane_mut());
871
        }
872

873
        Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {
874
            // These instruction formats only exist to declare a register as a
875
            // `def`; no code is emitted. This is always immediately followed by
876
            // an instruction, such as `xor <tmp>, <tmp>`, that semantically
877
            // reads this undefined value but arithmetically produces the same
878
            // result regardless of its value.
879
        }
880

881
        Inst::CvtUint64ToFloatSeq {
882
            dst_size,
883
            src,
884
            dst,
885
            tmp_gpr1,
886
            tmp_gpr2,
887
        } => {
888
            let src = src.to_reg();
889
            let dst = dst.to_writable_reg();
890
            let tmp_gpr1 = tmp_gpr1.to_writable_reg();
891
            let tmp_gpr2 = tmp_gpr2.to_writable_reg();
892

893
            // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
894
            // different sequence.
895
            //
896
            // Emit the following sequence:
897
            //
898
            //  cmp 0, %src
899
            //  jl handle_negative
900
            //
901
            //  ;; handle positive, which can't overflow
902
            //  cvtsi2sd/cvtsi2ss %src, %dst
903
            //  j done
904
            //
905
            //  ;; handle negative: see below for an explanation of what it's doing.
906
            //  handle_negative:
907
            //  mov %src, %tmp_gpr1
908
            //  shr $1, %tmp_gpr1
909
            //  mov %src, %tmp_gpr2
910
            //  and $1, %tmp_gpr2
911
            //  or %tmp_gpr1, %tmp_gpr2
912
            //  cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
913
            //  addsd/addss %dst, %dst
914
            //
915
            //  done:
916

917
            assert_ne!(src, tmp_gpr1.to_reg());
918
            assert_ne!(src, tmp_gpr2.to_reg());
919

920
            let handle_negative = sink.get_label();
921
            let done = sink.get_label();
922

923
            // If x seen as a signed int64 is not negative, a signed-conversion will do the right
924
            // thing.
925
            // TODO use tst src, src here.
926
            asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);
927

928
            one_way_jmp(sink, CC::L, handle_negative);
929

930
            // Handle a positive int64, which is the "easy" case: a signed conversion will do the
931
            // right thing.
932
            emit_signed_cvt(
933
                sink,
934
                info,
935
                state,
936
                src,
937
                dst,
938
                *dst_size == OperandSize::Size64,
939
            );
940

941
            let inst = Inst::jmp_known(done);
942
            inst.emit(sink, info, state);
943

944
            sink.bind_label(handle_negative, state.ctrl_plane_mut());
945

946
            // Divide x by two to get it in range for the signed conversion, keep the LSB, and
947
            // scale it back up on the FP side.
948
            let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
949
            inst.emit(sink, info, state);
950

951
            // tmp_gpr1 := src >> 1
952
            asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);
953

954
            let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
955
            inst.emit(sink, info, state);
956

957
            asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);
958

959
            asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);
960

961
            emit_signed_cvt(
962
                sink,
963
                info,
964
                state,
965
                tmp_gpr2.to_reg(),
966
                dst,
967
                *dst_size == OperandSize::Size64,
968
            );
969

970
            let inst: AsmInst = match *dst_size {
971
                OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),
972
                OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),
973
                _ => unreachable!(),
974
            };
975
            inst.emit(sink, info, state);
976

977
            sink.bind_label(done, state.ctrl_plane_mut());
978
        }
979

980
        Inst::CvtFloatToSintSeq {
981
            src_size,
982
            dst_size,
983
            is_saturating,
984
            src,
985
            dst,
986
            tmp_gpr,
987
            tmp_xmm,
988
        } => {
989
            use OperandSize::*;
990

991
            let src = src.to_reg();
992
            let dst = dst.to_writable_reg();
993
            let tmp_gpr = tmp_gpr.to_writable_reg();
994
            let tmp_xmm = tmp_xmm.to_writable_reg();
995

996
            // Emits the following common sequence:
997
            //
998
            // cvttss2si/cvttsd2si %src, %dst
999
            // cmp %dst, 1
1000
            // jno done
1001
            //
1002
            // Then, for saturating conversions:
1003
            //
1004
            // ;; check for NaN
1005
            // cmpss/cmpsd %src, %src
1006
            // jnp not_nan
1007
            // xor %dst, %dst
1008
            //
1009
            // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
1010
            // ;; already in %dst.
1011
            // xorpd %tmp_xmm, %tmp_xmm
1012
            // cmpss/cmpsd %src, %tmp_xmm
1013
            // jnb done
1014
            // mov/movaps $INT_MAX, %dst
1015
            //
1016
            // done:
1017
            //
1018
            // Then, for non-saturating conversions:
1019
            //
1020
            // ;; check for NaN
1021
            // cmpss/cmpsd %src, %src
1022
            // jnp not_nan
1023
            // ud2 trap BadConversionToInteger
1024
            //
1025
            // ;; check if INT_MIN was the correct result, against a magic constant:
1026
            // not_nan:
1027
            // movaps/mov $magic, %tmp_gpr
1028
            // movq/movd %tmp_gpr, %tmp_xmm
1029
            // cmpss/cmpsd %tmp_xmm, %src
1030
            // jnb/jnbe $check_positive
1031
            // ud2 trap IntegerOverflow
1032
            //
1033
            // ;; if positive, it was a real overflow
1034
            // check_positive:
1035
            // xorpd %tmp_xmm, %tmp_xmm
1036
            // cmpss/cmpsd %src, %tmp_xmm
1037
            // jnb done
1038
            // ud2 trap IntegerOverflow
1039
            //
1040
            // done:
1041

1042
            let cmp_op: AsmInst = match src_size {
1043
                Size64 => asm::inst::ucomisd_a::new(src, src).into(),
1044
                Size32 => asm::inst::ucomiss_a::new(src, src).into(),
1045
                _ => unreachable!(),
1046
            };
1047

1048
            let cvtt_op = |dst, src| Inst::External {
1049
                inst: match (*src_size, *dst_size) {
1050
                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1051
                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1052
                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1053
                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1054
                    _ => unreachable!(),
1055
                },
1056
            };
1057

1058
            let done = sink.get_label();
1059

1060
            // The truncation.
1061
            cvtt_op(dst, src).emit(sink, info, state);
1062

1063
            // Compare against 1, in case of overflow the dst operand was INT_MIN.
1064
            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
1065
            inst.emit(sink, info, state);
1066

1067
            one_way_jmp(sink, CC::NO, done); // no overflow => done
1068

1069
            // Check for NaN.
1070
            cmp_op.emit(sink, info, state);
1071

1072
            if *is_saturating {
1073
                let not_nan = sink.get_label();
1074
                one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
1075

1076
                // For NaN, emit 0.
1077
                let inst: AsmInst = match *dst_size {
1078
                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1079
                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1080
                    _ => unreachable!(),
1081
                };
1082
                inst.emit(sink, info, state);
1083

1084
                let inst = Inst::jmp_known(done);
1085
                inst.emit(sink, info, state);
1086

1087
                sink.bind_label(not_nan, state.ctrl_plane_mut());
1088

1089
                // If the input was positive, saturate to INT_MAX.
1090

1091
                // Zero out tmp_xmm.
1092
                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1093

1094
                let inst: AsmInst = match src_size {
1095
                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1096
                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1097
                    _ => unreachable!(),
1098
                };
1099
                inst.emit(sink, info, state);
1100

1101
                // Jump if >= to done.
1102
                one_way_jmp(sink, CC::NB, done);
1103

1104
                // Otherwise, put INT_MAX.
1105
                if *dst_size == OperandSize::Size64 {
1106
                    let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
1107
                    inst.emit(sink, info, state);
1108
                } else {
1109
                    let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
1110
                    inst.emit(sink, info, state);
1111
                }
1112
            } else {
1113
                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1114
                inst.emit(sink, info, state);
1115

1116
                // Check if INT_MIN was the correct result: determine the smallest floating point
1117
                // number that would convert to INT_MIN, put it in a temporary register, and compare
1118
                // against the src register.
1119
                // If the src register is less (or in some cases, less-or-equal) than the threshold,
1120
                // trap!
1121

1122
                let mut no_overflow_cc = CC::NB; // >=
1123
                let output_bits = dst_size.to_bits();
1124
                match *src_size {
1125
                    OperandSize::Size32 => {
1126
                        let cst = (-Ieee32::pow2(output_bits - 1)).bits();
1127
                        let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
1128
                        inst.emit(sink, info, state);
1129
                    }
1130
                    OperandSize::Size64 => {
1131
                        // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
1132
                        // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
1133
                        let cst = if output_bits < 64 {
1134
                            no_overflow_cc = CC::NBE; // >
1135
                            Ieee64::fcvt_to_sint_negative_overflow(output_bits)
1136
                        } else {
1137
                            -Ieee64::pow2(output_bits - 1)
1138
                        };
1139
                        let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
1140
                        inst.emit(sink, info, state);
1141
                    }
1142
                    _ => unreachable!(),
1143
                }
1144

1145
                let inst: AsmInst = {
1146
                    let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1147
                    match src_size {
1148
                        Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1149
                        Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1150
                        _ => unreachable!(),
1151
                    }
1152
                };
1153
                inst.emit(sink, info, state);
1154

1155
                let inst: AsmInst = match src_size {
1156
                    Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1157
                    Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1158
                    _ => unreachable!(),
1159
                };
1160
                inst.emit(sink, info, state);
1161

1162
                // no trap if src >= or > threshold
1163
                let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
1164
                inst.emit(sink, info, state);
1165

1166
                // If positive, it was a real overflow.
1167

1168
                // Zero out the tmp_xmm register.
1169
                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1170

1171
                let inst: AsmInst = match src_size {
1172
                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1173
                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1174
                    _ => unreachable!(),
1175
                };
1176
                inst.emit(sink, info, state);
1177

1178
                // no trap if 0 >= src
1179
                let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
1180
                inst.emit(sink, info, state);
1181
            }
1182

1183
            sink.bind_label(done, state.ctrl_plane_mut());
1184
        }
1185

1186
        Inst::CvtFloatToUintSeq {
1187
            src_size,
1188
            dst_size,
1189
            is_saturating,
1190
            src,
1191
            dst,
1192
            tmp_gpr,
1193
            tmp_xmm,
1194
            tmp_xmm2,
1195
        } => {
1196
            use OperandSize::*;
1197

1198
            let src = src.to_reg();
1199
            let dst = dst.to_writable_reg();
1200
            let tmp_gpr = tmp_gpr.to_writable_reg();
1201
            let tmp_xmm = tmp_xmm.to_writable_reg();
1202
            let tmp_xmm2 = tmp_xmm2.to_writable_reg();
1203

1204
            // The only difference in behavior between saturating and non-saturating is how we
1205
            // handle errors. Emits the following sequence:
1206
            //
1207
            // movaps/mov 2**(int_width - 1), %tmp_gpr
1208
            // movq/movd %tmp_gpr, %tmp_xmm
1209
            // cmpss/cmpsd %tmp_xmm, %src
1210
            // jnb is_large
1211
            //
1212
            // ;; check for NaN inputs
1213
            // jnp not_nan
1214
            // -- non-saturating: ud2 trap BadConversionToInteger
1215
            // -- saturating: xor %dst, %dst; j done
1216
            //
1217
            // not_nan:
1218
            // cvttss2si/cvttsd2si %src, %dst
1219
            // cmp 0, %dst
1220
            // jnl done
1221
            // -- non-saturating: ud2 trap IntegerOverflow
1222
            // -- saturating: xor %dst, %dst; j done
1223
            //
1224
            // is_large:
1225
            // mov %src, %tmp_xmm2
1226
            // subss/subsd %tmp_xmm, %tmp_xmm2
1227
            // cvttss2si/cvttss2sd %tmp_x, %dst
1228
            // cmp 0, %dst
1229
            // jnl next_is_large
1230
            // -- non-saturating: ud2 trap IntegerOverflow
1231
            // -- saturating: movaps $UINT_MAX, %dst; j done
1232
            //
1233
            // next_is_large:
1234
            // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers
1235
            //
1236
            // done:
1237

1238
            assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");
1239

1240
            let xor_op = |dst, src| Inst::External {
1241
                inst: match *dst_size {
1242
                    Size32 => asm::inst::xorl_rm::new(dst, src).into(),
1243
                    Size64 => asm::inst::xorq_rm::new(dst, src).into(),
1244
                    _ => unreachable!(),
1245
                },
1246
            };
1247

1248
            let subs_op = |dst, src| Inst::External {
1249
                inst: match *src_size {
1250
                    Size32 => asm::inst::subss_a::new(dst, src).into(),
1251
                    Size64 => asm::inst::subsd_a::new(dst, src).into(),
1252
                    _ => unreachable!(),
1253
                },
1254
            };
1255

1256
            let cvtt_op = |dst, src| Inst::External {
1257
                inst: match (*src_size, *dst_size) {
1258
                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1259
                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1260
                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1261
                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1262
                    _ => unreachable!(),
1263
                },
1264
            };
1265

1266
            let done = sink.get_label();
1267

1268
            let cst = match src_size {
1269
                OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
1270
                OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
1271
                _ => unreachable!(),
1272
            };
1273

1274
            let inst = Inst::imm(*src_size, cst, tmp_gpr);
1275
            inst.emit(sink, info, state);
1276

1277
            let inst: AsmInst = {
1278
                let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1279
                match src_size {
1280
                    Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1281
                    Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1282
                    _ => unreachable!(),
1283
                }
1284
            };
1285
            inst.emit(sink, info, state);
1286

1287
            let inst: AsmInst = match src_size {
1288
                Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1289
                Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1290
                _ => unreachable!(),
1291
            };
1292
            inst.emit(sink, info, state);
1293

1294
            let handle_large = sink.get_label();
1295
            one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
1296

1297
            if *is_saturating {
1298
                // If not NaN jump over this 0-return, otherwise return 0
1299
                let not_nan = sink.get_label();
1300
                one_way_jmp(sink, CC::NP, not_nan);
1301

1302
                xor_op(dst, dst).emit(sink, info, state);
1303

1304
                let inst = Inst::jmp_known(done);
1305
                inst.emit(sink, info, state);
1306
                sink.bind_label(not_nan, state.ctrl_plane_mut());
1307
            } else {
1308
                // Trap.
1309
                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1310
                inst.emit(sink, info, state);
1311
            }
1312

1313
            // Actual truncation for small inputs: if the result is not positive, then we had an
1314
            // overflow.
1315

1316
            cvtt_op(dst, src).emit(sink, info, state);
1317

1318
            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1319
            inst.emit(sink, info, state);
1320

1321
            one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
1322

1323
            if *is_saturating {
1324
                // The input was "small" (< 2**(width -1)), so the only way to get an integer
1325
                // overflow is because the input was too small: saturate to the min value, i.e. 0.
1326
                let inst: AsmInst = match *dst_size {
1327
                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1328
                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1329
                    _ => unreachable!(),
1330
                };
1331
                inst.emit(sink, info, state);
1332

1333
                let inst = Inst::jmp_known(done);
1334
                inst.emit(sink, info, state);
1335
            } else {
1336
                // Trap.
1337
                asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);
1338
            }
1339

1340
            // Now handle large inputs.
1341

1342
            sink.bind_label(handle_large, state.ctrl_plane_mut());
1343

1344
            let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
1345
            inst.emit(sink, info, state);
1346

1347
            subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);
1348

1349
            cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);
1350

1351
            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1352
            inst.emit(sink, info, state);
1353

1354
            if *is_saturating {
1355
                let next_is_large = sink.get_label();
1356
                one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
1357

1358
                // The input was "large" (>= 2**(width -1)), so the only way to get an integer
1359
                // overflow is because the input was too large: saturate to the max value.
1360
                let inst = Inst::imm(
1361
                    OperandSize::Size64,
1362
                    if *dst_size == OperandSize::Size64 {
1363
                        u64::max_value()
1364
                    } else {
1365
                        u32::max_value() as u64
1366
                    },
1367
                    dst,
1368
                );
1369
                inst.emit(sink, info, state);
1370

1371
                let inst = Inst::jmp_known(done);
1372
                inst.emit(sink, info, state);
1373
                sink.bind_label(next_is_large, state.ctrl_plane_mut());
1374
            } else {
1375
                let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
1376
                inst.emit(sink, info, state);
1377
            }
1378

1379
            if *dst_size == OperandSize::Size64 {
1380
                let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
1381
                inst.emit(sink, info, state);
1382

1383
                asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);
1384
            } else {
1385
                asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);
1386
            }
1387

1388
            sink.bind_label(done, state.ctrl_plane_mut());
1389
        }
1390

1391
        Inst::LoadExtName {
1392
            dst,
1393
            name,
1394
            offset,
1395
            distance,
1396
        } => {
1397
            let name = &**name;
1398
            let riprel = asm::Amode::RipRelative {
1399
                target: asm::DeferredTarget::None,
1400
            };
1401
            if info.flags.is_pic() {
1402
                // Generates: movq symbol@GOTPCREL(%rip), %dst
1403
                asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);
1404
                let cur = sink.cur_offset();
1405
                sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);
1406

1407
                // Offset in the relocation above applies to the address of the
1408
                // *GOT entry*, not the loaded address; so we emit a separate
1409
                // add instruction if needed.
1410
                let offset = i32::try_from(*offset).unwrap();
1411
                if offset != 0 {
1412
                    asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)
1413
                        .emit(sink, info, state);
1414
                }
1415
            } else if distance == &RelocDistance::Near {
1416
                // If we know the distance to the name is within 2GB (e.g., a
1417
                // module-local function), we can generate a RIP-relative
1418
                // address, with a relocation.
1419
                asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);
1420
                let cur = sink.cur_offset();
1421
                sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);
1422
            } else {
1423
                // The full address can be encoded in the register, with a
1424
                // relocation.
1425
                asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);
1426
                let cur = sink.cur_offset();
1427
                sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);
1428
            }
1429
        }
1430

1431
        Inst::AtomicRmwSeq {
1432
            ty,
1433
            op,
1434
            mem,
1435
            operand,
1436
            temp,
1437
            dst_old,
1438
        } => {
1439
            let operand = *operand;
1440
            let temp = *temp;
1441
            let temp_r = temp.map(|r| *r);
1442
            let dst_old = *dst_old;
1443
            let dst_old_r = dst_old.map(|r| *r);
1444
            debug_assert_eq!(dst_old.to_reg(), regs::rax());
1445
            let mem = mem.finalize(state.frame_layout(), sink).clone();
1446

1447
            // Emit this:
1448
            //    mov{zbq,zwq,zlq,q}     (%r_address), %rax    // rax = old value
1449
            //  again:
1450
            //    movq                   %rax, %r_temp         // rax = old value, r_temp = old value
1451
            //    `op`q                  %r_operand, %r_temp   // rax = old value, r_temp = new value
1452
            //    lock cmpxchg{b,w,l,q}  %r_temp, (%r_address) // try to store new value
1453
            //    jnz again // If this is taken, rax will have a "revised" old value
1454
            //
1455
            // Operand conventions: IN:  %r_address, %r_operand OUT: %rax (old
1456
            //    value), %r_temp (trashed), %rflags (trashed)
1457
            let again_label = sink.get_label();
1458

1459
            // mov{zbq,zwq,zlq,q} (%r_address), %rax
1460
            // No need to call `add_trap` here, since the `i1` emit will do that.
1461
            let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);
1462
            i1.emit(sink, info, state);
1463

1464
            // again:
1465
            sink.bind_label(again_label, state.ctrl_plane_mut());
1466

1467
            // movq %rax, %r_temp
1468
            asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);
1469

1470
            use AtomicRmwSeqOp as RmwOp;
1471
            match op {
1472
                RmwOp::Nand => {
1473
                    // andq %r_operand, %r_temp
1474
                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1475

1476
                    // notq %r_temp
1477
                    asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);
1478
                }
1479
                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1480
                    // cmp %r_temp, %r_operand
1481
                    let temp = temp.to_reg();
1482
                    match *ty {
1483
                        types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),
1484
                        types::I16 => {
1485
                            asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)
1486
                        }
1487
                        types::I32 => {
1488
                            asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)
1489
                        }
1490
                        types::I64 => {
1491
                            asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)
1492
                        }
1493
                        _ => unreachable!(),
1494
                    }
1495

1496
                    // cmovcc %r_operand, %r_temp
1497
                    match op {
1498
                        RmwOp::Umin => {
1499
                            asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)
1500
                        }
1501
                        RmwOp::Umax => {
1502
                            asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)
1503
                        }
1504
                        RmwOp::Smin => {
1505
                            asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)
1506
                        }
1507
                        RmwOp::Smax => {
1508
                            asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)
1509
                        }
1510
                        _ => unreachable!(),
1511
                    }
1512
                }
1513
                RmwOp::And => {
1514
                    // andq %r_operand, %r_temp
1515
                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1516
                }
1517
                RmwOp::Or => {
1518
                    // orq %r_operand, %r_temp
1519
                    asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);
1520
                }
1521
                RmwOp::Xor => {
1522
                    // xorq %r_operand, %r_temp
1523
                    asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);
1524
                }
1525
            }
1526

1527
            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
1528
            // No need to call `add_trap` here, since the `i4` emit will do that.
1529
            let temp = temp.to_reg();
1530
            let dst_old = PairedGpr::from(dst_old);
1531
            let inst: AsmInst = match *ty {
1532
                types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),
1533
                types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),
1534
                types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),
1535
                types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),
1536
                _ => unreachable!(),
1537
            };
1538
            inst.emit(sink, info, state);
1539

1540
            // jnz again
1541
            one_way_jmp(sink, CC::NZ, again_label);
1542
        }
1543

1544
        Inst::Atomic128RmwSeq {
1545
            op,
1546
            mem,
1547
            operand_low,
1548
            operand_high,
1549
            temp_low,
1550
            temp_high,
1551
            dst_old_low,
1552
            dst_old_high,
1553
        } => {
1554
            let operand_low = *operand_low;
1555
            let operand_high = *operand_high;
1556
            let temp_low = *temp_low;
1557
            let temp_high = *temp_high;
1558
            let dst_old_low = *dst_old_low;
1559
            let dst_old_high = *dst_old_high;
1560
            debug_assert_eq!(temp_low.to_reg(), regs::rbx());
1561
            debug_assert_eq!(temp_high.to_reg(), regs::rcx());
1562
            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1563
            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1564
            let mem = mem.finalize(state.frame_layout(), sink).clone();
1565

1566
            let again_label = sink.get_label();
1567

1568
            // Load the initial value.
1569
            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1570
            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1571

1572
            // again:
1573
            sink.bind_label(again_label, state.ctrl_plane_mut());
1574

1575
            // Move old value to temp registers.
1576
            asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);
1577
            asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);
1578

1579
            // Perform the operation.
1580
            use Atomic128RmwSeqOp as RmwOp;
1581
            match op {
1582
                RmwOp::Nand => {
1583
                    // temp &= operand
1584
                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1585
                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1586

1587
                    // temp = !temp
1588
                    asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);
1589
                    asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);
1590
                }
1591
                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1592
                    // Do a comparison with LHS temp and RHS operand.
1593
                    // Note the opposite argument orders.
1594
                    asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);
1595
                    // This will clobber `temp_high`
1596
                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1597
                    // Restore the clobbered value
1598
                    asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())
1599
                        .emit(sink, info, state);
1600
                    match op {
1601
                        RmwOp::Umin => {
1602
                            asm::inst::cmovaeq_rm::new(temp_low, operand_low)
1603
                                .emit(sink, info, state);
1604
                            asm::inst::cmovaeq_rm::new(temp_high, operand_high)
1605
                                .emit(sink, info, state);
1606
                        }
1607
                        RmwOp::Umax => {
1608
                            asm::inst::cmovbq_rm::new(temp_low, operand_low)
1609
                                .emit(sink, info, state);
1610
                            asm::inst::cmovbq_rm::new(temp_high, operand_high)
1611
                                .emit(sink, info, state);
1612
                        }
1613
                        RmwOp::Smin => {
1614
                            asm::inst::cmovgeq_rm::new(temp_low, operand_low)
1615
                                .emit(sink, info, state);
1616
                            asm::inst::cmovgeq_rm::new(temp_high, operand_high)
1617
                                .emit(sink, info, state);
1618
                        }
1619
                        RmwOp::Smax => {
1620
                            asm::inst::cmovlq_rm::new(temp_low, operand_low)
1621
                                .emit(sink, info, state);
1622
                            asm::inst::cmovlq_rm::new(temp_high, operand_high)
1623
                                .emit(sink, info, state);
1624
                        }
1625
                        _ => unreachable!(),
1626
                    }
1627
                }
1628
                RmwOp::Add => {
1629
                    asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);
1630
                    asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);
1631
                }
1632
                RmwOp::Sub => {
1633
                    asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);
1634
                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1635
                }
1636
                RmwOp::And => {
1637
                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1638
                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1639
                }
1640
                RmwOp::Or => {
1641
                    asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);
1642
                    asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);
1643
                }
1644
                RmwOp::Xor => {
1645
                    asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);
1646
                    asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);
1647
                }
1648
            }
1649

1650
            // cmpxchg16b (mem)
1651
            asm::inst::lock_cmpxchg16b_m::new(
1652
                PairedGpr::from(dst_old_low),
1653
                PairedGpr::from(dst_old_high),
1654
                temp_low.to_reg(),
1655
                temp_high.to_reg(),
1656
                mem,
1657
            )
1658
            .emit(sink, info, state);
1659

1660
            // jnz again
1661
            one_way_jmp(sink, CC::NZ, again_label);
1662
        }
1663

1664
        Inst::Atomic128XchgSeq {
1665
            mem,
1666
            operand_low,
1667
            operand_high,
1668
            dst_old_low,
1669
            dst_old_high,
1670
        } => {
1671
            let operand_low = *operand_low;
1672
            let operand_high = *operand_high;
1673
            let dst_old_low = *dst_old_low;
1674
            let dst_old_high = *dst_old_high;
1675
            debug_assert_eq!(operand_low, regs::rbx());
1676
            debug_assert_eq!(operand_high, regs::rcx());
1677
            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1678
            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1679
            let mem = mem.finalize(state.frame_layout(), sink).clone();
1680

1681
            let again_label = sink.get_label();
1682

1683
            // Load the initial value.
1684
            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1685
            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1686

1687
            // again:
1688
            sink.bind_label(again_label, state.ctrl_plane_mut());
1689

1690
            // cmpxchg16b (mem)
1691
            asm::inst::lock_cmpxchg16b_m::new(
1692
                PairedGpr::from(dst_old_low),
1693
                PairedGpr::from(dst_old_high),
1694
                operand_low,
1695
                operand_high,
1696
                mem,
1697
            )
1698
            .emit(sink, info, state);
1699

1700
            // jnz again
1701
            one_way_jmp(sink, CC::NZ, again_label);
1702
        }
1703

1704
        Inst::ElfTlsGetAddr { symbol, dst } => {
1705
            let dst = dst.to_reg().to_reg();
1706
            debug_assert_eq!(dst, regs::rax());
1707

1708
            // N.B.: Must be exactly this byte sequence; the linker requires it,
1709
            // because it must know how to rewrite the bytes.
1710

1711
            // data16 lea gv@tlsgd(%rip),%rdi
1712
            sink.put1(0x66); // data16
1713
            sink.put1(0b01001000); // REX.W
1714
            sink.put1(0x8d); // LEA
1715
            sink.put1(0x3d); // ModRM byte
1716
            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
1717
            sink.put4(0); // offset
1718

1719
            // data16 data16 callq __tls_get_addr-4
1720
            sink.put1(0x66); // data16
1721
            sink.put1(0x66); // data16
1722
            sink.put1(0b01001000); // REX.W
1723
            sink.put1(0xe8); // CALL
1724
            emit_reloc(
1725
                sink,
1726
                Reloc::X86CallPLTRel4,
1727
                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
1728
                -4,
1729
            );
1730
            sink.put4(0); // offset
1731
        }
1732

1733
        Inst::MachOTlsGetAddr { symbol, dst } => {
1734
            let dst = dst.to_reg().to_reg();
1735
            debug_assert_eq!(dst, regs::rax());
1736

1737
            // movq gv@tlv(%rip), %rdi
1738
            sink.put1(0x48); // REX.w
1739
            sink.put1(0x8b); // MOV
1740
            sink.put1(0x3d); // ModRM byte
1741
            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
1742
            sink.put4(0); // offset
1743

1744
            asm::inst::callq_m::new(asm::Amode::ImmReg {
1745
                base: Gpr::RDI,
1746
                simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,
1747
                trap: None,
1748
            })
1749
            .emit(sink, info, state);
1750
        }
1751

1752
        Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
1753
            let dst = dst.to_reg().to_reg();
1754
            debug_assert_eq!(dst, regs::rax());
1755

1756
            // tmp is used below directly as %rcx
1757
            let tmp = tmp.to_reg().to_reg();
1758
            debug_assert_eq!(tmp, regs::rcx());
1759

1760
            // See: https://gcc.godbolt.org/z/M8or9x6ss
1761
            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282
1762

1763
            // Emit the following sequence
1764
            // movl	(%rip), %eax          ; IMAGE_REL_AMD64_REL32	_tls_index
1765
            // movq	%gs:88, %rcx
1766
            // movq	(%rcx,%rax,8), %rax
1767
            // leaq	(%rax), %rax          ; Reloc: IMAGE_REL_AMD64_SECREL	symbol
1768

1769
            // Load TLS index for current thread
1770
            // movl	(%rip), %eax
1771
            sink.put1(0x8b); // mov
1772
            sink.put1(0x05);
1773
            emit_reloc(
1774
                sink,
1775
                Reloc::X86PCRel4,
1776
                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
1777
                -4,
1778
            );
1779
            sink.put4(0); // offset
1780

1781
            // movq	%gs:88, %rcx
1782
            // Load the TLS Storage Array pointer
1783
            // The gs segment register refers to the base address of the TEB on x64.
1784
            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
1785
            sink.put_data(&[
1786
                0x65, 0x48, // REX.W
1787
                0x8b, // MOV
1788
                0x0c, 0x25, 0x58, // 0x58 - ThreadLocalStoragePointer offset
1789
                0x00, 0x00, 0x00,
1790
            ]);
1791

1792
            // movq	(%rcx,%rax,8), %rax
1793
            // Load the actual TLS entry for this thread.
1794
            // Computes ThreadLocalStoragePointer + _tls_index*8
1795
            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);
1796

1797
            // leaq	(%rax), %rax
1798
            sink.put1(0x48);
1799
            sink.put1(0x8d);
1800
            sink.put1(0x80);
1801
            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
1802
            sink.put4(0); // offset
1803
        }
1804

1805
        Inst::Unwind { inst } => {
1806
            sink.add_unwind(inst.clone());
1807
        }
1808

1809
        Inst::DummyUse { .. } => {
1810
            // Nothing.
1811
        }
1812

1813
        Inst::LabelAddress { dst, label } => {
1814
            // Emit an LEA with a LabelUse given this label.
1815
            asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);
1816
        }
1817

1818
        Inst::External { inst } => {
1819
            let frame = state.frame_layout();
1820
            emit_maybe_shrink(
1821
                inst,
1822
                &mut external::AsmCodeSink {
1823
                    sink,
1824

1825
                    // These values are transcribed from what is happening in
1826
                    // `SyntheticAmode::finalize`. This, plus the `Into` logic
1827
                    // converting a `SyntheticAmode` to its external counterpart, are
1828
                    // necessary to communicate Cranelift's internal offsets to the
1829
                    // assembler; due to when Cranelift determines these offsets, this
1830
                    // happens quite late (i.e., here during emission).
1831
                    incoming_arg_offset: i32::try_from(
1832
                        frame.tail_args_size + frame.setup_area_size,
1833
                    )
1834
                    .unwrap(),
1835
                    slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),
1836
                },
1837
            );
1838
        }
1839
    }
1840

1841
    state.clear_post_insn();
1842
}
1843

1844
/// Emit the common sequence used for both direct and indirect tail calls:
1845
///
1846
/// * Copy the new frame's stack arguments over the top of our current frame.
1847
///
1848
/// * Restore the old frame pointer.
1849
///
1850
/// * Initialize the tail callee's stack pointer (simultaneously deallocating
1851
///   the temporary stack space we allocated when creating the new frame's stack
1852
///   arguments).
1853
///
1854
/// * Move the return address into its stack slot.
1855
fn emit_return_call_common_sequence<T>(
1856
    sink: &mut MachBuffer<Inst>,
1857
    info: &EmitInfo,
1858
    state: &mut EmitState,
1859
    call_info: &ReturnCallInfo<T>,
1860
) {
1861
    assert!(
1862
        info.flags.preserve_frame_pointers(),
1863
        "frame pointers aren't fundamentally required for tail calls, \
1864
                 but the current implementation relies on them being present"
1865
    );
1866

1867
    let tmp = call_info.tmp.to_writable_reg();
1868

1869
    for inst in
1870
        X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
1871
    {
1872
        inst.emit(sink, info, state);
1873
    }
1874

1875
    for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
1876
        CallConv::Tail,
1877
        &info.flags,
1878
        &info.isa_flags,
1879
        state.frame_layout(),
1880
    ) {
1881
        inst.emit(sink, info, state);
1882
    }
1883

1884
    let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
1885
    if incoming_args_diff > 0 {
1886
        // Move the saved return address up by `incoming_args_diff`.
1887
        let addr = Amode::imm_reg(0, regs::rsp());
1888
        asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);
1889
        asm::inst::movq_mr::new(
1890
            Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
1891
            Gpr::unwrap_new(tmp.to_reg()),
1892
        )
1893
        .emit(sink, info, state);
1894

1895
        // Increment the stack pointer to shrink the argument area for the new
1896
        // call.
1897
        let rsp = Writable::from_reg(regs::rsp());
1898
        let incoming_args_diff = i32::try_from(incoming_args_diff)
1899
            .expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");
1900
        Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);
1901
    }
1902
}
1903

1904
/// Conveniene trait to have an `emit` method on all `asm::inst::*` variants.
1905
trait ExternalEmit {
1906
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);
1907
}
1908

1909
impl<I> ExternalEmit for I
1910
where
1911
    I: Into<asm::inst::Inst<CraneliftRegisters>>,
1912
{
1913
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {
1914
        Inst::External { inst: self.into() }.emit(sink, info, state)
1915
    }
1916
}
1917

1918
/// Attempt to "shrink" the provided `inst`.
1919
///
1920
/// This function will inspect `inst` and attempt to return a new instruction
1921
/// which is equivalent semantically but will encode to a smaller binary
1922
/// representation. This is only done for instructions which require register
1923
/// allocation to have already happened, for example shrinking immediates should
1924
/// be done during instruction selection not at this point.
1925
///
1926
/// An example of this optimization is the `AND` instruction. The Intel manual
1927
/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8, imm8`.
1928
/// Here the instructions are matched against and if regalloc state indicates
1929
/// that a smaller variant is available then that's swapped to instead.
1930
fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {
1931
    use cranelift_assembler_x64::GprMem;
1932
    use cranelift_assembler_x64::inst::*;
1933

1934
    type R = CraneliftRegisters;
1935
    const RAX: PairedGpr = PairedGpr {
1936
        read: Gpr::RAX,
1937
        write: Writable::from_reg(Gpr::RAX),
1938
    };
1939
    const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);
1940

1941
    match *inst {
1942
        // and
1943
        Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),
1944
        Inst::andw_mi(andw_mi {
1945
            rm16: RAX_RM,
1946
            imm16,
1947
        }) => andw_i::<R>::new(RAX, imm16).encode(sink),
1948
        Inst::andl_mi(andl_mi {
1949
            rm32: RAX_RM,
1950
            imm32,
1951
        }) => andl_i::<R>::new(RAX, imm32).encode(sink),
1952
        Inst::andq_mi_sxl(andq_mi_sxl {
1953
            rm64: RAX_RM,
1954
            imm32,
1955
        }) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1956

1957
        // or
1958
        Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),
1959
        Inst::orw_mi(orw_mi {
1960
            rm16: RAX_RM,
1961
            imm16,
1962
        }) => orw_i::<R>::new(RAX, imm16).encode(sink),
1963
        Inst::orl_mi(orl_mi {
1964
            rm32: RAX_RM,
1965
            imm32,
1966
        }) => orl_i::<R>::new(RAX, imm32).encode(sink),
1967
        Inst::orq_mi_sxl(orq_mi_sxl {
1968
            rm64: RAX_RM,
1969
            imm32,
1970
        }) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1971

1972
        // xor
1973
        Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),
1974
        Inst::xorw_mi(xorw_mi {
1975
            rm16: RAX_RM,
1976
            imm16,
1977
        }) => xorw_i::<R>::new(RAX, imm16).encode(sink),
1978
        Inst::xorl_mi(xorl_mi {
1979
            rm32: RAX_RM,
1980
            imm32,
1981
        }) => xorl_i::<R>::new(RAX, imm32).encode(sink),
1982
        Inst::xorq_mi_sxl(xorq_mi_sxl {
1983
            rm64: RAX_RM,
1984
            imm32,
1985
        }) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),
1986

1987
        // add
1988
        Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),
1989
        Inst::addw_mi(addw_mi {
1990
            rm16: RAX_RM,
1991
            imm16,
1992
        }) => addw_i::<R>::new(RAX, imm16).encode(sink),
1993
        Inst::addl_mi(addl_mi {
1994
            rm32: RAX_RM,
1995
            imm32,
1996
        }) => addl_i::<R>::new(RAX, imm32).encode(sink),
1997
        Inst::addq_mi_sxl(addq_mi_sxl {
1998
            rm64: RAX_RM,
1999
            imm32,
2000
        }) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2001

2002
        // adc
2003
        Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),
2004
        Inst::adcw_mi(adcw_mi {
2005
            rm16: RAX_RM,
2006
            imm16,
2007
        }) => adcw_i::<R>::new(RAX, imm16).encode(sink),
2008
        Inst::adcl_mi(adcl_mi {
2009
            rm32: RAX_RM,
2010
            imm32,
2011
        }) => adcl_i::<R>::new(RAX, imm32).encode(sink),
2012
        Inst::adcq_mi_sxl(adcq_mi_sxl {
2013
            rm64: RAX_RM,
2014
            imm32,
2015
        }) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2016

2017
        // sub
2018
        Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),
2019
        Inst::subw_mi(subw_mi {
2020
            rm16: RAX_RM,
2021
            imm16,
2022
        }) => subw_i::<R>::new(RAX, imm16).encode(sink),
2023
        Inst::subl_mi(subl_mi {
2024
            rm32: RAX_RM,
2025
            imm32,
2026
        }) => subl_i::<R>::new(RAX, imm32).encode(sink),
2027
        Inst::subq_mi_sxl(subq_mi_sxl {
2028
            rm64: RAX_RM,
2029
            imm32,
2030
        }) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2031

2032
        // sbb
2033
        Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),
2034
        Inst::sbbw_mi(sbbw_mi {
2035
            rm16: RAX_RM,
2036
            imm16,
2037
        }) => sbbw_i::<R>::new(RAX, imm16).encode(sink),
2038
        Inst::sbbl_mi(sbbl_mi {
2039
            rm32: RAX_RM,
2040
            imm32,
2041
        }) => sbbl_i::<R>::new(RAX, imm32).encode(sink),
2042
        Inst::sbbq_mi_sxl(sbbq_mi_sxl {
2043
            rm64: RAX_RM,
2044
            imm32,
2045
        }) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),
2046

2047
        // cmp
2048
        Inst::cmpb_mi(cmpb_mi {
2049
            rm8: GprMem::Gpr(Gpr::RAX),
2050
            imm8,
2051
        }) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
2052
        Inst::cmpw_mi(cmpw_mi {
2053
            rm16: GprMem::Gpr(Gpr::RAX),
2054
            imm16,
2055
        }) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
2056
        Inst::cmpl_mi(cmpl_mi {
2057
            rm32: GprMem::Gpr(Gpr::RAX),
2058
            imm32,
2059
        }) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2060
        Inst::cmpq_mi(cmpq_mi {
2061
            rm64: GprMem::Gpr(Gpr::RAX),
2062
            imm32,
2063
        }) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2064

2065
        // test
2066
        Inst::testb_mi(testb_mi {
2067
            rm8: GprMem::Gpr(Gpr::RAX),
2068
            imm8,
2069
        }) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
2070
        Inst::testw_mi(testw_mi {
2071
            rm16: GprMem::Gpr(Gpr::RAX),
2072
            imm16,
2073
        }) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
2074
        Inst::testl_mi(testl_mi {
2075
            rm32: GprMem::Gpr(Gpr::RAX),
2076
            imm32,
2077
        }) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2078
        Inst::testq_mi(testq_mi {
2079
            rm64: GprMem::Gpr(Gpr::RAX),
2080
            imm32,
2081
        }) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),
2082

2083
        // lea
2084
        Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(
2085
            r32,
2086
            m32,
2087
            sink,
2088
            |dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),
2089
            |dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),
2090
            |dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),
2091
        ),
2092
        Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(
2093
            r64,
2094
            m64,
2095
            sink,
2096
            |dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),
2097
            |dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),
2098
            |dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),
2099
        ),
2100

2101
        // All other instructions fall through to here and cannot be shrunk, so
2102
        // return `false` to emit them as usual.
2103
        _ => inst.encode(sink),
2104
    }
2105
}
2106

2107
/// If `lea` can actually get encoded as an `add` then do that instead.
2108
/// Currently all candidate `iadd`s become an `lea` pseudo-instruction here but
2109
/// maximizing the use of `lea` is not necessarily optimal. The `lea`
2110
/// instruction goes through dedicated address units on cores which are finite
2111
/// and disjoint from the general ALU, so if everything uses `lea` then those
2112
/// units can get saturated while leaving the ALU idle.
2113
///
2114
/// To help make use of more parts of a CPU, this attempts to use `add` when
2115
/// it's semantically equivalent to `lea`, or otherwise when the `dst` register
2116
/// is the same as the `base` or `index` register.
2117
///
2118
/// FIXME: ideally regalloc is informed of this constraint. Register allocation
2119
/// of `lea` should "attempt" to put the `base` in the same register as `dst`
2120
/// but not at the expense of generating a `mov` instruction. Currently that's
2121
/// not possible but perhaps one day it may be worth it.
2122
fn emit_lea<S>(
2123
    dst: asm::Gpr<WritableGpr>,
2124
    addr: asm::Amode<Gpr>,
2125
    sink: &mut S,
2126
    lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),
2127
    add_mi: fn(PairedGpr, i32, &mut S),
2128
    add_rm: fn(PairedGpr, Gpr, &mut S),
2129
) where
2130
    S: asm::CodeSink,
2131
{
2132
    match addr {
2133
        // If `base == dst` then this is `add dst, $imm`, so encode that
2134
        // instead.
2135
        asm::Amode::ImmReg {
2136
            base,
2137
            simm32:
2138
                asm::AmodeOffsetPlusKnownOffset {
2139
                    simm32,
2140
                    offset: None,
2141
                },
2142
            trap: None,
2143
        } if dst.as_ref().to_reg() == base => add_mi(
2144
            PairedGpr {
2145
                read: base,
2146
                write: *dst.as_ref(),
2147
            },
2148
            simm32.value(),
2149
            sink,
2150
        ),
2151

2152
        // If the offset is 0 and the shift is a scale of 1, then:
2153
        //
2154
        // * If `base == dst`, then this is `addq dst, index`
2155
        // * If `index == dst`, then this is `addq dst, base`
2156
        asm::Amode::ImmRegRegShift {
2157
            base,
2158
            index,
2159
            scale: asm::Scale::One,
2160
            simm32: asm::AmodeOffset::ZERO,
2161
            trap: None,
2162
        } => {
2163
            if dst.as_ref().to_reg() == base {
2164
                add_rm(
2165
                    PairedGpr {
2166
                        read: base,
2167
                        write: *dst.as_ref(),
2168
                    },
2169
                    *index.as_ref(),
2170
                    sink,
2171
                )
2172
            } else if dst.as_ref().to_reg() == *index.as_ref() {
2173
                add_rm(
2174
                    PairedGpr {
2175
                        read: *index.as_ref(),
2176
                        write: *dst.as_ref(),
2177
                    },
2178
                    base,
2179
                    sink,
2180
                )
2181
            } else {
2182
                lea(*dst.as_ref(), addr, sink)
2183
            }
2184
        }
2185

2186
        _ => lea(*dst.as_ref(), addr, sink),
2187
    }
2188
}
2189

2190
Product

Resources

Company