GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/cranelift/codegen/src/isa/x64/inst/emit.rs
1
use crate::ir::KnownSymbol;
2
use crate::ir::immediates::{Ieee32, Ieee64};
3
use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};
4
use crate::isa::x64::inst::args::*;
5
use crate::isa::x64::inst::*;
6
use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
7
use cranelift_assembler_x64 as asm;
8
9
/// A small helper to generate a signed conversion instruction.
10
fn emit_signed_cvt(
11
sink: &mut MachBuffer<Inst>,
12
info: &EmitInfo,
13
state: &mut EmitState,
14
src: Reg,
15
dst: Writable<Reg>,
16
to_f64: bool,
17
) {
18
assert!(src.is_real());
19
assert!(dst.to_reg().is_real());
20
21
// Handle an unsigned int, which is the "easy" case: a signed conversion
22
// will do the right thing.
23
let dst = WritableXmm::from_writable_reg(dst).unwrap();
24
if to_f64 {
25
asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);
26
} else {
27
asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);
28
}
29
}
30
31
/// Emits a one-way conditional jump, taken if `cc` is set (true).
32
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
33
let cond_start = sink.cur_offset();
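// A `jcc` with a 32-bit displacement is two opcode bytes (0x0F, 0x80 | cc)
// followed by a 4-byte displacement, so the label's patch point sits 2 bytes
// past the start of the instruction.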
34
let cond_disp_off = cond_start + 2;
35
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
36
emit_jcc_no_offset(sink, cc);
37
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
38
}
39
40
/// Like `one_way_jmp` above, this emits a conditional jump, but additionally
41
/// registers the branch with `MachBuffer::add_cond_branch`.
42
fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
43
let cond_start = sink.cur_offset();
44
let cond_disp_off = cond_start + 2;
45
let cond_end = cond_start + 6;
46
47
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
48
// FIXME: ideally this `inverted` calculation would go through the external
49
// assembler, but for now it's left done manually.
50
let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
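// Recording the inverted encoding lets `MachBuffer`'s branch simplification
// flip this branch (jump on the opposite condition) when the jump to the
// other target that follows it can be elided.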
51
sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);
52
53
emit_jcc_no_offset(sink, cc);
54
55
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
56
debug_assert_eq!(sink.cur_offset(), cond_end);
57
}
58
59
fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
60
// Note that the disassembler output matches Capstone, which doesn't map to the `CC`
61
// enum directly, since Intel defines multiple mnemonics that share the same encoding.
62
let inst: AsmInst = match cc {
63
CC::Z => asm::inst::je_d32::new(0).into(), // jz == je
64
CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne
65
CC::B => asm::inst::jb_d32::new(0).into(),
66
CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae
67
CC::BE => asm::inst::jbe_d32::new(0).into(),
68
CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja
69
CC::L => asm::inst::jl_d32::new(0).into(),
70
CC::LE => asm::inst::jle_d32::new(0).into(),
71
CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge
72
CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg
73
CC::O => asm::inst::jo_d32::new(0).into(),
74
CC::NO => asm::inst::jno_d32::new(0).into(),
75
CC::P => asm::inst::jp_d32::new(0).into(),
76
CC::NP => asm::inst::jnp_d32::new(0).into(),
77
CC::S => asm::inst::js_d32::new(0).into(),
78
CC::NS => asm::inst::jns_d32::new(0).into(),
79
};
80
inst.encode(&mut external::AsmCodeSink {
81
sink,
82
incoming_arg_offset: 0,
83
slot_offset: 0,
84
});
85
}
86
87
/// Emits an unconditional branch.
88
fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
89
let uncond_start = sink.cur_offset();
90
let uncond_disp_off = uncond_start + 1;
91
let uncond_end = uncond_start + 5;
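// `jmp rel32` is one opcode byte (0xE9) plus a 4-byte displacement, hence the
// +1 offset of the patch point and the 5-byte total length.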
92
93
sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);
94
sink.add_uncond_branch(uncond_start, uncond_end, label);
95
96
asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {
97
sink,
98
incoming_arg_offset: 0,
99
slot_offset: 0,
100
});
101
debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);
102
debug_assert_eq!(sink.cur_offset(), uncond_end);
103
}
104
105
/// Emits a relocation, attaching the current source location as well.
106
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
107
sink.add_reloc(kind, name, addend);
108
}
109
110
/// The top-level emit function.
111
///
112
/// Important! Do not add improved (shortened) encoding cases to existing
113
/// instructions without also adding tests for those improved encodings. That
114
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
115
/// code.
116
///
117
/// For all instructions, make sure to have test coverage for all of the
118
/// following situations. Do this by creating the cross product resulting from
119
/// applying the following rules to each operand:
120
///
121
/// (1) for any insn that mentions a register: one test using a register from
122
/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
123
/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
124
/// This helps detect incorrect REX prefix construction.
125
///
126
/// (2) for any insn that mentions a byte register: one test for each of the
127
/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
128
/// [r8b .. r11b] and [r12b .. r15b]. This checks that
129
/// apparently-redundant REX prefixes are retained when required.
130
///
131
/// (3) for any insn that contains an immediate field, check the following
132
/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
133
/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
134
/// instructions that require a 32-bit immediate have a short-form encoding
135
/// when the imm is in simm8 range.
136
///
137
/// Rules (1), (2) and (3) don't apply for registers within address expressions
138
/// (`Addr`s). Those are already pretty well tested, and the registers in them
139
/// don't have any effect on the containing instruction (apart from possibly
140
/// require REX prefix bits).
141
///
142
/// When choosing registers for a test, avoid using registers with the same
143
/// offset within a given group. For example, don't use rax and r8, since they
144
/// both have the lowest 3 bits as 000, and so the test won't detect errors
145
/// where those 3-bit register sub-fields are confused by the emitter. Instead
146
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
147
/// and bpl since they have the same offset in their group; use instead (eg) cl
148
/// and sil.
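/// The reason is that if the emitter writes the wrong operand's 3-bit field
/// into the ModRM byte, a test pairing two registers with identical low bits
/// (such as rax and r8) still produces the expected bytes, while a pairing
/// with different low bits (such as rax and r9) does not.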
149
///
150
/// For all instructions, also add a test that uses only low-half registers
151
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
152
/// prefixes are correctly omitted. This low-half restriction must apply to
153
/// _all_ registers in the insn, even those in address expressions.
154
///
155
/// Following these rules creates large numbers of test cases, but it's the
156
/// only way to make the emitter reliable.
157
///
158
/// Known possible improvements:
159
///
160
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
161
/// care?)
162
pub(crate) fn emit(
163
inst: &Inst,
164
sink: &mut MachBuffer<Inst>,
165
info: &EmitInfo,
166
state: &mut EmitState,
167
) {
168
if !inst.is_available(&info) {
169
let features = if let Inst::External { inst } = inst {
170
inst.features().to_string()
171
} else {
172
"see `is_available` source for feature term".to_string()
173
};
174
panic!(
175
"Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"
176
);
177
}
178
179
match inst {
180
Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
181
// Validate that the register constraints of the dividend and the
182
// destination are all as expected.
183
let (dst, size) = match inst {
184
Inst::CheckedSRemSeq {
185
dividend_lo,
186
dividend_hi,
187
dst_quotient,
188
dst_remainder,
189
size,
190
..
191
} => {
192
let dividend_lo = dividend_lo.to_reg();
193
let dividend_hi = dividend_hi.to_reg();
194
let dst_quotient = dst_quotient.to_reg().to_reg();
195
let dst_remainder = dst_remainder.to_reg().to_reg();
196
debug_assert_eq!(dividend_lo, regs::rax());
197
debug_assert_eq!(dividend_hi, regs::rdx());
198
debug_assert_eq!(dst_quotient, regs::rax());
199
debug_assert_eq!(dst_remainder, regs::rdx());
200
(regs::rdx(), *size)
201
}
202
Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
203
let dividend = dividend.to_reg();
204
let dst = dst.to_reg().to_reg();
205
debug_assert_eq!(dividend, regs::rax());
206
debug_assert_eq!(dst, regs::rax());
207
(regs::rax(), OperandSize::Size8)
208
}
209
_ => unreachable!(),
210
};
211
212
// Generates the following code sequence:
213
//
214
// cmp -1 %divisor
215
// jnz $do_op
216
//
217
// ;; for srem, result is 0
218
// mov #0, %dst
219
// j $done
220
//
221
// $do_op:
222
// idiv %divisor
223
//
224
// $done:
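//
// The -1 check is needed because the hardware `idiv` raises #DE when the
// quotient overflows, as it does for INT_MIN / -1; srem of those operands is
// instead defined to produce 0, which is what the early-out path above stores.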
225
226
let do_op = sink.get_label();
227
let done_label = sink.get_label();
228
229
// Check if the divisor is -1, and if it isn't then immediately
230
// go to the `idiv`.
231
let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
232
inst.emit(sink, info, state);
233
one_way_jmp(sink, CC::NZ, do_op);
234
235
// ... otherwise the divisor is -1 and the result is always 0. This
236
// is written to the destination register which will be %rax for
237
// 8-bit srem and %rdx otherwise.
238
//
239
// Note that for 16-to-64-bit srem operations this leaves the
240
// second destination, %rax, unchanged. This isn't semantically
241
// correct if a lowering actually tries to use the `dst_quotient`
242
// output but for srem only the `dst_remainder` output is used for
243
// now.
244
let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
245
inst.emit(sink, info, state);
246
let inst = Inst::jmp_known(done_label);
247
inst.emit(sink, info, state);
248
249
// Here the `idiv` is executed, which is different depending on the
250
// size
251
sink.bind_label(do_op, state.ctrl_plane_mut());
252
let rax = Gpr::RAX;
253
let rdx = Gpr::RDX;
254
let writable_rax = Writable::from_reg(rax);
255
let writable_rdx = Writable::from_reg(rdx);
256
let inst: AsmInst = match size {
257
OperandSize::Size8 => asm::inst::idivb_m::new(
258
PairedGpr::from(writable_rax),
259
*divisor,
260
TrapCode::INTEGER_DIVISION_BY_ZERO,
261
)
262
.into(),
263
264
OperandSize::Size16 => asm::inst::idivw_m::new(
265
PairedGpr::from(writable_rax),
266
PairedGpr::from(writable_rdx),
267
*divisor,
268
TrapCode::INTEGER_DIVISION_BY_ZERO,
269
)
270
.into(),
271
272
OperandSize::Size32 => asm::inst::idivl_m::new(
273
PairedGpr::from(writable_rax),
274
PairedGpr::from(writable_rdx),
275
*divisor,
276
TrapCode::INTEGER_DIVISION_BY_ZERO,
277
)
278
.into(),
279
280
OperandSize::Size64 => asm::inst::idivq_m::new(
281
PairedGpr::from(writable_rax),
282
PairedGpr::from(writable_rdx),
283
*divisor,
284
TrapCode::INTEGER_DIVISION_BY_ZERO,
285
)
286
.into(),
287
};
288
inst.emit(sink, info, state);
289
290
sink.bind_label(done_label, state.ctrl_plane_mut());
291
}
292
293
Inst::MovFromPReg { src, dst } => {
294
let src: Reg = (*src).into();
295
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
296
asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);
297
}
298
299
Inst::MovToPReg { src, dst } => {
300
let dst: Reg = (*dst).into();
301
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
302
let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
303
asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);
304
}
305
306
Inst::XmmCmove {
307
ty,
308
cc,
309
consequent,
310
alternative,
311
dst,
312
} => {
313
let alternative = *alternative;
314
let dst = *dst;
315
debug_assert_eq!(alternative, dst.to_reg());
316
let consequent = *consequent;
317
318
// Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
319
// this doesn't clobber flags. Make sure to not do so here.
320
let next = sink.get_label();
321
322
// Jump if cc is *not* set.
323
one_way_jmp(sink, cc.invert(), next);
324
Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
325
.emit(sink, info, state);
326
327
sink.bind_label(next, state.ctrl_plane_mut());
328
}
329
330
Inst::StackProbeLoop {
331
tmp,
332
frame_size,
333
guard_size,
334
} => {
335
assert!(info.flags.enable_probestack());
336
assert!(guard_size.is_power_of_two());
337
338
let tmp = *tmp;
339
340
// Number of probes that we need to perform
341
let probe_count = align_to(*frame_size, *guard_size) / guard_size;
342
343
// The inline stack probe loop has 3 phases:
344
//
345
// We generate the "guard area" register which is essentially the frame_size aligned to
346
// guard_size. We copy the stack pointer and subtract the guard area from it. This
347
// gets us a register that we can use to compare when looping.
348
//
349
// After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
350
// distance at a time and then touch the stack by writing anything to it. We use the previously
351
// created "guard area" register to know when to stop looping.
352
//
353
// When we have touched all the pages that we need, we have to restore the stack pointer
354
// to where it was before.
355
//
356
// Generate the following code:
357
// mov tmp_reg, rsp
358
// sub tmp_reg, guard_size * probe_count
359
// .loop_start:
360
// sub rsp, guard_size
361
// mov [rsp], rsp
362
// cmp rsp, tmp_reg
363
// jne .loop_start
364
// add rsp, guard_size * probe_count
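//
// For example, with frame_size = 12 KiB and guard_size = 4 KiB, probe_count
// is 3 and the loop stores to rsp - 4 KiB, rsp - 8 KiB, and rsp - 12 KiB
// before the final add restores rsp.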
365
366
// Create the guard bound register
367
// mov tmp_reg, rsp
368
let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
369
inst.emit(sink, info, state);
370
371
// sub tmp_reg, GUARD_SIZE * probe_count
372
let guard_plus_count = i32::try_from(guard_size * probe_count)
373
.expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");
374
Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);
375
376
// Emit the main loop!
377
let loop_start = sink.get_label();
378
sink.bind_label(loop_start, state.ctrl_plane_mut());
379
380
// sub rsp, GUARD_SIZE
381
let rsp = Writable::from_reg(regs::rsp());
382
let guard_size_ = i32::try_from(*guard_size)
383
.expect("`guard_size` is too large to fit in a 32-bit immediate");
384
Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);
385
386
// TODO: `mov [rsp], 0` would be better, but we don't have that instruction
387
// Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
388
// instruction size.
389
// mov [rsp], rsp
390
asm::inst::movl_mr::new(Amode::imm_reg(0, regs::rsp()), Gpr::RSP)
391
.emit(sink, info, state);
392
393
// Compare and jump if we are not done yet
394
// cmp rsp, tmp_reg
395
let tmp = Gpr::unwrap_new(tmp.to_reg());
396
asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);
397
398
// jne .loop_start
399
// TODO: Encoding the conditional jump as a short jump
400
// could save us 4 bytes here.
401
one_way_jmp(sink, CC::NZ, loop_start);
402
403
// The regular prologue code is going to emit a `sub` after this, so we need to
404
// reset the stack pointer
405
//
406
// TODO: It would be better if we could avoid the `add` + `sub` that is generated here
407
// and in the stack adj portion of the prologue
408
//
409
// add rsp, GUARD_SIZE * probe_count
410
Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);
411
}
412
413
Inst::CallKnown { info: call_info } => {
414
let stack_map = state.take_stack_map();
415
416
asm::inst::callq_d::new(0).emit(sink, info, state);
417
418
// The last 4 bytes of `callq` are the relative displacement to where
419
// we're calling, so that's where the reloc is registered.
420
//
421
// The addend adjusts for the difference between the end of the
422
// instruction and the beginning of the immediate field.
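// Concretely, `callq` here is the one-byte opcode 0xE8 followed by a 4-byte
// rel32, so the displacement field begins at `len - 4`.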
423
let len = sink.cur_offset();
424
sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
425
426
if let Some(s) = stack_map {
427
sink.push_user_stack_map(state, len, s);
428
}
429
430
if let Some(try_call) = call_info.try_call_info.as_ref() {
431
sink.add_try_call_site(
432
Some(state.frame_layout().sp_to_fp()),
433
try_call.exception_handlers(&state.frame_layout()),
434
);
435
} else {
436
sink.add_call_site();
437
}
438
439
// Reclaim the outgoing argument area that was released by the
440
// callee, to ensure that StackAMode values are always computed from
441
// a consistent SP.
442
if call_info.callee_pop_size > 0 {
443
let rsp = Writable::from_reg(regs::rsp());
444
let callee_pop_size = i32::try_from(call_info.callee_pop_size)
445
.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
446
Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
447
}
448
449
// Load any stack-carried return values.
450
call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
451
state.frame_layout().stackslots_size,
452
|inst| inst.emit(sink, info, state),
453
|_space_needed| None,
454
);
455
456
// If this is a try-call, jump to the continuation
457
// (normal-return) block.
458
if let Some(try_call) = call_info.try_call_info.as_ref() {
459
let jmp = Inst::JmpKnown {
460
dst: try_call.continuation,
461
};
462
jmp.emit(sink, info, state);
463
}
464
}
465
466
Inst::ReturnCallKnown { info: call_info } => {
467
emit_return_call_common_sequence(sink, info, state, &call_info);
468
469
// Finally, jump to the callee!
470
//
471
// Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
472
// different metadata in this case: we don't have a label for the
473
// target, but rather a function relocation.
474
asm::inst::jmp_d32::new(0).emit(sink, info, state);
475
let offset = sink.cur_offset();
476
// The addend adjusts for the difference between the end of the instruction and the
477
// beginning of the immediate field.
478
sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
479
sink.add_call_site();
480
}
481
482
Inst::ReturnCallUnknown { info: call_info } => {
483
let callee = call_info.dest;
484
485
emit_return_call_common_sequence(sink, info, state, &call_info);
486
487
asm::inst::jmpq_m::new(callee).emit(sink, info, state);
488
sink.add_call_site();
489
}
490
491
Inst::CallUnknown {
492
info: call_info, ..
493
} => {
494
let stack_map = state.take_stack_map();
495
496
let dest = match call_info.dest.clone() {
497
RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),
498
RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
499
};
500
501
asm::inst::callq_m::new(dest).emit(sink, info, state);
502
503
if let Some(s) = stack_map {
504
let offset = sink.cur_offset();
505
sink.push_user_stack_map(state, offset, s);
506
}
507
508
if let Some(try_call) = call_info.try_call_info.as_ref() {
509
sink.add_try_call_site(
510
Some(state.frame_layout().sp_to_fp()),
511
try_call.exception_handlers(&state.frame_layout()),
512
);
513
} else {
514
sink.add_call_site();
515
}
516
517
// Reclaim the outgoing argument area that was released by the callee, to ensure that
518
// StackAMode values are always computed from a consistent SP.
519
if call_info.callee_pop_size > 0 {
520
let rsp = Writable::from_reg(regs::rsp());
521
let callee_pop_size = i32::try_from(call_info.callee_pop_size)
522
.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
523
Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
524
}
525
526
// Load any stack-carried return values.
527
call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
528
state.frame_layout().stackslots_size,
529
|inst| inst.emit(sink, info, state),
530
|_space_needed| None,
531
);
532
533
if let Some(try_call) = call_info.try_call_info.as_ref() {
534
let jmp = Inst::JmpKnown {
535
dst: try_call.continuation,
536
};
537
jmp.emit(sink, info, state);
538
}
539
}
540
541
Inst::Args { .. } => {}
542
Inst::Rets { .. } => {}
543
544
Inst::StackSwitchBasic {
545
store_context_ptr,
546
load_context_ptr,
547
in_payload0,
548
out_payload0,
549
} => {
550
// Note that we do not emit anything for preserving and restoring
551
// ordinary registers here: That's taken care of by regalloc for us,
552
// since we marked this instruction as clobbering all registers.
553
//
554
// Also note that we do nothing about passing the single payload
555
// value: We've informed regalloc that it is sent and received via
556
// the fixed register given by [stack_switch::payload_register]
557
558
let (tmp1, tmp2) = {
559
// Ideally we would just ask regalloc for two temporary registers.
560
// However, adding any early defs to the constraints on StackSwitch
561
// causes TooManyLiveRegs. Fortunately, we can manually find tmp
562
// registers without regalloc: Since our instruction clobbers all
563
// registers, we can simply pick any register that is not assigned
564
// to the operands.
565
566
let all = crate::isa::x64::abi::ALL_CLOBBERS;
567
568
let used_regs = [
569
**load_context_ptr,
570
**store_context_ptr,
571
**in_payload0,
572
*out_payload0.to_reg(),
573
];
574
575
let mut tmps = all.into_iter().filter_map(|preg| {
576
let reg: Reg = preg.into();
577
if !used_regs.contains(&reg) {
578
WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
579
} else {
580
None
581
}
582
});
583
(tmps.next().unwrap(), tmps.next().unwrap())
584
};
585
586
let layout = stack_switch::control_context_layout();
587
let rsp_offset = layout.stack_pointer_offset as i32;
588
let pc_offset = layout.ip_offset as i32;
589
let rbp_offset = layout.frame_pointer_offset as i32;
590
591
// Location to which someone switching back to this stack will jump
592
// to: right behind the `StackSwitch` instruction.
593
let resume = sink.get_label();
594
595
//
596
// For RBP and RSP we do the following:
597
// - Load new value for register from `load_context_ptr` +
598
// corresponding offset.
599
// - Store previous (!) value of register at `store_context_ptr` +
600
// corresponding offset.
601
//
602
// Since `load_context_ptr` and `store_context_ptr` are allowed to be
603
// equal, we need to use a temporary register here.
604
//
605
606
let mut exchange = |offset, reg| {
607
let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));
608
asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
609
610
asm::inst::movq_mr::new(
611
Amode::imm_reg(offset, **store_context_ptr),
612
Gpr::new(reg).unwrap(),
613
)
614
.emit(sink, info, state);
615
616
let dst = Writable::from_reg(reg);
617
asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())
618
.emit(sink, info, state);
619
};
620
621
exchange(rsp_offset, regs::rsp());
622
exchange(rbp_offset, regs::rbp());
623
624
//
625
// Load target PC, store resume PC, jump to target PC
626
//
627
628
let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));
629
asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
630
631
let amode = Amode::RipRelative { target: resume };
632
asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);
633
634
asm::inst::movq_mr::new(
635
Amode::imm_reg(pc_offset, **store_context_ptr),
636
tmp2.to_reg(),
637
)
638
.emit(sink, info, state);
639
640
asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
641
642
sink.bind_label(resume, state.ctrl_plane_mut());
643
}
644
645
Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),
646
647
Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),
648
649
Inst::JmpCond {
650
cc,
651
taken,
652
not_taken,
653
} => {
654
cond_jmp(sink, *cc, *taken);
655
uncond_jmp(sink, *not_taken);
656
}
657
658
Inst::JmpCondOr {
659
cc1,
660
cc2,
661
taken,
662
not_taken,
663
} => {
664
// Emit:
665
// jcc1 taken
666
// jcc2 taken
667
// jmp not_taken
668
//
669
// Note that we enroll both conditionals in the
670
// branch-chomping mechanism because MachBuffer
671
// simplification can continue upward as long as it keeps
672
// chomping branches. In the best case, if taken ==
673
// not_taken and that one block is the fallthrough block,
674
// all three branches can disappear.
675
676
cond_jmp(sink, *cc1, *taken);
677
cond_jmp(sink, *cc2, *taken);
678
uncond_jmp(sink, *not_taken);
679
}
680
681
&Inst::JmpTableSeq {
682
idx,
683
tmp1,
684
tmp2,
685
ref targets,
686
ref default_target,
687
..
688
} => {
689
// This sequence is *one* instruction in the vcode, and is expanded only here at
690
// emission time, because we cannot allow the regalloc to insert spills/reloads in
691
// the middle; we depend on hardcoded PC-rel addressing below.
692
//
693
// We don't have to worry about emitting islands, because the only label-use type has a
694
// maximum range of 2 GB. If we later consider using shorter-range label references,
695
// this will need to be revisited.
696
697
// We generate the following sequence. Note that the only read of %idx is before the
698
// write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
699
// if you change this.
700
// lea start_of_jump_table_offset(%rip), %tmp1
701
// movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
702
// addq %tmp2, %tmp1
703
// j *%tmp1
704
// $start_of_jump_table:
705
// -- jump table entries
706
707
// Load base address of jump table.
708
let start_of_jumptable = sink.get_label();
709
asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))
710
.emit(sink, info, state);
711
712
// Load value out of the jump table. It's a relative offset to the target block, so it
713
// might be negative; use a sign-extension.
714
let inst = Inst::movsx_rm_r(
715
ExtMode::LQ,
716
RegMem::mem(Amode::imm_reg_reg_shift(
717
0,
718
Gpr::unwrap_new(tmp1.to_reg()),
719
Gpr::unwrap_new(idx),
720
2,
721
)),
722
tmp2,
723
);
724
inst.emit(sink, info, state);
725
726
// Add base of jump table to jump-table-sourced block offset.
727
asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);
728
729
// Branch to computed address.
730
asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
731
732
// Emit jump table (table of 32-bit offsets).
733
sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
734
let jt_off = sink.cur_offset();
735
for &target in targets.iter().chain(std::iter::once(default_target)) {
736
let word_off = sink.cur_offset();
737
// off_into_table is an addend here embedded in the label to be later patched at
738
// the end of codegen. The offset is initially relative to this jump table entry;
739
// with the extra addend, it'll be relative to the jump table's start, after
740
// patching.
741
let off_into_table = word_off - jt_off;
742
sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
743
sink.put4(off_into_table);
744
}
745
}
746
747
Inst::TrapIf { cc, trap_code } => {
748
let trap_label = sink.defer_trap(*trap_code);
749
one_way_jmp(sink, *cc, trap_label);
750
}
751
752
Inst::TrapIfAnd {
753
cc1,
754
cc2,
755
trap_code,
756
} => {
757
let trap_label = sink.defer_trap(*trap_code);
758
let else_label = sink.get_label();
759
760
// Jump to the end if the first condition isn't true, and then if
761
// the second condition is true go to the trap.
762
one_way_jmp(sink, cc1.invert(), else_label);
763
one_way_jmp(sink, *cc2, trap_label);
764
765
sink.bind_label(else_label, state.ctrl_plane_mut());
766
}
767
768
Inst::TrapIfOr {
769
cc1,
770
cc2,
771
trap_code,
772
} => {
773
let trap_label = sink.defer_trap(*trap_code);
774
775
// Emit two jumps to the same trap if either condition code is true.
776
one_way_jmp(sink, *cc1, trap_label);
777
one_way_jmp(sink, *cc2, trap_label);
778
}
779
780
Inst::XmmMinMaxSeq {
781
size,
782
is_min,
783
lhs,
784
rhs,
785
dst,
786
} => {
787
let rhs = rhs.to_reg();
788
let lhs = lhs.to_reg();
789
let dst = dst.to_writable_reg();
790
debug_assert_eq!(rhs, dst.to_reg());
791
792
// Generates the following sequence:
793
// cmpss/cmpsd %lhs, %rhs_dst
794
// jnz do_min_max
795
// jp propagate_nan
796
//
797
// ;; ordered and equal: propagate the sign bit (for -0 vs 0):
798
// {and,or}{ss,sd} %lhs, %rhs_dst
799
// j done
800
//
801
// ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
802
// ;; NaN value is returned), we add both inputs.
803
// propagate_nan:
804
// add{ss,sd} %lhs, %rhs_dst
805
// j done
806
//
807
// do_min_max:
808
// {min,max}{ss,sd} %lhs, %rhs_dst
809
//
810
// done:
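//
// Merging the sign bits in the ordered-and-equal case gives the expected
// signed-zero behavior: for min, OR yields -0.0 from (+0.0, -0.0); for max,
// AND yields +0.0. For any other equal, ordered inputs the operands are
// bit-identical and the bitwise op is a no-op.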
811
let done = sink.get_label();
812
let propagate_nan = sink.get_label();
813
let do_min_max = sink.get_label();
814
815
let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
816
OperandSize::Size32 => (
817
asm::inst::addss_a::new(dst, lhs).into(),
818
asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),
819
asm::inst::andps_a::new(dst, lhs).into(),
820
asm::inst::orps_a::new(dst, lhs).into(),
821
if *is_min {
822
asm::inst::minss_a::new(dst, lhs).into()
823
} else {
824
asm::inst::maxss_a::new(dst, lhs).into()
825
},
826
),
827
OperandSize::Size64 => (
828
asm::inst::addsd_a::new(dst, lhs).into(),
829
asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),
830
asm::inst::andpd_a::new(dst, lhs).into(),
831
asm::inst::orpd_a::new(dst, lhs).into(),
832
if *is_min {
833
asm::inst::minsd_a::new(dst, lhs).into()
834
} else {
835
asm::inst::maxsd_a::new(dst, lhs).into()
836
},
837
),
838
_ => unreachable!(),
839
};
840
let add_op: AsmInst = add_op;
841
let or_op: AsmInst = or_op;
842
let min_max_op: AsmInst = min_max_op;
843
let cmp_op: AsmInst = cmp_op;
844
845
cmp_op.emit(sink, info, state);
846
847
one_way_jmp(sink, CC::NZ, do_min_max);
848
one_way_jmp(sink, CC::P, propagate_nan);
849
850
// Ordered and equal. The operands are bit-identical unless they are zero
851
// and negative zero. These instructions merge the sign bits in that
852
// case, and are no-ops otherwise.
853
let inst: AsmInst = if *is_min { or_op } else { and_op };
854
inst.emit(sink, info, state);
855
856
let inst = Inst::jmp_known(done);
857
inst.emit(sink, info, state);
858
859
// x86's min/max are not symmetric; if either operand is a NaN, they return the
860
// read-only operand: perform an addition between the two operands, which has the
861
// desired NaN propagation effects.
862
sink.bind_label(propagate_nan, state.ctrl_plane_mut());
863
add_op.emit(sink, info, state);
864
865
one_way_jmp(sink, CC::P, done);
866
867
sink.bind_label(do_min_max, state.ctrl_plane_mut());
868
min_max_op.emit(sink, info, state);
869
870
sink.bind_label(done, state.ctrl_plane_mut());
871
}
872
873
Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {
874
// These instruction formats only exist to declare a register as a
875
// `def`; no code is emitted. This is always immediately followed by
876
// an instruction, such as `xor <tmp>, <tmp>`, that semantically
877
// reads this undefined value but arithmetically produces the same
878
// result regardless of its value.
879
}
880
881
Inst::CvtUint64ToFloatSeq {
882
dst_size,
883
src,
884
dst,
885
tmp_gpr1,
886
tmp_gpr2,
887
} => {
888
let src = src.to_reg();
889
let dst = dst.to_writable_reg();
890
let tmp_gpr1 = tmp_gpr1.to_writable_reg();
891
let tmp_gpr2 = tmp_gpr2.to_writable_reg();
892
893
// Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
894
// different sequence.
895
//
896
// Emit the following sequence:
897
//
898
// cmp 0, %src
899
// jl handle_negative
900
//
901
// ;; handle positive, which can't overflow
902
// cvtsi2sd/cvtsi2ss %src, %dst
903
// j done
904
//
905
// ;; handle negative: see below for an explanation of what it's doing.
906
// handle_negative:
907
// mov %src, %tmp_gpr1
908
// shr $1, %tmp_gpr1
909
// mov %src, %tmp_gpr2
910
// and $1, %tmp_gpr2
911
// or %tmp_gpr1, %tmp_gpr2
912
// cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
913
// addsd/addss %dst, %dst
914
//
915
// done:
916
917
assert_ne!(src, tmp_gpr1.to_reg());
918
assert_ne!(src, tmp_gpr2.to_reg());
919
920
let handle_negative = sink.get_label();
921
let done = sink.get_label();
922
923
// If x seen as a signed int64 is not negative, a signed-conversion will do the right
924
// thing.
925
// TODO: use `test src, src` here.
926
asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);
927
928
one_way_jmp(sink, CC::L, handle_negative);
929
930
// Handle a positive int64, which is the "easy" case: a signed conversion will do the
931
// right thing.
932
emit_signed_cvt(
933
sink,
934
info,
935
state,
936
src,
937
dst,
938
*dst_size == OperandSize::Size64,
939
);
940
941
let inst = Inst::jmp_known(done);
942
inst.emit(sink, info, state);
943
944
sink.bind_label(handle_negative, state.ctrl_plane_mut());
945
946
// Divide x by two to get it in range for the signed conversion, keep the LSB, and
947
// scale it back up on the FP side.
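// OR-ing the discarded LSB back in acts as a sticky bit: it makes the
// rounding of the halved value, once doubled by the final add, agree with a
// direct rounding of the original u64, avoiding a double-rounding error.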
948
let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
949
inst.emit(sink, info, state);
950
951
// tmp_gpr1 := src >> 1
952
asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);
953
954
let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
955
inst.emit(sink, info, state);
956
957
asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);
958
959
asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);
960
961
emit_signed_cvt(
962
sink,
963
info,
964
state,
965
tmp_gpr2.to_reg(),
966
dst,
967
*dst_size == OperandSize::Size64,
968
);
969
970
let inst: AsmInst = match *dst_size {
971
OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),
972
OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),
973
_ => unreachable!(),
974
};
975
inst.emit(sink, info, state);
976
977
sink.bind_label(done, state.ctrl_plane_mut());
978
}
979
980
Inst::CvtFloatToSintSeq {
981
src_size,
982
dst_size,
983
is_saturating,
984
src,
985
dst,
986
tmp_gpr,
987
tmp_xmm,
988
} => {
989
use OperandSize::*;
990
991
let src = src.to_reg();
992
let dst = dst.to_writable_reg();
993
let tmp_gpr = tmp_gpr.to_writable_reg();
994
let tmp_xmm = tmp_xmm.to_writable_reg();
995
996
// Emits the following common sequence:
997
//
998
// cvttss2si/cvttsd2si %src, %dst
999
// cmp %dst, 1
1000
// jno done
1001
//
1002
// Then, for saturating conversions:
1003
//
1004
// ;; check for NaN
1005
// cmpss/cmpsd %src, %src
1006
// jnp not_nan
1007
// xor %dst, %dst
1008
//
1009
// ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
1010
// ;; already in %dst.
1011
// xorpd %tmp_xmm, %tmp_xmm
1012
// cmpss/cmpsd %src, %tmp_xmm
1013
// jnb done
1014
// mov/movaps $INT_MAX, %dst
1015
//
1016
// done:
1017
//
1018
// Then, for non-saturating conversions:
1019
//
1020
// ;; check for NaN
1021
// cmpss/cmpsd %src, %src
1022
// jnp not_nan
1023
// ud2 trap BadConversionToInteger
1024
//
1025
// ;; check if INT_MIN was the correct result, against a magic constant:
1026
// not_nan:
1027
// movaps/mov $magic, %tmp_gpr
1028
// movq/movd %tmp_gpr, %tmp_xmm
1029
// cmpss/cmpsd %tmp_xmm, %src
1030
// jnb/jnbe $check_positive
1031
// ud2 trap IntegerOverflow
1032
//
1033
// ;; if positive, it was a real overflow
1034
// check_positive:
1035
// xorpd %tmp_xmm, %tmp_xmm
1036
// cmpss/cmpsd %src, %tmp_xmm
1037
// jnb done
1038
// ud2 trap IntegerOverflow
1039
//
1040
// done:
1041
1042
let cmp_op: AsmInst = match src_size {
1043
Size64 => asm::inst::ucomisd_a::new(src, src).into(),
1044
Size32 => asm::inst::ucomiss_a::new(src, src).into(),
1045
_ => unreachable!(),
1046
};
1047
1048
let cvtt_op = |dst, src| Inst::External {
1049
inst: match (*src_size, *dst_size) {
1050
(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1051
(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1052
(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1053
(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1054
_ => unreachable!(),
1055
},
1056
};
1057
1058
let done = sink.get_label();
1059
1060
// The truncation.
1061
cvtt_op(dst, src).emit(sink, info, state);
1062
1063
// Compare against 1: in case of overflow, the dst operand was INT_MIN.
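// Subtracting 1 sets the overflow flag only when the value is INT_MIN, which
// is exactly what cvttss2si/cvttsd2si produce for out-of-range or NaN inputs,
// so `jno` below means the truncation already succeeded.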
1064
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
1065
inst.emit(sink, info, state);
1066
1067
one_way_jmp(sink, CC::NO, done); // no overflow => done
1068
1069
// Check for NaN.
1070
cmp_op.emit(sink, info, state);
1071
1072
if *is_saturating {
1073
let not_nan = sink.get_label();
1074
one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
1075
1076
// For NaN, emit 0.
1077
let inst: AsmInst = match *dst_size {
1078
OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1079
OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1080
_ => unreachable!(),
1081
};
1082
inst.emit(sink, info, state);
1083
1084
let inst = Inst::jmp_known(done);
1085
inst.emit(sink, info, state);
1086
1087
sink.bind_label(not_nan, state.ctrl_plane_mut());
1088
1089
// If the input was positive, saturate to INT_MAX.
1090
1091
// Zero out tmp_xmm.
1092
asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1093
1094
let inst: AsmInst = match src_size {
1095
Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1096
Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1097
_ => unreachable!(),
1098
};
1099
inst.emit(sink, info, state);
1100
1101
// Jump if >= to done.
1102
one_way_jmp(sink, CC::NB, done);
1103
1104
// Otherwise, put INT_MAX.
1105
if *dst_size == OperandSize::Size64 {
1106
let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
1107
inst.emit(sink, info, state);
1108
} else {
1109
let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
1110
inst.emit(sink, info, state);
1111
}
1112
} else {
1113
let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1114
inst.emit(sink, info, state);
1115
1116
// Check if INT_MIN was the correct result: determine the smallest floating point
1117
// number that would convert to INT_MIN, put it in a temporary register, and compare
1118
// against the src register.
1119
// If the src register is less (or in some cases, less-or-equal) than the threshold,
1120
// trap!
1121
1122
let mut no_overflow_cc = CC::NB; // >=
1123
let output_bits = dst_size.to_bits();
1124
match *src_size {
1125
OperandSize::Size32 => {
1126
let cst = (-Ieee32::pow2(output_bits - 1)).bits();
1127
let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
1128
inst.emit(sink, info, state);
1129
}
1130
OperandSize::Size64 => {
1131
// An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
1132
// so there are values less than -2^(N-1) that convert correctly to INT_MIN.
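// For example, for a 32-bit result, any input strictly greater than
// -2^31 - 1 (= -2147483649.0) truncates to a value >= i32::MIN, so the
// no-overflow comparison must be strict (`>`, i.e. CC::NBE).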
1133
let cst = if output_bits < 64 {
1134
no_overflow_cc = CC::NBE; // >
1135
Ieee64::fcvt_to_sint_negative_overflow(output_bits)
1136
} else {
1137
-Ieee64::pow2(output_bits - 1)
1138
};
1139
let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
1140
inst.emit(sink, info, state);
1141
}
1142
_ => unreachable!(),
1143
}
1144
1145
let inst: AsmInst = {
1146
let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1147
match src_size {
1148
Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1149
Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1150
_ => unreachable!(),
1151
}
1152
};
1153
inst.emit(sink, info, state);
1154
1155
let inst: AsmInst = match src_size {
1156
Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1157
Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1158
_ => unreachable!(),
1159
};
1160
inst.emit(sink, info, state);
1161
1162
// no trap if src >= or > threshold
1163
let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
1164
inst.emit(sink, info, state);
1165
1166
// If positive, it was a real overflow.
1167
1168
// Zero out the tmp_xmm register.
1169
asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1170
1171
let inst: AsmInst = match src_size {
1172
Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1173
Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1174
_ => unreachable!(),
1175
};
1176
inst.emit(sink, info, state);
1177
1178
// no trap if 0 >= src
1179
let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
1180
inst.emit(sink, info, state);
1181
}
1182
1183
sink.bind_label(done, state.ctrl_plane_mut());
1184
}
1185
1186
Inst::CvtFloatToUintSeq {
1187
src_size,
1188
dst_size,
1189
is_saturating,
1190
src,
1191
dst,
1192
tmp_gpr,
1193
tmp_xmm,
1194
tmp_xmm2,
1195
} => {
1196
use OperandSize::*;
1197
1198
let src = src.to_reg();
1199
let dst = dst.to_writable_reg();
1200
let tmp_gpr = tmp_gpr.to_writable_reg();
1201
let tmp_xmm = tmp_xmm.to_writable_reg();
1202
let tmp_xmm2 = tmp_xmm2.to_writable_reg();
1203
1204
// The only difference in behavior between saturating and non-saturating is how we
1205
// handle errors. Emits the following sequence:
1206
//
1207
// movaps/mov 2**(int_width - 1), %tmp_gpr
1208
// movq/movd %tmp_gpr, %tmp_xmm
1209
// cmpss/cmpsd %tmp_xmm, %src
1210
// jnb is_large
1211
//
1212
// ;; check for NaN inputs
1213
// jnp not_nan
1214
// -- non-saturating: ud2 trap BadConversionToInteger
1215
// -- saturating: xor %dst, %dst; j done
1216
//
1217
// not_nan:
1218
// cvttss2si/cvttsd2si %src, %dst
1219
// cmp 0, %dst
1220
// jnl done
1221
// -- non-saturating: ud2 trap IntegerOverflow
1222
// -- saturating: xor %dst, %dst; j done
1223
//
1224
// is_large:
1225
// mov %src, %tmp_xmm2
1226
// subss/subsd %tmp_xmm, %tmp_xmm2
1227
// cvttss2si/cvttsd2si %tmp_xmm2, %dst
1228
// cmp 0, %dst
1229
// jnl next_is_large
1230
// -- non-saturating: ud2 trap IntegerOverflow
1231
// -- saturating: movaps $UINT_MAX, %dst; j done
1232
//
1233
// next_is_large:
1234
// add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
1235
//
1236
// done:
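//
// The `is_large` path works because the signed cvtt instructions can only
// produce results below 2**(int_width - 1): subtracting that bias in the FP
// domain brings a large input into signed range, and the bias is added back
// as an integer afterwards.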
1237
1238
assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");
1239
1240
let xor_op = |dst, src| Inst::External {
1241
inst: match *dst_size {
1242
Size32 => asm::inst::xorl_rm::new(dst, src).into(),
1243
Size64 => asm::inst::xorq_rm::new(dst, src).into(),
1244
_ => unreachable!(),
1245
},
1246
};
1247
1248
let subs_op = |dst, src| Inst::External {
1249
inst: match *src_size {
1250
Size32 => asm::inst::subss_a::new(dst, src).into(),
1251
Size64 => asm::inst::subsd_a::new(dst, src).into(),
1252
_ => unreachable!(),
1253
},
1254
};
1255
1256
let cvtt_op = |dst, src| Inst::External {
1257
inst: match (*src_size, *dst_size) {
1258
(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1259
(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1260
(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1261
(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1262
_ => unreachable!(),
1263
},
1264
};
1265
1266
let done = sink.get_label();
1267
1268
let cst = match src_size {
1269
OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
1270
OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
1271
_ => unreachable!(),
1272
};
1273
1274
let inst = Inst::imm(*src_size, cst, tmp_gpr);
1275
inst.emit(sink, info, state);
1276
1277
let inst: AsmInst = {
1278
let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1279
match src_size {
1280
Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1281
Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1282
_ => unreachable!(),
1283
}
1284
};
1285
inst.emit(sink, info, state);
1286
1287
let inst: AsmInst = match src_size {
1288
Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1289
Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1290
_ => unreachable!(),
1291
};
1292
inst.emit(sink, info, state);
1293
1294
let handle_large = sink.get_label();
1295
one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
1296
1297
if *is_saturating {
1298
// If not NaN, jump over this 0-return; otherwise return 0.
1299
let not_nan = sink.get_label();
1300
one_way_jmp(sink, CC::NP, not_nan);
1301
1302
xor_op(dst, dst).emit(sink, info, state);
1303
1304
let inst = Inst::jmp_known(done);
1305
inst.emit(sink, info, state);
1306
sink.bind_label(not_nan, state.ctrl_plane_mut());
1307
} else {
1308
// Trap.
1309
let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1310
inst.emit(sink, info, state);
1311
}
1312
1313
// Actual truncation for small inputs: if the result is not positive, then we had an
1314
// overflow.
1315
1316
cvtt_op(dst, src).emit(sink, info, state);
1317
1318
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1319
inst.emit(sink, info, state);
1320
1321
one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
1322
1323
if *is_saturating {
1324
// The input was "small" (< 2**(width -1)), so the only way to get an integer
1325
// overflow is because the input was too small: saturate to the min value, i.e. 0.
1326
let inst: AsmInst = match *dst_size {
1327
OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1328
OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1329
_ => unreachable!(),
1330
};
1331
inst.emit(sink, info, state);
1332
1333
let inst = Inst::jmp_known(done);
1334
inst.emit(sink, info, state);
1335
} else {
1336
// Trap.
1337
asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);
1338
}
1339
1340
// Now handle large inputs.
1341
1342
sink.bind_label(handle_large, state.ctrl_plane_mut());
1343
1344
let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
1345
inst.emit(sink, info, state);
1346
1347
subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);
1348
1349
cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);
1350
1351
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1352
inst.emit(sink, info, state);
1353
1354
if *is_saturating {
1355
let next_is_large = sink.get_label();
1356
one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
1357
1358
// The input was "large" (>= 2**(width -1)), so the only way to get an integer
1359
// overflow is because the input was too large: saturate to the max value.
1360
let inst = Inst::imm(
1361
OperandSize::Size64,
1362
if *dst_size == OperandSize::Size64 {
1363
u64::max_value()
1364
} else {
1365
u32::max_value() as u64
1366
},
1367
dst,
1368
);
1369
inst.emit(sink, info, state);
1370
1371
let inst = Inst::jmp_known(done);
1372
inst.emit(sink, info, state);
1373
sink.bind_label(next_is_large, state.ctrl_plane_mut());
1374
} else {
1375
let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
1376
inst.emit(sink, info, state);
1377
}
1378
1379
if *dst_size == OperandSize::Size64 {
1380
let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
1381
inst.emit(sink, info, state);
1382
1383
asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);
1384
} else {
1385
asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);
1386
}
1387
1388
sink.bind_label(done, state.ctrl_plane_mut());
1389
}
1390
1391
Inst::LoadExtName {
1392
dst,
1393
name,
1394
offset,
1395
distance,
1396
} => {
1397
let name = &**name;
1398
let riprel = asm::Amode::RipRelative {
1399
target: asm::DeferredTarget::None,
1400
};
1401
if info.flags.is_pic() {
1402
// Generates: movq symbol@GOTPCREL(%rip), %dst
1403
asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);
1404
let cur = sink.cur_offset();
1405
sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);
1406
1407
// Offset in the relocation above applies to the address of the
1408
// *GOT entry*, not the loaded address; so we emit a separate
1409
// add instruction if needed.
1410
let offset = i32::try_from(*offset).unwrap();
1411
if offset != 0 {
1412
asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)
1413
.emit(sink, info, state);
1414
}
1415
} else if distance == &RelocDistance::Near {
1416
// If we know the distance to the name is within 2GB (e.g., a
1417
// module-local function), we can generate a RIP-relative
1418
// address, with a relocation.
1419
asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);
1420
let cur = sink.cur_offset();
1421
sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);
1422
} else {
1423
// The full address can be encoded in the register, with a
1424
// relocation.
1425
asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);
1426
let cur = sink.cur_offset();
1427
sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);
1428
}
1429
}
1430
1431
Inst::AtomicRmwSeq {
1432
ty,
1433
op,
1434
mem,
1435
operand,
1436
temp,
1437
dst_old,
1438
} => {
1439
let operand = *operand;
1440
let temp = *temp;
1441
let temp_r = temp.map(|r| *r);
1442
let dst_old = *dst_old;
1443
let dst_old_r = dst_old.map(|r| *r);
1444
debug_assert_eq!(dst_old.to_reg(), regs::rax());
1445
let mem = mem.finalize(state.frame_layout(), sink).clone();
1446
1447
// Emit this:
1448
// mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
1449
// again:
1450
// movq %rax, %r_temp // rax = old value, r_temp = old value
1451
// `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
1452
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
1453
// jnz again // If this is taken, rax will have a "revised" old value
1454
//
1455
// Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
1456
// value), %r_temp (trashed), %rflags (trashed)
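//
// Note that when `lock cmpxchg` fails it also reloads %rax with the value it
// observed in memory, so the loop can retry without an extra load.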
1457
let again_label = sink.get_label();
1458
1459
// mov{zbq,zwq,zlq,q} (%r_address), %rax
1460
// No need to call `add_trap` here, since the `i1` emit will do that.
1461
let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);
1462
i1.emit(sink, info, state);
1463
1464
// again:
1465
sink.bind_label(again_label, state.ctrl_plane_mut());
1466
1467
// movq %rax, %r_temp
1468
asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);
1469
1470
use AtomicRmwSeqOp as RmwOp;
1471
match op {
1472
RmwOp::Nand => {
1473
// andq %r_operand, %r_temp
1474
asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1475
1476
// notq %r_temp
1477
asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);
1478
}
1479
RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1480
// cmp %r_temp, %r_operand
1481
let temp = temp.to_reg();
1482
match *ty {
1483
types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),
1484
types::I16 => {
1485
asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)
1486
}
1487
types::I32 => {
1488
asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)
1489
}
1490
types::I64 => {
1491
asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)
1492
}
1493
_ => unreachable!(),
1494
}
1495
1496
// cmovcc %r_operand, %r_temp
1497
match op {
1498
RmwOp::Umin => {
1499
asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)
1500
}
1501
RmwOp::Umax => {
1502
asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)
1503
}
1504
RmwOp::Smin => {
1505
asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)
1506
}
1507
RmwOp::Smax => {
1508
asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)
1509
}
1510
_ => unreachable!(),
1511
}
1512
}
1513
RmwOp::And => {
1514
// andq %r_operand, %r_temp
1515
asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1516
}
1517
RmwOp::Or => {
1518
// orq %r_operand, %r_temp
1519
asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);
1520
}
1521
RmwOp::Xor => {
1522
// xorq %r_operand, %r_temp
1523
asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);
1524
}
1525
}
1526
1527
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
1528
// No need to call `add_trap` here, since emitting the `cmpxchg` below will do that.
1529
let temp = temp.to_reg();
1530
let dst_old = PairedGpr::from(dst_old);
1531
let inst: AsmInst = match *ty {
1532
types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),
1533
types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),
1534
types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),
1535
types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),
1536
_ => unreachable!(),
1537
};
1538
inst.emit(sink, info, state);
1539
1540
// jnz again
1541
one_way_jmp(sink, CC::NZ, again_label);
1542
}
1543
1544
Inst::Atomic128RmwSeq {
1545
op,
1546
mem,
1547
operand_low,
1548
operand_high,
1549
temp_low,
1550
temp_high,
1551
dst_old_low,
1552
dst_old_high,
1553
} => {
1554
let operand_low = *operand_low;
1555
let operand_high = *operand_high;
1556
let temp_low = *temp_low;
1557
let temp_high = *temp_high;
1558
let dst_old_low = *dst_old_low;
1559
let dst_old_high = *dst_old_high;
1560
debug_assert_eq!(temp_low.to_reg(), regs::rbx());
1561
debug_assert_eq!(temp_high.to_reg(), regs::rcx());
1562
debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1563
debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1564
let mem = mem.finalize(state.frame_layout(), sink).clone();
1565
1566
let again_label = sink.get_label();
1567
1568
// Load the initial value.
1569
asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1570
asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1571
1572
// again:
1573
sink.bind_label(again_label, state.ctrl_plane_mut());
1574
1575
// Move old value to temp registers.
1576
asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);
1577
asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);
1578
1579
// Perform the operation.
1580
use Atomic128RmwSeqOp as RmwOp;
1581
match op {
1582
RmwOp::Nand => {
1583
// temp &= operand
1584
asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1585
asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1586
1587
// temp = !temp
1588
asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);
1589
asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);
1590
}
1591
RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1592
// Do a comparison with LHS temp and RHS operand.
1593
// Note the opposite argument orders.
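// Together, the `cmp` on the low halves and the `sbb` on the high halves
// compute temp - operand as a full 128-bit subtraction; the resulting CF
// (unsigned) and SF/OF (signed) flags are what the cmovs below test.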
1594
asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);
1595
// This will clobber `temp_high`
1596
asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1597
// Restore the clobbered value
1598
asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())
1599
.emit(sink, info, state);
1600
match op {
1601
RmwOp::Umin => {
1602
asm::inst::cmovaeq_rm::new(temp_low, operand_low)
1603
.emit(sink, info, state);
1604
asm::inst::cmovaeq_rm::new(temp_high, operand_high)
1605
.emit(sink, info, state);
1606
}
1607
RmwOp::Umax => {
1608
asm::inst::cmovbq_rm::new(temp_low, operand_low)
1609
.emit(sink, info, state);
1610
asm::inst::cmovbq_rm::new(temp_high, operand_high)
1611
.emit(sink, info, state);
1612
}
1613
RmwOp::Smin => {
1614
asm::inst::cmovgeq_rm::new(temp_low, operand_low)
1615
.emit(sink, info, state);
1616
asm::inst::cmovgeq_rm::new(temp_high, operand_high)
1617
.emit(sink, info, state);
1618
}
1619
RmwOp::Smax => {
1620
asm::inst::cmovlq_rm::new(temp_low, operand_low)
1621
.emit(sink, info, state);
1622
asm::inst::cmovlq_rm::new(temp_high, operand_high)
1623
.emit(sink, info, state);
1624
}
1625
_ => unreachable!(),
1626
}
1627
}
1628
RmwOp::Add => {
1629
asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);
1630
asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);
1631
}
1632
RmwOp::Sub => {
1633
asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);
1634
asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1635
}
1636
RmwOp::And => {
1637
asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1638
asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1639
}
1640
RmwOp::Or => {
1641
asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);
1642
asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);
1643
}
1644
RmwOp::Xor => {
1645
asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);
1646
asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);
1647
}
1648
}
1649
1650
// cmpxchg16b (mem)
1651
asm::inst::lock_cmpxchg16b_m::new(
1652
PairedGpr::from(dst_old_low),
1653
PairedGpr::from(dst_old_high),
1654
temp_low.to_reg(),
1655
temp_high.to_reg(),
1656
mem,
1657
)
1658
.emit(sink, info, state);

// jnz again
one_way_jmp(sink, CC::NZ, again_label);
}

Inst::Atomic128XchgSeq {
mem,
operand_low,
operand_high,
dst_old_low,
dst_old_high,
} => {
let operand_low = *operand_low;
let operand_high = *operand_high;
let dst_old_low = *dst_old_low;
let dst_old_high = *dst_old_high;
debug_assert_eq!(operand_low, regs::rbx());
debug_assert_eq!(operand_high, regs::rcx());
debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
let mem = mem.finalize(state.frame_layout(), sink).clone();

let again_label = sink.get_label();

// Load the initial value.
asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);

// again:
sink.bind_label(again_label, state.ctrl_plane_mut());

// cmpxchg16b (mem)
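// (x86-64 has no 128-bit `xchg`, so the exchange is a compare-and-swap
// loop: retry until the compare against the previously-loaded value
// succeeds and the store goes through.)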
asm::inst::lock_cmpxchg16b_m::new(
PairedGpr::from(dst_old_low),
PairedGpr::from(dst_old_high),
operand_low,
operand_high,
mem,
)
.emit(sink, info, state);

// jnz again
one_way_jmp(sink, CC::NZ, again_label);
}

Inst::ElfTlsGetAddr { symbol, dst } => {
let dst = dst.to_reg().to_reg();
debug_assert_eq!(dst, regs::rax());

// N.B.: Must be exactly this byte sequence; the linker requires it,
// because it must know how to rewrite the bytes.
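// (In particular, the linker may relax this general-dynamic TLS access in
// place to an initial-exec or local-exec form, which only works if the
// sequence has exactly the size and shape it expects.)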

// data16 lea gv@tlsgd(%rip),%rdi
sink.put1(0x66); // data16
sink.put1(0b01001000); // REX.W
sink.put1(0x8d); // LEA
sink.put1(0x3d); // ModRM byte
emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
sink.put4(0); // offset

// data16 data16 callq __tls_get_addr-4
sink.put1(0x66); // data16
sink.put1(0x66); // data16
sink.put1(0b01001000); // REX.W
sink.put1(0xe8); // CALL
emit_reloc(
sink,
Reloc::X86CallPLTRel4,
&ExternalName::LibCall(LibCall::ElfTlsGetAddr),
-4,
);
sink.put4(0); // offset
}

Inst::MachOTlsGetAddr { symbol, dst } => {
let dst = dst.to_reg().to_reg();
debug_assert_eq!(dst, regs::rax());

// movq gv@tlv(%rip), %rdi
sink.put1(0x48); // REX.W
sink.put1(0x8b); // MOV
sink.put1(0x3d); // ModRM byte
emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
sink.put4(0); // offset
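
// The reloc above leaves `%rdi` pointing at the symbol's TLV descriptor;
// its first field is a thunk which, when called with the descriptor's
// address in `%rdi`, returns the address of this thread's variable in
// `%rax`. Hence the indirect call through `(%rdi)` below.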
asm::inst::callq_m::new(asm::Amode::ImmReg {
base: Gpr::RDI,
simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,
trap: None,
})
.emit(sink, info, state);
}

Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
let dst = dst.to_reg().to_reg();
debug_assert_eq!(dst, regs::rax());

// tmp is used below directly as %rcx
let tmp = tmp.to_reg().to_reg();
debug_assert_eq!(tmp, regs::rcx());

// See: https://gcc.godbolt.org/z/M8or9x6ss
// And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282

// Emit the following sequence
// movl (%rip), %eax ; IMAGE_REL_AMD64_REL32 _tls_index
// movq %gs:88, %rcx
// movq (%rcx,%rax,8), %rax
// leaq (%rax), %rax ; Reloc: IMAGE_REL_AMD64_SECREL symbol

// Load TLS index for current thread
// movl (%rip), %eax
sink.put1(0x8b); // mov
sink.put1(0x05);
emit_reloc(
sink,
Reloc::X86PCRel4,
&ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
-4,
);
sink.put4(0); // offset

// movq %gs:88, %rcx
// Load the TLS Storage Array pointer
// The gs segment register refers to the base address of the TEB on x64.
// 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
sink.put_data(&[
0x65, // GS segment-override prefix
0x48, // REX.W
0x8b, // MOV
0x0c, 0x25, 0x58, // 0x58 - ThreadLocalStoragePointer offset
0x00, 0x00, 0x00,
]);

// movq (%rcx,%rax,8), %rax
// Load the actual TLS entry for this thread.
// Computes ThreadLocalStoragePointer + _tls_index*8
sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);

// leaq (%rax), %rax
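// The IMAGE_REL_AMD64_SECREL relocation below fills in the symbol's offset
// from the start of its (TLS) section, so `%rax` ends up pointing at this
// thread's copy of `symbol`.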
sink.put1(0x48);
sink.put1(0x8d);
sink.put1(0x80);
emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
sink.put4(0); // offset
}

Inst::Unwind { inst } => {
sink.add_unwind(inst.clone());
}

Inst::DummyUse { .. } => {
// Nothing.
}

Inst::LabelAddress { dst, label } => {
// Emit an LEA with a LabelUse given this label.
asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);
}

Inst::External { inst } => {
let frame = state.frame_layout();
emit_maybe_shrink(
inst,
&mut external::AsmCodeSink {
sink,

// These values are transcribed from what is happening in
// `SyntheticAmode::finalize`. This, plus the `Into` logic
// converting a `SyntheticAmode` to its external counterpart, are
// necessary to communicate Cranelift's internal offsets to the
// assembler; due to when Cranelift determines these offsets, this
// happens quite late (i.e., here during emission).
incoming_arg_offset: i32::try_from(
frame.tail_args_size + frame.setup_area_size,
)
.unwrap(),
slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),
},
);
}
}

state.clear_post_insn();
}

/// Emit the common sequence used for both direct and indirect tail calls:
///
/// * Copy the new frame's stack arguments over the top of our current frame.
///
/// * Restore the old frame pointer.
///
/// * Initialize the tail callee's stack pointer (simultaneously deallocating
/// the temporary stack space we allocated when creating the new frame's stack
/// arguments).
///
/// * Move the return address into its stack slot.
fn emit_return_call_common_sequence<T>(
sink: &mut MachBuffer<Inst>,
info: &EmitInfo,
state: &mut EmitState,
call_info: &ReturnCallInfo<T>,
) {
assert!(
info.flags.preserve_frame_pointers(),
"frame pointers aren't fundamentally required for tail calls, \
but the current implementation relies on them being present"
);

let tmp = call_info.tmp.to_writable_reg();

for inst in
X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
{
inst.emit(sink, info, state);
}

for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
CallConv::Tail,
&info.flags,
&info.isa_flags,
state.frame_layout(),
) {
inst.emit(sink, info, state);
}
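
// If the current frame reserved more stack-argument space
// (`tail_args_size`) than the new callee actually needs
// (`new_stack_arg_size`), slide the saved return address up by the
// difference and release the excess below, so the callee sees exactly the
// argument area it expects.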
let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
if incoming_args_diff > 0 {
// Move the saved return address up by `incoming_args_diff`.
let addr = Amode::imm_reg(0, regs::rsp());
asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);
asm::inst::movq_mr::new(
Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
Gpr::unwrap_new(tmp.to_reg()),
)
.emit(sink, info, state);

// Increment the stack pointer to shrink the argument area for the new
// call.
let rsp = Writable::from_reg(regs::rsp());
let incoming_args_diff = i32::try_from(incoming_args_diff)
.expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");
Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);
}
}

/// Convenience trait to have an `emit` method on all `asm::inst::*` variants.
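///
/// As a rough sketch of the call sites above: instead of constructing
/// `Inst::External { inst: ... }` by hand, emission code can write, e.g.,
/// `asm::inst::movq_rm::new(dst, addr).emit(sink, info, state)`.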
trait ExternalEmit {
fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);
}

impl<I> ExternalEmit for I
where
I: Into<asm::inst::Inst<CraneliftRegisters>>,
{
fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {
Inst::External { inst: self.into() }.emit(sink, info, state)
}
}

/// Attempt to "shrink" the provided `inst`.
///
/// This function will inspect `inst` and, where possible, encode a
/// semantically equivalent instruction with a smaller binary representation.
/// This is only done for cases which require register allocation to have
/// already happened; for example, shrinking immediates should be done during
/// instruction selection, not at this point.
///
/// An example of this optimization is the `AND` instruction. The Intel manual
/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8, imm8`.
/// Here instructions are matched against such patterns and, if regalloc state
/// indicates that a smaller variant is available, that variant is emitted
/// instead.
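///
/// As a concrete sketch: `and al, 0x7f` has a dedicated two-byte encoding
/// (`24 7F`), while the general `AND r/m8, imm8` form needs a ModRM byte
/// (`80 E0 7F`); when regalloc happened to pick `rax` for the operand, the
/// shorter accumulator form is chosen below.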
fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {
use cranelift_assembler_x64::GprMem;
use cranelift_assembler_x64::inst::*;

type R = CraneliftRegisters;
const RAX: PairedGpr = PairedGpr {
read: Gpr::RAX,
write: Writable::from_reg(Gpr::RAX),
};
const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);
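
// Note that `RAX` and `RAX_RM` are used as `const` patterns in the match
// below: each accumulator arm only fires when register allocation actually
// placed the operand in `rax`.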
match *inst {
// and
Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),
Inst::andw_mi(andw_mi {
rm16: RAX_RM,
imm16,
}) => andw_i::<R>::new(RAX, imm16).encode(sink),
Inst::andl_mi(andl_mi {
rm32: RAX_RM,
imm32,
}) => andl_i::<R>::new(RAX, imm32).encode(sink),
Inst::andq_mi_sxl(andq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// or
Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),
Inst::orw_mi(orw_mi {
rm16: RAX_RM,
imm16,
}) => orw_i::<R>::new(RAX, imm16).encode(sink),
Inst::orl_mi(orl_mi {
rm32: RAX_RM,
imm32,
}) => orl_i::<R>::new(RAX, imm32).encode(sink),
Inst::orq_mi_sxl(orq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// xor
Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),
Inst::xorw_mi(xorw_mi {
rm16: RAX_RM,
imm16,
}) => xorw_i::<R>::new(RAX, imm16).encode(sink),
Inst::xorl_mi(xorl_mi {
rm32: RAX_RM,
imm32,
}) => xorl_i::<R>::new(RAX, imm32).encode(sink),
Inst::xorq_mi_sxl(xorq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// add
Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),
Inst::addw_mi(addw_mi {
rm16: RAX_RM,
imm16,
}) => addw_i::<R>::new(RAX, imm16).encode(sink),
Inst::addl_mi(addl_mi {
rm32: RAX_RM,
imm32,
}) => addl_i::<R>::new(RAX, imm32).encode(sink),
Inst::addq_mi_sxl(addq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// adc
Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),
Inst::adcw_mi(adcw_mi {
rm16: RAX_RM,
imm16,
}) => adcw_i::<R>::new(RAX, imm16).encode(sink),
Inst::adcl_mi(adcl_mi {
rm32: RAX_RM,
imm32,
}) => adcl_i::<R>::new(RAX, imm32).encode(sink),
Inst::adcq_mi_sxl(adcq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// sub
Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),
Inst::subw_mi(subw_mi {
rm16: RAX_RM,
imm16,
}) => subw_i::<R>::new(RAX, imm16).encode(sink),
Inst::subl_mi(subl_mi {
rm32: RAX_RM,
imm32,
}) => subl_i::<R>::new(RAX, imm32).encode(sink),
Inst::subq_mi_sxl(subq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// sbb
Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),
Inst::sbbw_mi(sbbw_mi {
rm16: RAX_RM,
imm16,
}) => sbbw_i::<R>::new(RAX, imm16).encode(sink),
Inst::sbbl_mi(sbbl_mi {
rm32: RAX_RM,
imm32,
}) => sbbl_i::<R>::new(RAX, imm32).encode(sink),
Inst::sbbq_mi_sxl(sbbq_mi_sxl {
rm64: RAX_RM,
imm32,
}) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),

// cmp
Inst::cmpb_mi(cmpb_mi {
rm8: GprMem::Gpr(Gpr::RAX),
imm8,
}) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
Inst::cmpw_mi(cmpw_mi {
rm16: GprMem::Gpr(Gpr::RAX),
imm16,
}) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
Inst::cmpl_mi(cmpl_mi {
rm32: GprMem::Gpr(Gpr::RAX),
imm32,
}) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
Inst::cmpq_mi(cmpq_mi {
rm64: GprMem::Gpr(Gpr::RAX),
imm32,
}) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

// test
Inst::testb_mi(testb_mi {
rm8: GprMem::Gpr(Gpr::RAX),
imm8,
}) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
Inst::testw_mi(testw_mi {
rm16: GprMem::Gpr(Gpr::RAX),
imm16,
}) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
Inst::testl_mi(testl_mi {
rm32: GprMem::Gpr(Gpr::RAX),
imm32,
}) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
Inst::testq_mi(testq_mi {
rm64: GprMem::Gpr(Gpr::RAX),
imm32,
}) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

// lea
Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(
r32,
m32,
sink,
|dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),
|dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),
|dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),
),
Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(
r64,
m64,
sink,
|dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),
|dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),
|dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),
),

// All other instructions fall through to here and cannot be shrunk, so
// encode them as usual.
_ => inst.encode(sink),
}
}

/// If `lea` can actually get encoded as an `add` then do that instead.
/// Currently all candidate `iadd`s become an `lea` pseudo-instruction here but
/// maximizing the use of `lea` is not necessarily optimal. The `lea`
/// instruction goes through dedicated address units on cores which are finite
/// and disjoint from the general ALU, so if everything uses `lea` then those
/// units can get saturated while leaving the ALU idle.
///
/// To help make use of more parts of a CPU, this attempts to use `add` when
/// it's semantically equivalent to the `lea`, i.e. when the `dst` register is
/// the same as the `base` or `index` register and the rest of the address is
/// trivial.
///
/// FIXME: ideally regalloc is informed of this constraint. Register allocation
/// of `lea` should "attempt" to put the `base` in the same register as `dst`
/// but not at the expense of generating a `mov` instruction. Currently that's
/// not possible but perhaps one day it may be worth it.
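///
/// Concretely, as a sketch of the cases below: `lea rax, [rax + 8]` becomes
/// `add rax, 8`, and `lea rax, [rax + rcx]` (scale 1, no displacement)
/// becomes `add rax, rcx` (likewise when `dst` matches the index rather than
/// the base); everything else remains an `lea`.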
fn emit_lea<S>(
dst: asm::Gpr<WritableGpr>,
addr: asm::Amode<Gpr>,
sink: &mut S,
lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),
add_mi: fn(PairedGpr, i32, &mut S),
add_rm: fn(PairedGpr, Gpr, &mut S),
) where
S: asm::CodeSink,
{
match addr {
// If `base == dst` then this is `add dst, $imm`, so encode that
// instead.
asm::Amode::ImmReg {
base,
simm32:
asm::AmodeOffsetPlusKnownOffset {
simm32,
offset: None,
},
trap: None,
} if dst.as_ref().to_reg() == base => add_mi(
PairedGpr {
read: base,
write: *dst.as_ref(),
},
simm32.value(),
sink,
),

// If the offset is 0 and the shift is a scale of 1, then:
//
// * If `base == dst`, then this is `addq dst, index`
// * If `index == dst`, then this is `addq dst, base`
asm::Amode::ImmRegRegShift {
base,
index,
scale: asm::Scale::One,
simm32: asm::AmodeOffset::ZERO,
trap: None,
} => {
if dst.as_ref().to_reg() == base {
add_rm(
PairedGpr {
read: base,
write: *dst.as_ref(),
},
*index.as_ref(),
sink,
)
} else if dst.as_ref().to_reg() == *index.as_ref() {
add_rm(
PairedGpr {
read: *index.as_ref(),
write: *dst.as_ref(),
},
base,
sink,
)
} else {
lea(*dst.as_ref(), addr, sink)
}
}

_ => lea(*dst.as_ref(), addr, sink),
}
}