GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/cranelift/codegen/src/isa/x64/inst/emit.rs
1
use crate::ir::KnownSymbol;
2
use crate::ir::immediates::{Ieee32, Ieee64};
3
use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};
4
use crate::isa::x64::inst::args::*;
5
use crate::isa::x64::inst::*;
6
use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
7
use cranelift_assembler_x64 as asm;
8
9
/// A small helper to generate a signed conversion instruction.
10
fn emit_signed_cvt(
11
sink: &mut MachBuffer<Inst>,
12
info: &EmitInfo,
13
state: &mut EmitState,
14
src: Reg,
15
dst: Writable<Reg>,
16
to_f64: bool,
17
) {
18
assert!(src.is_real());
19
assert!(dst.to_reg().is_real());
20
21
// Handle an unsigned int, which is the "easy" case: a signed conversion
22
// will do the right thing.
23
let dst = WritableXmm::from_writable_reg(dst).unwrap();
24
if to_f64 {
25
asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);
26
} else {
27
asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);
28
}
29
}
30
31
/// Emits a one-way conditional jump if `cc` is set (true).
32
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
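// A `Jcc` with a 32-bit displacement is encoded as two opcode bytes
// (0x0F, 0x80 + cc) followed by a 4-byte displacement, so the displacement
// field lives at `cond_start + 2` and the whole instruction is 6 bytes long.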
33
let cond_start = sink.cur_offset();
34
let cond_disp_off = cond_start + 2;
35
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
36
emit_jcc_no_offset(sink, cc);
37
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
38
}
39
40
/// Like `one_way_jmp` above, this emits a conditional jump, but additionally uses
41
/// `MachBuffer::add_cond_branch`.
42
fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
43
let cond_start = sink.cur_offset();
44
let cond_disp_off = cond_start + 2;
45
let cond_end = cond_start + 6;
46
47
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
48
// FIXME: ideally this `inverted` calculation would go through the external
49
// assembler, but for now it's done manually.
50
let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
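// These six bytes are the rel32 encoding of the jump with the *inverted*
// condition; `MachBuffer` swaps them in if it later needs to invert this
// branch while simplifying branches.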
51
sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);
52
53
emit_jcc_no_offset(sink, cc);
54
55
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
56
debug_assert_eq!(sink.cur_offset(), cond_end);
57
}
58
59
fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
60
// Note that the disassembler matches Capstone, which doesn't match the `CC`
61
// enum directly, as Intel has multiple mnemonics that use the same encoding.
62
let inst: AsmInst = match cc {
63
CC::Z => asm::inst::je_d32::new(0).into(), // jz == je
64
CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne
65
CC::B => asm::inst::jb_d32::new(0).into(),
66
CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae
67
CC::BE => asm::inst::jbe_d32::new(0).into(),
68
CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja
69
CC::L => asm::inst::jl_d32::new(0).into(),
70
CC::LE => asm::inst::jle_d32::new(0).into(),
71
CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge
72
CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg
73
CC::O => asm::inst::jo_d32::new(0).into(),
74
CC::NO => asm::inst::jno_d32::new(0).into(),
75
CC::P => asm::inst::jp_d32::new(0).into(),
76
CC::NP => asm::inst::jnp_d32::new(0).into(),
77
CC::S => asm::inst::js_d32::new(0).into(),
78
CC::NS => asm::inst::jns_d32::new(0).into(),
79
};
80
inst.encode(&mut external::AsmCodeSink {
81
sink,
82
incoming_arg_offset: 0,
83
slot_offset: 0,
84
});
85
}
86
87
/// Emits an unconditional branch.
88
fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
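// A `jmp` with a 32-bit displacement is one opcode byte (0xE9) followed by a
// 4-byte displacement, hence the label use at `uncond_start + 1` and the
// 5-byte total length.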
89
let uncond_start = sink.cur_offset();
90
let uncond_disp_off = uncond_start + 1;
91
let uncond_end = uncond_start + 5;
92
93
sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);
94
sink.add_uncond_branch(uncond_start, uncond_end, label);
95
96
asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {
97
sink,
98
incoming_arg_offset: 0,
99
slot_offset: 0,
100
});
101
debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);
102
debug_assert_eq!(sink.cur_offset(), uncond_end);
103
}
104
105
/// Emits a relocation, attaching the current source location as well.
106
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
107
sink.add_reloc(kind, name, addend);
108
}
109
110
/// The top-level emit function.
111
///
112
/// Important! Do not add improved (shortened) encoding cases to existing
113
/// instructions without also adding tests for those improved encodings. That
114
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
115
/// code.
116
///
117
/// For all instructions, make sure to have test coverage for all of the
118
/// following situations. Do this by creating the cross product resulting from
119
/// applying the following rules to each operand:
120
///
121
/// (1) for any insn that mentions a register: one test using a register from
122
/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
123
/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
124
/// This helps detect incorrect REX prefix construction.
125
///
126
/// (2) for any insn that mentions a byte register: one test for each of the
127
/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
128
/// [r8b .. r11b] and [r12b .. r15b]. This checks that
129
/// apparently-redundant REX prefixes are retained when required.
130
///
131
/// (3) for any insn that contains an immediate field, check the following
132
/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
133
/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
134
/// instructions that require a 32-bit immediate have a short-form encoding
135
/// when the imm is in simm8 range.
136
///
137
/// Rules (1), (2) and (3) don't apply for registers within address expressions
138
/// (`Addr`s). Those are already pretty well tested, and the registers in them
139
/// don't have any effect on the containing instruction (apart from possibly
140
/// requiring REX prefix bits).
141
///
142
/// When choosing registers for a test, avoid using registers with the same
143
/// offset within a given group. For example, don't use rax and r8, since they
144
/// both have the lowest 3 bits as 000, and so the test won't detect errors
145
/// where those 3-bit register sub-fields are confused by the emitter. Instead
146
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
147
/// and bpl since they have the same offset in their group; use instead (eg) cl
148
/// and sil.
149
///
150
/// For all instructions, also add a test that uses only low-half registers
151
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
152
/// prefixes are correctly omitted. This low-half restriction must apply to
153
/// _all_ registers in the insn, even those in address expressions.
154
///
155
/// Following these rules creates large numbers of test cases, but it's the
156
/// only way to make the emitter reliable.
157
///
158
/// Known possible improvements:
159
///
160
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
161
/// care?)
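///
/// As a purely illustrative example of rule (1): pairing `rax` with `r9` for a
/// 64-bit `mov` exercises both register groups with different low-3-bit
/// encodings; `movq %rax, %rcx` encodes as `48 89 c1`, while `movq %r9, %rcx`
/// needs an extra REX bit and encodes as `4c 89 c9`.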
162
pub(crate) fn emit(
163
inst: &Inst,
164
sink: &mut MachBuffer<Inst>,
165
info: &EmitInfo,
166
state: &mut EmitState,
167
) {
168
if !inst.is_available(&info) {
169
let features = if let Inst::External { inst } = inst {
170
inst.features().to_string()
171
} else {
172
"see `is_available` source for feature term".to_string()
173
};
174
panic!(
175
"Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"
176
);
177
}
178
179
match inst {
180
Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
181
// Validate that the register constraints of the dividend and the
182
// destination are all as expected.
183
let (dst, size) = match inst {
184
Inst::CheckedSRemSeq {
185
dividend_lo,
186
dividend_hi,
187
dst_quotient,
188
dst_remainder,
189
size,
190
..
191
} => {
192
let dividend_lo = dividend_lo.to_reg();
193
let dividend_hi = dividend_hi.to_reg();
194
let dst_quotient = dst_quotient.to_reg().to_reg();
195
let dst_remainder = dst_remainder.to_reg().to_reg();
196
debug_assert_eq!(dividend_lo, regs::rax());
197
debug_assert_eq!(dividend_hi, regs::rdx());
198
debug_assert_eq!(dst_quotient, regs::rax());
199
debug_assert_eq!(dst_remainder, regs::rdx());
200
(regs::rdx(), *size)
201
}
202
Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
203
let dividend = dividend.to_reg();
204
let dst = dst.to_reg().to_reg();
205
debug_assert_eq!(dividend, regs::rax());
206
debug_assert_eq!(dst, regs::rax());
207
(regs::rax(), OperandSize::Size8)
208
}
209
_ => unreachable!(),
210
};
211
212
// Generates the following code sequence:
213
//
214
// cmp -1 %divisor
215
// jnz $do_op
216
//
217
// ;; for srem, result is 0
218
// mov #0, %dst
219
// j $done
220
//
221
// $do_op:
222
// idiv %divisor
223
//
224
// $done:
225
226
let do_op = sink.get_label();
227
let done_label = sink.get_label();
228
229
// Check if the divisor is -1, and if it isn't then immediately
230
// go to the `idiv`.
231
let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
232
inst.emit(sink, info, state);
233
one_way_jmp(sink, CC::NZ, do_op);
234
235
// ... otherwise the divisor is -1 and the result is always 0. This
236
// is written to the destination register which will be %rax for
237
// 8-bit srem and %rdx otherwise.
238
//
239
// Note that for 16-to-64-bit srem operations this leaves the
240
// second destination, %rax, unchanged. This isn't semantically
241
// correct if a lowering actually tries to use the `dst_quotient`
242
// output but for srem only the `dst_remainder` output is used for
243
// now.
244
let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
245
inst.emit(sink, info, state);
246
let inst = Inst::jmp_known(done_label);
247
inst.emit(sink, info, state);
248
249
// Here the `idiv` is executed, which differs depending on the
250
// size.
251
sink.bind_label(do_op, state.ctrl_plane_mut());
252
let rax = Gpr::RAX;
253
let rdx = Gpr::RDX;
254
let writable_rax = Writable::from_reg(rax);
255
let writable_rdx = Writable::from_reg(rdx);
256
let inst: AsmInst = match size {
257
OperandSize::Size8 => asm::inst::idivb_m::new(
258
PairedGpr::from(writable_rax),
259
*divisor,
260
TrapCode::INTEGER_DIVISION_BY_ZERO,
261
)
262
.into(),
263
264
OperandSize::Size16 => asm::inst::idivw_m::new(
265
PairedGpr::from(writable_rax),
266
PairedGpr::from(writable_rdx),
267
*divisor,
268
TrapCode::INTEGER_DIVISION_BY_ZERO,
269
)
270
.into(),
271
272
OperandSize::Size32 => asm::inst::idivl_m::new(
273
PairedGpr::from(writable_rax),
274
PairedGpr::from(writable_rdx),
275
*divisor,
276
TrapCode::INTEGER_DIVISION_BY_ZERO,
277
)
278
.into(),
279
280
OperandSize::Size64 => asm::inst::idivq_m::new(
281
PairedGpr::from(writable_rax),
282
PairedGpr::from(writable_rdx),
283
*divisor,
284
TrapCode::INTEGER_DIVISION_BY_ZERO,
285
)
286
.into(),
287
};
288
inst.emit(sink, info, state);
289
290
sink.bind_label(done_label, state.ctrl_plane_mut());
291
}
292
293
Inst::MovFromPReg { src, dst } => {
294
let src: Reg = (*src).into();
295
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
296
asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);
297
}
298
299
Inst::MovToPReg { src, dst } => {
300
let dst: Reg = (*dst).into();
301
debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
302
let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
303
asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);
304
}
305
306
Inst::XmmCmove {
307
ty,
308
cc,
309
consequent,
310
alternative,
311
dst,
312
} => {
313
let alternative = *alternative;
314
let dst = *dst;
315
debug_assert_eq!(alternative, dst.to_reg());
316
let consequent = *consequent;
317
318
// Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
319
// this doesn't clobber flags. Make sure to not do so here.
320
let next = sink.get_label();
321
322
// Jump if cc is *not* set.
323
one_way_jmp(sink, cc.invert(), next);
324
Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
325
.emit(sink, info, state);
326
327
sink.bind_label(next, state.ctrl_plane_mut());
328
}
329
330
Inst::StackProbeLoop {
331
tmp,
332
frame_size,
333
guard_size,
334
} => {
335
assert!(info.flags.enable_probestack());
336
assert!(guard_size.is_power_of_two());
337
338
let tmp = *tmp;
339
340
// Number of probes that we need to perform
341
let probe_count = align_to(*frame_size, *guard_size) / guard_size;
342
343
// The inline stack probe loop has 3 phases:
344
//
345
// We generate the "guard area" register which is essentially the frame_size aligned to
346
// guard_size. We copy the stack pointer and subtract the guard area from it. This
347
// gets us a register that we can use to compare when looping.
348
//
349
// After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
350
// distance at a time and then touch the stack by writing anything to it. We use the previously
351
// created "guard area" register to know when to stop looping.
352
//
353
// When we have touched all the pages that we need, we have to restore the stack pointer
354
// to where it was before.
355
//
356
// Generate the following code:
357
// mov tmp_reg, rsp
358
// sub tmp_reg, guard_size * probe_count
359
// .loop_start:
360
// sub rsp, guard_size
361
// mov [rsp], 0
362
// cmp rsp, tmp_reg
363
// jne .loop_start
364
// add rsp, guard_size * probe_count
365
366
// Create the guard bound register
367
// mov tmp_reg, rsp
368
let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
369
inst.emit(sink, info, state);
370
371
// sub tmp_reg, GUARD_SIZE * probe_count
372
let guard_plus_count = i32::try_from(guard_size * probe_count)
373
.expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");
374
Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);
375
376
// Emit the main loop!
377
let loop_start = sink.get_label();
378
sink.bind_label(loop_start, state.ctrl_plane_mut());
379
380
// sub rsp, GUARD_SIZE
381
let rsp = Writable::from_reg(regs::rsp());
382
let guard_size_ = i32::try_from(*guard_size)
383
.expect("`guard_size` is too large to fit in a 32-bit immediate");
384
Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);
385
386
// Touch the current page by storing an immediate zero.
387
// mov [rsp], 0
388
asm::inst::movl_mi::new(Amode::imm_reg(0, regs::rsp()), 0i32.cast_unsigned())
389
.emit(sink, info, state);
390
391
// Compare and jump if we are not done yet
392
// cmp rsp, tmp_reg
393
let tmp = Gpr::unwrap_new(tmp.to_reg());
394
asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);
395
396
// jne .loop_start
397
// TODO: Encoding the conditional jump as a short jump
398
// could save us 4 bytes here.
399
one_way_jmp(sink, CC::NZ, loop_start);
400
401
// The regular prologue code is going to emit a `sub` after this, so we need to
402
// reset the stack pointer
403
//
404
// TODO: It would be better if we could avoid the `add` + `sub` that is generated here
405
// and in the stack adj portion of the prologue
406
//
407
// add rsp, GUARD_SIZE * probe_count
408
Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);
409
}
410
411
Inst::CallKnown { info: call_info } => {
412
let start = sink.cur_offset();
413
let stack_map = state.take_stack_map();
414
415
asm::inst::callq_d::new(0).emit(sink, info, state);
416
417
// The last 4 bytes of `callq` are the relative displacement to where
418
// we're calling, so that's where the reloc is registered.
419
//
420
// The addend adjusts for the difference between the end of the
421
// instruction and the beginning of the immediate field.
422
let len = sink.cur_offset();
423
sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
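// (In relocation terms the field gets S + A - P, with P the address of the
// 4-byte displacement; the CPU wants S - (P + 4), hence the -4 addend.)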
424
425
if let Some(s) = stack_map {
426
sink.push_user_stack_map(state, len, s);
427
}
428
429
if let Some(try_call) = call_info.try_call_info.as_ref() {
430
sink.add_try_call_site(
431
Some(state.frame_layout().sp_to_fp()),
432
try_call.exception_handlers(&state.frame_layout()),
433
);
434
} else {
435
sink.add_call_site();
436
}
437
438
// Reclaim the outgoing argument area that was released by the
439
// callee, to ensure that StackAMode values are always computed from
440
// a consistent SP.
441
if call_info.callee_pop_size > 0 {
442
let rsp = Writable::from_reg(regs::rsp());
443
let callee_pop_size = i32::try_from(call_info.callee_pop_size)
444
.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
445
Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
446
}
447
448
if call_info.patchable {
449
sink.add_patchable_call_site(sink.cur_offset() - start);
450
} else {
451
// Load any stack-carried return values.
452
call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
453
state.frame_layout().stackslots_size,
454
|inst| inst.emit(sink, info, state),
455
|_space_needed| None,
456
);
457
}
458
459
// If this is a try-call, jump to the continuation
460
// (normal-return) block.
461
if let Some(try_call) = call_info.try_call_info.as_ref() {
462
let jmp = Inst::JmpKnown {
463
dst: try_call.continuation,
464
};
465
jmp.emit(sink, info, state);
466
}
467
}
468
469
Inst::ReturnCallKnown { info: call_info } => {
470
emit_return_call_common_sequence(sink, info, state, &call_info);
471
472
// Finally, jump to the callee!
473
//
474
// Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
475
// different metadata in this case: we don't have a label for the
476
// target, but rather a function relocation.
477
asm::inst::jmp_d32::new(0).emit(sink, info, state);
478
let offset = sink.cur_offset();
479
// The addend adjusts for the difference between the end of the instruction and the
480
// beginning of the immediate field.
481
sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
482
sink.add_call_site();
483
}
484
485
Inst::ReturnCallUnknown { info: call_info } => {
486
let callee = call_info.dest;
487
488
emit_return_call_common_sequence(sink, info, state, &call_info);
489
490
asm::inst::jmpq_m::new(callee).emit(sink, info, state);
491
sink.add_call_site();
492
}
493
494
Inst::CallUnknown {
495
info: call_info, ..
496
} => {
497
let stack_map = state.take_stack_map();
498
499
let dest = match call_info.dest.clone() {
500
RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),
501
RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
502
};
503
504
asm::inst::callq_m::new(dest).emit(sink, info, state);
505
506
if let Some(s) = stack_map {
507
let offset = sink.cur_offset();
508
sink.push_user_stack_map(state, offset, s);
509
}
510
511
if let Some(try_call) = call_info.try_call_info.as_ref() {
512
sink.add_try_call_site(
513
Some(state.frame_layout().sp_to_fp()),
514
try_call.exception_handlers(&state.frame_layout()),
515
);
516
} else {
517
sink.add_call_site();
518
}
519
520
// Reclaim the outgoing argument area that was released by the callee, to ensure that
521
// StackAMode values are always computed from a consistent SP.
522
if call_info.callee_pop_size > 0 {
523
let rsp = Writable::from_reg(regs::rsp());
524
let callee_pop_size = i32::try_from(call_info.callee_pop_size)
525
.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
526
Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
527
}
528
529
// Load any stack-carried return values.
530
call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
531
state.frame_layout().stackslots_size,
532
|inst| inst.emit(sink, info, state),
533
|_space_needed| None,
534
);
535
536
if let Some(try_call) = call_info.try_call_info.as_ref() {
537
let jmp = Inst::JmpKnown {
538
dst: try_call.continuation,
539
};
540
jmp.emit(sink, info, state);
541
}
542
}
543
544
Inst::Args { .. } => {}
545
Inst::Rets { .. } => {}
546
547
Inst::StackSwitchBasic {
548
store_context_ptr,
549
load_context_ptr,
550
in_payload0,
551
out_payload0,
552
} => {
553
// Note that we do not emit anything for preserving and restoring
554
// ordinary registers here: That's taken care of by regalloc for us,
555
// since we marked this instruction as clobbering all registers.
556
//
557
// Also note that we do nothing about passing the single payload
558
// value: We've informed regalloc that it is sent and received via
559
// the fixed register given by [stack_switch::payload_register].
560
561
let (tmp1, tmp2) = {
562
// Ideally we would just ask regalloc for two temporary registers.
563
// However, adding any early defs to the constraints on StackSwitch
564
// causes TooManyLiveRegs. Fortunately, we can manually find tmp
565
// registers without regalloc: Since our instruction clobbers all
566
// registers, we can simply pick any register that is not assigned
567
// to the operands.
568
569
let all = crate::isa::x64::abi::ALL_CLOBBERS;
570
571
let used_regs = [
572
**load_context_ptr,
573
**store_context_ptr,
574
**in_payload0,
575
*out_payload0.to_reg(),
576
];
577
578
let mut tmps = all.into_iter().filter_map(|preg| {
579
let reg: Reg = preg.into();
580
if !used_regs.contains(&reg) {
581
WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
582
} else {
583
None
584
}
585
});
586
(tmps.next().unwrap(), tmps.next().unwrap())
587
};
588
589
let layout = stack_switch::control_context_layout();
590
let rsp_offset = layout.stack_pointer_offset as i32;
591
let pc_offset = layout.ip_offset as i32;
592
let rbp_offset = layout.frame_pointer_offset as i32;
593
594
// Location to which someone switching back to this stack will jump:
595
// right behind the `StackSwitch` instruction.
596
let resume = sink.get_label();
597
598
//
599
// For RBP and RSP we do the following:
600
// - Load new value for register from `load_context_ptr` +
601
// corresponding offset.
602
// - Store previous (!) value of register at `store_context_ptr` +
603
// corresponding offset.
604
//
605
// Since `load_context_ptr` and `store_context_ptr` are allowed to be
606
// equal, we need to use a temporary register here.
607
//
608
609
let mut exchange = |offset, reg| {
610
let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));
611
asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
612
613
asm::inst::movq_mr::new(
614
Amode::imm_reg(offset, **store_context_ptr),
615
Gpr::new(reg).unwrap(),
616
)
617
.emit(sink, info, state);
618
619
let dst = Writable::from_reg(reg);
620
asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())
621
.emit(sink, info, state);
622
};
623
624
exchange(rsp_offset, regs::rsp());
625
exchange(rbp_offset, regs::rbp());
626
627
//
628
// Load target PC, store resume PC, jump to target PC
629
//
630
631
let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));
632
asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);
633
634
let amode = Amode::RipRelative { target: resume };
635
asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);
636
637
asm::inst::movq_mr::new(
638
Amode::imm_reg(pc_offset, **store_context_ptr),
639
tmp2.to_reg(),
640
)
641
.emit(sink, info, state);
642
643
asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
644
645
sink.bind_label(resume, state.ctrl_plane_mut());
646
}
647
648
Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),
649
650
Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),
651
652
Inst::JmpCond {
653
cc,
654
taken,
655
not_taken,
656
} => {
657
cond_jmp(sink, *cc, *taken);
658
uncond_jmp(sink, *not_taken);
659
}
660
661
Inst::JmpCondOr {
662
cc1,
663
cc2,
664
taken,
665
not_taken,
666
} => {
667
// Emit:
668
// jcc1 taken
669
// jcc2 taken
670
// jmp not_taken
671
//
672
// Note that we enroll both conditionals in the
673
// branch-chomping mechanism because MachBuffer
674
// simplification can continue upward as long as it keeps
675
// chomping branches. In the best case, if taken ==
676
// not_taken and that one block is the fallthrough block,
677
// all three branches can disappear.
678
679
cond_jmp(sink, *cc1, *taken);
680
cond_jmp(sink, *cc2, *taken);
681
uncond_jmp(sink, *not_taken);
682
}
683
684
&Inst::JmpTableSeq {
685
idx,
686
tmp1,
687
tmp2,
688
ref targets,
689
ref default_target,
690
..
691
} => {
692
// This sequence is *one* instruction in the vcode, and is expanded only here at
693
// emission time, because we cannot allow the regalloc to insert spills/reloads in
694
// the middle; we depend on hardcoded PC-rel addressing below.
695
//
696
// We don't have to worry about emitting islands, because the only label-use type has a
697
// maximum range of 2 GB. If we later consider using shorter-range label references,
698
// this will need to be revisited.
699
700
// We generate the following sequence. Note that the only read of %idx is before the
701
// write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
702
// if you change this.
703
// lea start_of_jump_table_offset(%rip), %tmp1
704
// movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
705
// addq %tmp2, %tmp1
706
// j *%tmp1
707
// $start_of_jump_table:
708
// -- jump table entries
709
710
// Load base address of jump table.
711
let start_of_jumptable = sink.get_label();
712
asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))
713
.emit(sink, info, state);
714
715
// Load value out of the jump table. It's a relative offset to the target block, so it
716
// might be negative; use a sign-extension.
717
let inst = Inst::movsx_rm_r(
718
ExtMode::LQ,
719
RegMem::mem(Amode::imm_reg_reg_shift(
720
0,
721
Gpr::unwrap_new(tmp1.to_reg()),
722
Gpr::unwrap_new(idx),
723
2,
724
)),
725
tmp2,
726
);
727
inst.emit(sink, info, state);
728
729
// Add base of jump table to jump-table-sourced block offset.
730
asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);
731
732
// Branch to computed address.
733
asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);
734
735
// Emit jump table (table of 32-bit offsets).
736
sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
737
let jt_off = sink.cur_offset();
738
for &target in targets.iter().chain(core::iter::once(default_target)) {
739
let word_off = sink.cur_offset();
740
// off_into_table is an addend here embedded in the label to be later patched at
741
// the end of codegen. The offset is initially relative to this jump table entry;
742
// with the extra addend, it'll be relative to the jump table's start, after
743
// patching.
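// (Concretely: the PCRel32 fixup contributes `target - word_off`, and the
// `off_into_table` addend stored below contributes `word_off - jt_off`, so the
// patched entry ends up holding `target - jt_off`.)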
744
let off_into_table = word_off - jt_off;
745
sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
746
sink.put4(off_into_table);
747
}
748
}
749
750
Inst::TrapIf { cc, trap_code } => {
751
let trap_label = sink.defer_trap(*trap_code);
752
one_way_jmp(sink, *cc, trap_label);
753
}
754
755
Inst::TrapIfAnd {
756
cc1,
757
cc2,
758
trap_code,
759
} => {
760
let trap_label = sink.defer_trap(*trap_code);
761
let else_label = sink.get_label();
762
763
// Jump to the end if the first condition isn't true, and then if
764
// the second condition is true go to the trap.
765
one_way_jmp(sink, cc1.invert(), else_label);
766
one_way_jmp(sink, *cc2, trap_label);
767
768
sink.bind_label(else_label, state.ctrl_plane_mut());
769
}
770
771
Inst::TrapIfOr {
772
cc1,
773
cc2,
774
trap_code,
775
} => {
776
let trap_label = sink.defer_trap(*trap_code);
777
778
// Emit two jumps to the same trap if either condition code is true.
779
one_way_jmp(sink, *cc1, trap_label);
780
one_way_jmp(sink, *cc2, trap_label);
781
}
782
783
Inst::XmmMinMaxSeq {
784
size,
785
is_min,
786
lhs,
787
rhs,
788
dst,
789
} => {
790
let rhs = rhs.to_reg();
791
let lhs = lhs.to_reg();
792
let dst = dst.to_writable_reg();
793
debug_assert_eq!(rhs, dst.to_reg());
794
795
// Generates the following sequence:
796
// cmpss/cmpsd %lhs, %rhs_dst
797
// jnz do_min_max
798
// jp propagate_nan
799
//
800
// ;; ordered and equal: propagate the sign bit (for -0 vs 0):
801
// {and,or}{ss,sd} %lhs, %rhs_dst
802
// j done
803
//
804
// ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
805
// ;; NaN value is returned), we add both inputs.
806
// propagate_nan:
807
// add{ss,sd} %lhs, %rhs_dst
808
// j done
809
//
810
// do_min_max:
811
// {min,max}{ss,sd} %lhs, %rhs_dst
812
//
813
// done:
814
let done = sink.get_label();
815
let propagate_nan = sink.get_label();
816
let do_min_max = sink.get_label();
817
818
let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
819
OperandSize::Size32 => (
820
asm::inst::addss_a::new(dst, lhs).into(),
821
asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),
822
asm::inst::andps_a::new(dst, lhs).into(),
823
asm::inst::orps_a::new(dst, lhs).into(),
824
if *is_min {
825
asm::inst::minss_a::new(dst, lhs).into()
826
} else {
827
asm::inst::maxss_a::new(dst, lhs).into()
828
},
829
),
830
OperandSize::Size64 => (
831
asm::inst::addsd_a::new(dst, lhs).into(),
832
asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),
833
asm::inst::andpd_a::new(dst, lhs).into(),
834
asm::inst::orpd_a::new(dst, lhs).into(),
835
if *is_min {
836
asm::inst::minsd_a::new(dst, lhs).into()
837
} else {
838
asm::inst::maxsd_a::new(dst, lhs).into()
839
},
840
),
841
_ => unreachable!(),
842
};
843
let add_op: AsmInst = add_op;
844
let or_op: AsmInst = or_op;
845
let min_max_op: AsmInst = min_max_op;
846
let cmp_op: AsmInst = cmp_op;
847
848
cmp_op.emit(sink, info, state);
849
850
one_way_jmp(sink, CC::NZ, do_min_max);
851
one_way_jmp(sink, CC::P, propagate_nan);
852
853
// Ordered and equal. The operands are bit-identical unless they are zero
854
// and negative zero. These instructions merge the sign bits in that case
855
// (`or` for min so that -0.0 wins, `and` for max so that +0.0 wins), and are
// no-ops otherwise.
856
let inst: AsmInst = if *is_min { or_op } else { and_op };
857
inst.emit(sink, info, state);
858
859
let inst = Inst::jmp_known(done);
860
inst.emit(sink, info, state);
861
862
// x86's min/max are not symmetric; if either operand is a NaN, they return the
863
// read-only operand: perform an addition between the two operands, which has the
864
// desired NaN propagation effects.
865
sink.bind_label(propagate_nan, state.ctrl_plane_mut());
866
add_op.emit(sink, info, state);
867
868
one_way_jmp(sink, CC::P, done);
869
870
sink.bind_label(do_min_max, state.ctrl_plane_mut());
871
min_max_op.emit(sink, info, state);
872
873
sink.bind_label(done, state.ctrl_plane_mut());
874
}
875
876
Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {
877
// These instruction formats only exist to declare a register as a
878
// `def`; no code is emitted. This is always immediately followed by
879
// an instruction, such as `xor <tmp>, <tmp>`, that semantically
880
// reads this undefined value but arithmetically produces the same
881
// result regardless of its value.
882
}
883
884
Inst::CvtUint64ToFloatSeq {
885
dst_size,
886
src,
887
dst,
888
tmp_gpr1,
889
tmp_gpr2,
890
} => {
891
let src = src.to_reg();
892
let dst = dst.to_writable_reg();
893
let tmp_gpr1 = tmp_gpr1.to_writable_reg();
894
let tmp_gpr2 = tmp_gpr2.to_writable_reg();
895
896
// Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
897
// different sequence.
898
//
899
// Emit the following sequence:
900
//
901
// cmp 0, %src
902
// jl handle_negative
903
//
904
// ;; handle positive, which can't overflow
905
// cvtsi2sd/cvtsi2ss %src, %dst
906
// j done
907
//
908
// ;; handle negative: see below for an explanation of what it's doing.
909
// handle_negative:
910
// mov %src, %tmp_gpr1
911
// shr $1, %tmp_gpr1
912
// mov %src, %tmp_gpr2
913
// and $1, %tmp_gpr2
914
// or %tmp_gpr1, %tmp_gpr2
915
// cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
916
// addsd/addss %dst, %dst
917
//
918
// done:
919
920
assert_ne!(src, tmp_gpr1.to_reg());
921
assert_ne!(src, tmp_gpr2.to_reg());
922
923
let handle_negative = sink.get_label();
924
let done = sink.get_label();
925
926
// If x seen as a signed int64 is not negative, a signed-conversion will do the right
927
// thing.
928
// TODO: use `test src, src` here.
929
asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);
930
931
one_way_jmp(sink, CC::L, handle_negative);
932
933
// Handle a positive int64, which is the "easy" case: a signed conversion will do the
934
// right thing.
935
emit_signed_cvt(
936
sink,
937
info,
938
state,
939
src,
940
dst,
941
*dst_size == OperandSize::Size64,
942
);
943
944
let inst = Inst::jmp_known(done);
945
inst.emit(sink, info, state);
946
947
sink.bind_label(handle_negative, state.ctrl_plane_mut());
948
949
// Divide x by two to get it in range for the signed conversion, keep the LSB, and
950
// scale it back up on the FP side.
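// (The `or` of the discarded low bit acts as a sticky bit: without it, halving
// an odd value could land exactly on a rounding boundary and the final
// doubling would round differently than a direct conversion of the original.)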
951
let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
952
inst.emit(sink, info, state);
953
954
// tmp_gpr1 := src >> 1
955
asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);
956
957
let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
958
inst.emit(sink, info, state);
959
960
asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);
961
962
asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);
963
964
emit_signed_cvt(
965
sink,
966
info,
967
state,
968
tmp_gpr2.to_reg(),
969
dst,
970
*dst_size == OperandSize::Size64,
971
);
972
973
let inst: AsmInst = match *dst_size {
974
OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),
975
OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),
976
_ => unreachable!(),
977
};
978
inst.emit(sink, info, state);
979
980
sink.bind_label(done, state.ctrl_plane_mut());
981
}
982
983
Inst::CvtFloatToSintSeq {
984
src_size,
985
dst_size,
986
is_saturating,
987
src,
988
dst,
989
tmp_gpr,
990
tmp_xmm,
991
} => {
992
use OperandSize::*;
993
994
let src = src.to_reg();
995
let dst = dst.to_writable_reg();
996
let tmp_gpr = tmp_gpr.to_writable_reg();
997
let tmp_xmm = tmp_xmm.to_writable_reg();
998
999
// Emits the following common sequence:
1000
//
1001
// cvttss2si/cvttsd2si %src, %dst
1002
// cmp %dst, 1
1003
// jno done
1004
//
1005
// Then, for saturating conversions:
1006
//
1007
// ;; check for NaN
1008
// cmpss/cmpsd %src, %src
1009
// jnp not_nan
1010
// xor %dst, %dst
1011
//
1012
// ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
1013
// ;; already in %dst.
1014
// xorpd %tmp_xmm, %tmp_xmm
1015
// cmpss/cmpsd %src, %tmp_xmm
1016
// jnb done
1017
// mov/movaps $INT_MAX, %dst
1018
//
1019
// done:
1020
//
1021
// Then, for non-saturating conversions:
1022
//
1023
// ;; check for NaN
1024
// cmpss/cmpsd %src, %src
1025
// jnp not_nan
1026
// ud2 trap BadConversionToInteger
1027
//
1028
// ;; check if INT_MIN was the correct result, against a magic constant:
1029
// not_nan:
1030
// movaps/mov $magic, %tmp_gpr
1031
// movq/movd %tmp_gpr, %tmp_xmm
1032
// cmpss/cmpsd %tmp_xmm, %src
1033
// jnb/jnbe $check_positive
1034
// ud2 trap IntegerOverflow
1035
//
1036
// ;; if positive, it was a real overflow
1037
// check_positive:
1038
// xorpd %tmp_xmm, %tmp_xmm
1039
// cmpss/cmpsd %src, %tmp_xmm
1040
// jnb done
1041
// ud2 trap IntegerOverflow
1042
//
1043
// done:
1044
1045
let cmp_op: AsmInst = match src_size {
1046
Size64 => asm::inst::ucomisd_a::new(src, src).into(),
1047
Size32 => asm::inst::ucomiss_a::new(src, src).into(),
1048
_ => unreachable!(),
1049
};
1050
1051
let cvtt_op = |dst, src| Inst::External {
1052
inst: match (*src_size, *dst_size) {
1053
(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1054
(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1055
(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1056
(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1057
_ => unreachable!(),
1058
},
1059
};
1060
1061
let done = sink.get_label();
1062
1063
// The truncation.
1064
cvtt_op(dst, src).emit(sink, info, state);
1065
1066
// Compare against 1: on overflow (or NaN) the truncation yields INT_MIN, and
// INT_MIN is the only value for which subtracting 1 overflows, so OF is set
// exactly when that sentinel was produced.
1067
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
1068
inst.emit(sink, info, state);
1069
1070
one_way_jmp(sink, CC::NO, done); // no overflow => done
1071
1072
// Check for NaN.
1073
cmp_op.emit(sink, info, state);
1074
1075
if *is_saturating {
1076
let not_nan = sink.get_label();
1077
one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
1078
1079
// For NaN, emit 0.
1080
let inst: AsmInst = match *dst_size {
1081
OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1082
OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1083
_ => unreachable!(),
1084
};
1085
inst.emit(sink, info, state);
1086
1087
let inst = Inst::jmp_known(done);
1088
inst.emit(sink, info, state);
1089
1090
sink.bind_label(not_nan, state.ctrl_plane_mut());
1091
1092
// If the input was positive, saturate to INT_MAX.
1093
1094
// Zero out tmp_xmm.
1095
asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1096
1097
let inst: AsmInst = match src_size {
1098
Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1099
Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1100
_ => unreachable!(),
1101
};
1102
inst.emit(sink, info, state);
1103
1104
// Jump if >= to done.
1105
one_way_jmp(sink, CC::NB, done);
1106
1107
// Otherwise, put INT_MAX.
1108
if *dst_size == OperandSize::Size64 {
1109
let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
1110
inst.emit(sink, info, state);
1111
} else {
1112
let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
1113
inst.emit(sink, info, state);
1114
}
1115
} else {
1116
let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1117
inst.emit(sink, info, state);
1118
1119
// Check if INT_MIN was the correct result: determine the smallest floating point
1120
// number that would convert to INT_MIN, put it in a temporary register, and compare
1121
// against the src register.
1122
// If the src register is less (or in some cases, less-or-equal) than the threshold,
1123
// trap!
1124
1125
let mut no_overflow_cc = CC::NB; // >=
1126
let output_bits = dst_size.to_bits();
1127
match *src_size {
1128
OperandSize::Size32 => {
1129
let cst = (-Ieee32::pow2(output_bits - 1)).bits();
1130
let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
1131
inst.emit(sink, info, state);
1132
}
1133
OperandSize::Size64 => {
1134
// An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
1135
// so there are values less than -2^(N-1) that convert correctly to INT_MIN.
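// For a 32-bit output, for instance, the threshold is i32::MIN - 1 as an f64
// (-2147483649.0): anything strictly greater truncates to a value >= INT_MIN,
// hence the switch to a strict `>` comparison below.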
1136
let cst = if output_bits < 64 {
1137
no_overflow_cc = CC::NBE; // >
1138
Ieee64::fcvt_to_sint_negative_overflow(output_bits)
1139
} else {
1140
-Ieee64::pow2(output_bits - 1)
1141
};
1142
let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
1143
inst.emit(sink, info, state);
1144
}
1145
_ => unreachable!(),
1146
}
1147
1148
let inst: AsmInst = {
1149
let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1150
match src_size {
1151
Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1152
Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1153
_ => unreachable!(),
1154
}
1155
};
1156
inst.emit(sink, info, state);
1157
1158
let inst: AsmInst = match src_size {
1159
Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1160
Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1161
_ => unreachable!(),
1162
};
1163
inst.emit(sink, info, state);
1164
1165
// no trap if src >= or > threshold
1166
let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
1167
inst.emit(sink, info, state);
1168
1169
// If positive, it was a real overflow.
1170
1171
// Zero out the tmp_xmm register.
1172
asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);
1173
1174
let inst: AsmInst = match src_size {
1175
Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
1176
Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
1177
_ => unreachable!(),
1178
};
1179
inst.emit(sink, info, state);
1180
1181
// no trap if 0 >= src
1182
let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
1183
inst.emit(sink, info, state);
1184
}
1185
1186
sink.bind_label(done, state.ctrl_plane_mut());
1187
}
1188
1189
Inst::CvtFloatToUintSeq {
1190
src_size,
1191
dst_size,
1192
is_saturating,
1193
src,
1194
dst,
1195
tmp_gpr,
1196
tmp_xmm,
1197
tmp_xmm2,
1198
} => {
1199
use OperandSize::*;
1200
1201
let src = src.to_reg();
1202
let dst = dst.to_writable_reg();
1203
let tmp_gpr = tmp_gpr.to_writable_reg();
1204
let tmp_xmm = tmp_xmm.to_writable_reg();
1205
let tmp_xmm2 = tmp_xmm2.to_writable_reg();
1206
1207
// The only difference in behavior between saturating and non-saturating is how we
1208
// handle errors. Emits the following sequence:
1209
//
1210
// movaps/mov 2**(int_width - 1), %tmp_gpr
1211
// movq/movd %tmp_gpr, %tmp_xmm
1212
// cmpss/cmpsd %tmp_xmm, %src
1213
// jnb is_large
1214
//
1215
// ;; check for NaN inputs
1216
// jnp not_nan
1217
// -- non-saturating: ud2 trap BadConversionToInteger
1218
// -- saturating: xor %dst, %dst; j done
1219
//
1220
// not_nan:
1221
// cvttss2si/cvttsd2si %src, %dst
1222
// cmp 0, %dst
1223
// jnl done
1224
// -- non-saturating: ud2 trap IntegerOverflow
1225
// -- saturating: xor %dst, %dst; j done
1226
//
1227
// is_large:
1228
// mov %src, %tmp_xmm2
1229
// subss/subsd %tmp_xmm, %tmp_xmm2
1230
// cvttss2si/cvttsd2si %tmp_xmm2, %dst
1231
// cmp 0, %dst
1232
// jnl next_is_large
1233
// -- non-saturating: ud2 trap IntegerOverflow
1234
// -- saturating: movaps $UINT_MAX, %dst; j done
1235
//
1236
// next_is_large:
1237
// add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
1238
//
1239
// done:
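//
// In short: inputs below 2**(int_width - 1) fit in the signed range and use
// `cvtt` directly; larger inputs are first reduced by 2**(int_width - 1) in
// the FP domain so the signed conversion applies, and that bias is added back
// as an integer at `next_is_large`.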
1240
1241
assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");
1242
1243
let xor_op = |dst, src| Inst::External {
1244
inst: match *dst_size {
1245
Size32 => asm::inst::xorl_rm::new(dst, src).into(),
1246
Size64 => asm::inst::xorq_rm::new(dst, src).into(),
1247
_ => unreachable!(),
1248
},
1249
};
1250
1251
let subs_op = |dst, src| Inst::External {
1252
inst: match *src_size {
1253
Size32 => asm::inst::subss_a::new(dst, src).into(),
1254
Size64 => asm::inst::subsd_a::new(dst, src).into(),
1255
_ => unreachable!(),
1256
},
1257
};
1258
1259
let cvtt_op = |dst, src| Inst::External {
1260
inst: match (*src_size, *dst_size) {
1261
(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
1262
(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
1263
(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
1264
(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
1265
_ => unreachable!(),
1266
},
1267
};
1268
1269
let done = sink.get_label();
1270
1271
let cst = match src_size {
1272
OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
1273
OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
1274
_ => unreachable!(),
1275
};
1276
1277
let inst = Inst::imm(*src_size, cst, tmp_gpr);
1278
inst.emit(sink, info, state);
1279
1280
let inst: AsmInst = {
1281
let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
1282
match src_size {
1283
Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
1284
Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
1285
_ => unreachable!(),
1286
}
1287
};
1288
inst.emit(sink, info, state);
1289
1290
let inst: AsmInst = match src_size {
1291
Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
1292
Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
1293
_ => unreachable!(),
1294
};
1295
inst.emit(sink, info, state);
1296
1297
let handle_large = sink.get_label();
1298
one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
1299
1300
if *is_saturating {
1301
// If not NaN, jump over this 0-return; otherwise return 0.
1302
let not_nan = sink.get_label();
1303
one_way_jmp(sink, CC::NP, not_nan);
1304
1305
xor_op(dst, dst).emit(sink, info, state);
1306
1307
let inst = Inst::jmp_known(done);
1308
inst.emit(sink, info, state);
1309
sink.bind_label(not_nan, state.ctrl_plane_mut());
1310
} else {
1311
// Trap.
1312
let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
1313
inst.emit(sink, info, state);
1314
}
1315
1316
// Actual truncation for small inputs: if the result is not positive, then we had an
1317
// overflow.
1318
1319
cvtt_op(dst, src).emit(sink, info, state);
1320
1321
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1322
inst.emit(sink, info, state);
1323
1324
one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
1325
1326
if *is_saturating {
1327
// The input was "small" (< 2**(width -1)), so the only way to get an integer
1328
// overflow is because the input was too small: saturate to the min value, i.e. 0.
1329
let inst: AsmInst = match *dst_size {
1330
OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
1331
OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
1332
_ => unreachable!(),
1333
};
1334
inst.emit(sink, info, state);
1335
1336
let inst = Inst::jmp_known(done);
1337
inst.emit(sink, info, state);
1338
} else {
1339
// Trap.
1340
asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);
1341
}
1342
1343
// Now handle large inputs.
1344
1345
sink.bind_label(handle_large, state.ctrl_plane_mut());
1346
1347
let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
1348
inst.emit(sink, info, state);
1349
1350
subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);
1351
1352
cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);
1353
1354
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
1355
inst.emit(sink, info, state);
1356
1357
if *is_saturating {
1358
let next_is_large = sink.get_label();
1359
one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
1360
1361
// The input was "large" (>= 2**(width -1)), so the only way to get an integer
1362
// overflow is because the input was too large: saturate to the max value.
1363
let inst = Inst::imm(
1364
OperandSize::Size64,
1365
if *dst_size == OperandSize::Size64 {
1366
u64::max_value()
1367
} else {
1368
u32::max_value() as u64
1369
},
1370
dst,
1371
);
1372
inst.emit(sink, info, state);
1373
1374
let inst = Inst::jmp_known(done);
1375
inst.emit(sink, info, state);
1376
sink.bind_label(next_is_large, state.ctrl_plane_mut());
1377
} else {
1378
let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
1379
inst.emit(sink, info, state);
1380
}
1381
1382
if *dst_size == OperandSize::Size64 {
1383
let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
1384
inst.emit(sink, info, state);
1385
1386
asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);
1387
} else {
1388
asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);
1389
}
1390
1391
sink.bind_label(done, state.ctrl_plane_mut());
1392
}
1393
1394
Inst::LoadExtName {
1395
dst,
1396
name,
1397
offset,
1398
distance,
1399
} => {
1400
let name = &**name;
1401
let riprel = asm::Amode::RipRelative {
1402
target: asm::DeferredTarget::None,
1403
};
1404
if info.flags.is_pic() {
1405
// Generates: movq symbol@GOTPCREL(%rip), %dst
1406
asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);
1407
let cur = sink.cur_offset();
1408
sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);
1409
1410
// Offset in the relocation above applies to the address of the
1411
// *GOT entry*, not the loaded address; so we emit a separate
1412
// add instruction if needed.
1413
let offset = i32::try_from(*offset).unwrap();
1414
if offset != 0 {
1415
asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)
1416
.emit(sink, info, state);
1417
}
1418
} else if distance == &RelocDistance::Near {
1419
// If we know the distance to the name is within 2GB (e.g., a
1420
// module-local function), we can generate a RIP-relative
1421
// address, with a relocation.
1422
asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);
1423
let cur = sink.cur_offset();
1424
sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);
1425
} else {
1426
// The full address can be encoded in the register, with a
1427
// relocation.
1428
asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);
1429
let cur = sink.cur_offset();
1430
sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);
1431
}
1432
}
1433
1434
Inst::AtomicRmwSeq {
1435
ty,
1436
op,
1437
mem,
1438
operand,
1439
temp,
1440
dst_old,
1441
} => {
1442
let operand = *operand;
1443
let temp = *temp;
1444
let temp_r = temp.map(|r| *r);
1445
let dst_old = *dst_old;
1446
let dst_old_r = dst_old.map(|r| *r);
1447
debug_assert_eq!(dst_old.to_reg(), regs::rax());
1448
let mem = mem.finalize(state.frame_layout(), sink).clone();
1449
1450
// Emit this:
1451
// mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
1452
// again:
1453
// movq %rax, %r_temp // rax = old value, r_temp = old value
1454
// `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
1455
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
1456
// jnz again // If this is taken, rax will have a "revised" old value
1457
//
1458
// Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
1459
// value), %r_temp (trashed), %rflags (trashed)
1460
let again_label = sink.get_label();
1461
1462
// mov{zbq,zwq,zlq,q} (%r_address), %rax
1463
// No need to call `add_trap` here, since the `i1` emit will do that.
1464
let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);
1465
i1.emit(sink, info, state);
1466
1467
// again:
1468
sink.bind_label(again_label, state.ctrl_plane_mut());
1469
1470
// movq %rax, %r_temp
1471
asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);
1472
1473
use AtomicRmwSeqOp as RmwOp;
1474
match op {
1475
RmwOp::Nand => {
1476
// andq %r_operand, %r_temp
1477
asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1478
1479
// notq %r_temp
1480
asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);
1481
}
1482
RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1483
// cmp %r_temp, %r_operand
1484
let temp = temp.to_reg();
1485
match *ty {
1486
types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),
1487
types::I16 => {
1488
asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)
1489
}
1490
types::I32 => {
1491
asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)
1492
}
1493
types::I64 => {
1494
asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)
1495
}
1496
_ => unreachable!(),
1497
}
1498
1499
// cmovcc %r_operand, %r_temp
1500
match op {
1501
RmwOp::Umin => {
1502
asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)
1503
}
1504
RmwOp::Umax => {
1505
asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)
1506
}
1507
RmwOp::Smin => {
1508
asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)
1509
}
1510
RmwOp::Smax => {
1511
asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)
1512
}
1513
_ => unreachable!(),
1514
}
1515
}
1516
RmwOp::And => {
1517
// andq %r_operand, %r_temp
1518
asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
1519
}
1520
RmwOp::Or => {
1521
// orq %r_operand, %r_temp
1522
asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);
1523
}
1524
RmwOp::Xor => {
1525
// xorq %r_operand, %r_temp
1526
asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);
1527
}
1528
}
1529
1530
// lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
1531
// No need to call `add_trap` here, since the `i4` emit will do that.
1532
let temp = temp.to_reg();
1533
let dst_old = PairedGpr::from(dst_old);
1534
let inst: AsmInst = match *ty {
1535
types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),
1536
types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),
1537
types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),
1538
types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),
1539
_ => unreachable!(),
1540
};
1541
inst.emit(sink, info, state);
1542
1543
// jnz again
1544
one_way_jmp(sink, CC::NZ, again_label);
1545
}
1546
1547
Inst::Atomic128RmwSeq {
1548
op,
1549
mem,
1550
operand_low,
1551
operand_high,
1552
temp_low,
1553
temp_high,
1554
dst_old_low,
1555
dst_old_high,
1556
} => {
1557
let operand_low = *operand_low;
1558
let operand_high = *operand_high;
1559
let temp_low = *temp_low;
1560
let temp_high = *temp_high;
1561
let dst_old_low = *dst_old_low;
1562
let dst_old_high = *dst_old_high;
1563
debug_assert_eq!(temp_low.to_reg(), regs::rbx());
1564
debug_assert_eq!(temp_high.to_reg(), regs::rcx());
1565
debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
1566
debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
1567
let mem = mem.finalize(state.frame_layout(), sink).clone();
1568
1569
let again_label = sink.get_label();
1570
1571
// Load the initial value.
1572
asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
1573
asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);
1574
1575
// again:
1576
sink.bind_label(again_label, state.ctrl_plane_mut());
1577
1578
// Move old value to temp registers.
1579
asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);
1580
asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);
1581
1582
// Perform the operation.
1583
use Atomic128RmwSeqOp as RmwOp;
1584
match op {
1585
RmwOp::Nand => {
1586
// temp &= operand
1587
asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1588
asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1589
1590
// temp = !temp
1591
asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);
1592
asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);
1593
}
1594
RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
1595
// Do a comparison with LHS temp and RHS operand.
1596
// Note the opposite argument orders.
1597
asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);
1598
// This will clobber `temp_high`
1599
asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1600
// Restore the clobbered value
1601
asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())
1602
.emit(sink, info, state);
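// Together the `cmp`/`sbb` pair performs a full 128-bit subtraction purely for
// its flag effects (CF/SF/OF then reflect the 128-bit comparison), which the
// `cmov`s below consult; the subtraction's high half landed in `temp_high`,
// hence the restore just above.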
1603
match op {
1604
RmwOp::Umin => {
1605
asm::inst::cmovaeq_rm::new(temp_low, operand_low)
1606
.emit(sink, info, state);
1607
asm::inst::cmovaeq_rm::new(temp_high, operand_high)
1608
.emit(sink, info, state);
1609
}
1610
RmwOp::Umax => {
1611
asm::inst::cmovbq_rm::new(temp_low, operand_low)
1612
.emit(sink, info, state);
1613
asm::inst::cmovbq_rm::new(temp_high, operand_high)
1614
.emit(sink, info, state);
1615
}
1616
RmwOp::Smin => {
1617
asm::inst::cmovgeq_rm::new(temp_low, operand_low)
1618
.emit(sink, info, state);
1619
asm::inst::cmovgeq_rm::new(temp_high, operand_high)
1620
.emit(sink, info, state);
1621
}
1622
RmwOp::Smax => {
1623
asm::inst::cmovlq_rm::new(temp_low, operand_low)
1624
.emit(sink, info, state);
1625
asm::inst::cmovlq_rm::new(temp_high, operand_high)
1626
.emit(sink, info, state);
1627
}
1628
_ => unreachable!(),
1629
}
1630
}
1631
RmwOp::Add => {
1632
asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);
1633
asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);
1634
}
1635
RmwOp::Sub => {
1636
asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);
1637
asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
1638
}
1639
RmwOp::And => {
1640
asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
1641
asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
1642
}
1643
RmwOp::Or => {
1644
asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);
1645
asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);
1646
}
1647
RmwOp::Xor => {
1648
asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);
1649
asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);
1650
}
1651
}
1652
1653
// cmpxchg16b (mem)
1654
asm::inst::lock_cmpxchg16b_m::new(
1655
PairedGpr::from(dst_old_low),
1656
PairedGpr::from(dst_old_high),
1657
temp_low.to_reg(),
1658
temp_high.to_reg(),
1659
mem,
1660
)
1661
.emit(sink, info, state);
1662
1663
// jnz again
1664
one_way_jmp(sink, CC::NZ, again_label);
        }

        Inst::Atomic128XchgSeq {
            mem,
            operand_low,
            operand_high,
            dst_old_low,
            dst_old_high,
        } => {
            let operand_low = *operand_low;
            let operand_high = *operand_high;
            let dst_old_low = *dst_old_low;
            let dst_old_high = *dst_old_high;
            debug_assert_eq!(operand_low, regs::rbx());
            debug_assert_eq!(operand_high, regs::rcx());
            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
            let mem = mem.finalize(state.frame_layout(), sink).clone();

            let again_label = sink.get_label();

            // Load the initial value.
            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);

            // again:
            sink.bind_label(again_label, state.ctrl_plane_mut());

            // cmpxchg16b (mem)
            asm::inst::lock_cmpxchg16b_m::new(
                PairedGpr::from(dst_old_low),
                PairedGpr::from(dst_old_high),
                operand_low,
                operand_high,
                mem,
            )
            .emit(sink, info, state);

            // jnz again
            one_way_jmp(sink, CC::NZ, again_label);
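
            // Even a plain exchange needs this retry loop: `lock cmpxchg16b`
            // only stores rcx:rbx (the operand pair) when rdx:rax matches the
            // 16-byte memory operand, so if another thread wrote `mem` between
            // our initial load and the cmpxchg, the current value is reloaded
            // into dst_old_high:dst_old_low and we try again.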
        }

        Inst::ElfTlsGetAddr { symbol, dst } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());

            // N.B.: Must be exactly this byte sequence; the linker requires it,
            // because it must know how to rewrite the bytes.

            // data16 lea gv@tlsgd(%rip),%rdi
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0x8d); // LEA
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
            sink.put4(0); // offset

            // data16 data16 callq __tls_get_addr-4
            sink.put1(0x66); // data16
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0xe8); // CALL
            emit_reloc(
                sink,
                Reloc::X86CallPLTRel4,
                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
                -4,
            );
            sink.put4(0); // offset
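
            // Altogether this is the standard 16-byte x86-64 ELF General
            // Dynamic TLS sequence (displacements are zero here and patched by
            // the relocations above):
            //
            //     66 48 8d 3d 00 00 00 00    data16 lea gv@tlsgd(%rip), %rdi
            //     66 66 48 e8 00 00 00 00    data16 data16 rex.W call __tls_get_addr@PLT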
        }

        Inst::MachOTlsGetAddr { symbol, dst } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());

            // movq gv@tlv(%rip), %rdi
            sink.put1(0x48); // REX.W
            sink.put1(0x8b); // MOV
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
            sink.put4(0); // offset

            asm::inst::callq_m::new(asm::Amode::ImmReg {
                base: Gpr::RDI,
                simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,
                trap: None,
            })
            .emit(sink, info, state);
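
            // The `callq_m` above encodes an indirect `callq *(%rdi)`: on
            // Mach-O the relocation resolves to a TLV descriptor whose first
            // word is the thread-local accessor function, which returns the
            // variable's address in %rax.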
        }

        Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());

            // tmp is used below directly as %rcx
            let tmp = tmp.to_reg().to_reg();
            debug_assert_eq!(tmp, regs::rcx());

            // See: https://gcc.godbolt.org/z/M8or9x6ss
            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282

            // Emit the following sequence:
            //   movl (%rip), %eax        ; IMAGE_REL_AMD64_REL32 _tls_index
            //   movq %gs:88, %rcx
            //   movq (%rcx,%rax,8), %rax
            //   leaq (%rax), %rax        ; Reloc: IMAGE_REL_AMD64_SECREL symbol

            // Load TLS index for current thread
            // movl (%rip), %eax
            sink.put1(0x8b); // mov
            sink.put1(0x05);
            emit_reloc(
                sink,
                Reloc::X86PCRel4,
                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
                -4,
            );
            sink.put4(0); // offset

            // movq %gs:88, %rcx
            // Load the TLS Storage Array pointer.
            // The gs segment register refers to the base address of the TEB on x64.
            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
            sink.put_data(&[
                0x65, // GS segment-override prefix
                0x48, // REX.W
                0x8b, // MOV
                0x0c, 0x25, // ModRM + SIB (disp32 addressing, %rcx destination)
                0x58, 0x00, 0x00, 0x00, // 0x58 - ThreadLocalStoragePointer offset
            ]);

            // movq (%rcx,%rax,8), %rax
            // Load the actual TLS entry for this thread.
            // Computes ThreadLocalStoragePointer + _tls_index*8
            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);

            // leaq (%rax), %rax
            sink.put1(0x48);
            sink.put1(0x8d);
            sink.put1(0x80);
            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
            sink.put4(0); // offset
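
            // After the relocated `leaq`, %rax holds the address of `symbol`
            // for the current thread: the TLS block base loaded above plus the
            // symbol's IMAGE_REL_AMD64_SECREL offset within that block.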
        }

        Inst::Unwind { inst } => {
            sink.add_unwind(inst.clone());
        }

        Inst::DummyUse { .. } => {
            // Nothing.
        }

        Inst::LabelAddress { dst, label } => {
            // Emit an LEA with a LabelUse given this label.
            asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);
        }

        Inst::SequencePoint { .. } => {
            // Nothing.
        }

        Inst::External { inst } => {
            let frame = state.frame_layout();
            emit_maybe_shrink(
                inst,
                &mut external::AsmCodeSink {
                    sink,

                    // These values are transcribed from what is happening in
                    // `SyntheticAmode::finalize`. This, plus the `Into` logic
                    // converting a `SyntheticAmode` to its external counterpart, are
                    // necessary to communicate Cranelift's internal offsets to the
                    // assembler; due to when Cranelift determines these offsets, this
                    // happens quite late (i.e., here during emission).
                    incoming_arg_offset: i32::try_from(
                        frame.tail_args_size + frame.setup_area_size,
                    )
                    .unwrap(),
                    slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),
                },
            );
        }
    }

    state.clear_post_insn();
}

/// Emit the common sequence used for both direct and indirect tail calls:
///
/// * Copy the new frame's stack arguments over the top of our current frame.
///
/// * Restore the old frame pointer.
///
/// * Initialize the tail callee's stack pointer (simultaneously deallocating
///   the temporary stack space we allocated when creating the new frame's
///   stack arguments).
///
/// * Move the return address into its stack slot.
fn emit_return_call_common_sequence<T>(
    sink: &mut MachBuffer<Inst>,
    info: &EmitInfo,
    state: &mut EmitState,
    call_info: &ReturnCallInfo<T>,
) {
    assert!(
        info.flags.preserve_frame_pointers(),
        "frame pointers aren't fundamentally required for tail calls, \
         but the current implementation relies on them being present"
    );

    let tmp = call_info.tmp.to_writable_reg();

    for inst in
        X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
    {
        inst.emit(sink, info, state);
    }

    for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
        CallConv::Tail,
        &info.flags,
        &info.isa_flags,
        state.frame_layout(),
    ) {
        inst.emit(sink, info, state);
    }

    let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
    if incoming_args_diff > 0 {
        // Move the saved return address up by `incoming_args_diff`.
        let addr = Amode::imm_reg(0, regs::rsp());
        asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);
        asm::inst::movq_mr::new(
            Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
            Gpr::unwrap_new(tmp.to_reg()),
        )
        .emit(sink, info, state);

        // Increment the stack pointer to shrink the argument area for the new
        // call.
        let rsp = Writable::from_reg(regs::rsp());
        let incoming_args_diff = i32::try_from(incoming_args_diff)
            .expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");
        Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);
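
        // Worked example: if this frame reserved 32 bytes of tail-call
        // argument space but the callee only needs 16, `incoming_args_diff`
        // is 16, the saved return address is copied from (%rsp) to 16(%rsp),
        // and %rsp is then bumped by 16 so the callee sees a correctly sized
        // argument area.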
    }
}

/// Convenience trait to have an `emit` method on all `asm::inst::*` variants.
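///
/// This lets the emission code above write, e.g.,
/// `asm::inst::addq_rm::new(dst, src).emit(sink, info, state)` rather than
/// wrapping every assembler instruction in `Inst::External` by hand.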
trait ExternalEmit {
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);
}

impl<I> ExternalEmit for I
where
    I: Into<asm::inst::Inst<CraneliftRegisters>>,
{
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {
        Inst::External { inst: self.into() }.emit(sink, info, state)
    }
}

/// Attempt to "shrink" the provided `inst`.
///
/// This function inspects `inst` and, where possible, emits a semantically
/// equivalent instruction with a smaller binary encoding. Only shrinking
/// opportunities that depend on register allocation having already happened
/// are handled here; for example, shrinking immediates should be done during
/// instruction selection, not at this point.
///
/// An example of this optimization is the `AND` instruction. The Intel manual
/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8, imm8`.
/// The instructions below are matched against and, if regalloc placed the
/// operand in the right register, the smaller variant is encoded instead.
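///
/// As an illustrative sketch of the size win (not emitted verbatim here):
///
/// ```text
/// and al, 0x7f    ; 24 7F      (2-byte AL-specific form)
/// and bl, 0x7f    ; 80 E3 7F   (3-byte generic r/m8 form)
/// ```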
fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {
    use cranelift_assembler_x64::GprMem;
    use cranelift_assembler_x64::inst::*;

    type R = CraneliftRegisters;
    const RAX: PairedGpr = PairedGpr {
        read: Gpr::RAX,
        write: Writable::from_reg(Gpr::RAX),
    };
    const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);

    match *inst {
        // and
        Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::andw_mi(andw_mi {
            rm16: RAX_RM,
            imm16,
        }) => andw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::andl_mi(andl_mi {
            rm32: RAX_RM,
            imm32,
        }) => andl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::andq_mi_sxl(andq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // or
        Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::orw_mi(orw_mi {
            rm16: RAX_RM,
            imm16,
        }) => orw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::orl_mi(orl_mi {
            rm32: RAX_RM,
            imm32,
        }) => orl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::orq_mi_sxl(orq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // xor
        Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::xorw_mi(xorw_mi {
            rm16: RAX_RM,
            imm16,
        }) => xorw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::xorl_mi(xorl_mi {
            rm32: RAX_RM,
            imm32,
        }) => xorl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::xorq_mi_sxl(xorq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // add
        Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::addw_mi(addw_mi {
            rm16: RAX_RM,
            imm16,
        }) => addw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::addl_mi(addl_mi {
            rm32: RAX_RM,
            imm32,
        }) => addl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::addq_mi_sxl(addq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // adc
        Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::adcw_mi(adcw_mi {
            rm16: RAX_RM,
            imm16,
        }) => adcw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::adcl_mi(adcl_mi {
            rm32: RAX_RM,
            imm32,
        }) => adcl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::adcq_mi_sxl(adcq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // sub
        Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::subw_mi(subw_mi {
            rm16: RAX_RM,
            imm16,
        }) => subw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::subl_mi(subl_mi {
            rm32: RAX_RM,
            imm32,
        }) => subl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::subq_mi_sxl(subq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // sbb
        Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::sbbw_mi(sbbw_mi {
            rm16: RAX_RM,
            imm16,
        }) => sbbw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::sbbl_mi(sbbl_mi {
            rm32: RAX_RM,
            imm32,
        }) => sbbl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::sbbq_mi_sxl(sbbq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // cmp
        Inst::cmpb_mi(cmpb_mi {
            rm8: GprMem::Gpr(Gpr::RAX),
            imm8,
        }) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
        Inst::cmpw_mi(cmpw_mi {
            rm16: GprMem::Gpr(Gpr::RAX),
            imm16,
        }) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
        Inst::cmpl_mi(cmpl_mi {
            rm32: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
        Inst::cmpq_mi(cmpq_mi {
            rm64: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

        // test
        Inst::testb_mi(testb_mi {
            rm8: GprMem::Gpr(Gpr::RAX),
            imm8,
        }) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
        Inst::testw_mi(testw_mi {
            rm16: GprMem::Gpr(Gpr::RAX),
            imm16,
        }) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
        Inst::testl_mi(testl_mi {
            rm32: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
        Inst::testq_mi(testq_mi {
            rm64: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

        // lea
        Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(
            r32,
            m32,
            sink,
            |dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),
            |dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),
            |dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),
        ),
        Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(
            r64,
            m64,
            sink,
            |dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),
            |dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),
            |dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),
        ),

        // All other instructions fall through to here and cannot be shrunk,
        // so they are encoded as usual.
        _ => inst.encode(sink),
    }
}

/// If `lea` can actually get encoded as an `add` then do that instead.
///
/// Currently all candidate `iadd`s become an `lea` pseudo-instruction during
/// lowering, but maximizing the use of `lea` is not necessarily optimal. The
/// `lea` instruction executes on dedicated address-generation units, which are
/// few in number and disjoint from the general ALUs, so if everything uses
/// `lea` those units can get saturated while the ALUs sit idle.
///
/// To help make use of more parts of the CPU, this attempts to use `add` when
/// it is semantically equivalent to the `lea`: when the addressing mode is a
/// simple base+immediate or base+index and the `dst` register is the same as
/// the `base` or `index` register.
///
/// FIXME: ideally regalloc would be informed of this constraint. Register
/// allocation of `lea` should "attempt" to put the `base` in the same register
/// as `dst`, but not at the expense of generating a `mov` instruction.
/// Currently that's not possible, but perhaps one day it may be worth it.
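///
/// For example (illustrative only; the rewrites operate on the assembler types
/// below):
///
/// ```text
/// lea rax, [rax + 16]      ->  add rax, 16     ; dst == base
/// lea rax, [rbx + rax]     ->  add rax, rbx    ; dst == index, scale 1
/// lea rax, [rbx + rcx*4]   ->  left as lea     ; no single-add equivalent
/// ```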
fn emit_lea<S>(
    dst: asm::Gpr<WritableGpr>,
    addr: asm::Amode<Gpr>,
    sink: &mut S,
    lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),
    add_mi: fn(PairedGpr, i32, &mut S),
    add_rm: fn(PairedGpr, Gpr, &mut S),
) where
    S: asm::CodeSink,
{
    match addr {
        // If `base == dst` then this is `add dst, $imm`, so encode that
        // instead.
        asm::Amode::ImmReg {
            base,
            simm32:
                asm::AmodeOffsetPlusKnownOffset {
                    simm32,
                    offset: None,
                },
            trap: None,
        } if dst.as_ref().to_reg() == base => add_mi(
            PairedGpr {
                read: base,
                write: *dst.as_ref(),
            },
            simm32.value(),
            sink,
        ),

        // If the offset is 0 and the shift is a scale of 1, then:
        //
        // * If `base == dst`, then this is `addq dst, index`
        // * If `index == dst`, then this is `addq dst, base`
        asm::Amode::ImmRegRegShift {
            base,
            index,
            scale: asm::Scale::One,
            simm32: asm::AmodeOffset::ZERO,
            trap: None,
        } => {
            if dst.as_ref().to_reg() == base {
                add_rm(
                    PairedGpr {
                        read: base,
                        write: *dst.as_ref(),
                    },
                    *index.as_ref(),
                    sink,
                )
            } else if dst.as_ref().to_reg() == *index.as_ref() {
                add_rm(
                    PairedGpr {
                        read: *index.as_ref(),
                        write: *dst.as_ref(),
                    },
                    base,
                    sink,
                )
            } else {
                lea(*dst.as_ref(), addr, sink)
            }
        }

        _ => lea(*dst.as_ref(), addr, sink),
    }
}