Path: cranelift/codegen/src/isa/x64/inst/emit.rs
use crate::ir::KnownSymbol;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
use cranelift_assembler_x64 as asm;

/// A small helper to generate a signed conversion instruction.
fn emit_signed_cvt(
    sink: &mut MachBuffer<Inst>,
    info: &EmitInfo,
    state: &mut EmitState,
    src: Reg,
    dst: Writable<Reg>,
    to_f64: bool,
) {
    assert!(src.is_real());
    assert!(dst.to_reg().is_real());

    // Handle an unsigned int, which is the "easy" case: a signed conversion
    // will do the right thing.
    let dst = WritableXmm::from_writable_reg(dst).unwrap();
    if to_f64 {
        asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);
    } else {
        asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);
    }
}

/// Emits a one-way conditional jump if CC is set (true).
fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
    let cond_start = sink.cur_offset();
    let cond_disp_off = cond_start + 2;
    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
    emit_jcc_no_offset(sink, cc);
    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
}

/// Like `one_way_jmp` above, emitting a conditional jump, but also
/// registering the branch via `MachBuffer::add_cond_branch`.
fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
    let cond_start = sink.cur_offset();
    let cond_disp_off = cond_start + 2;
    let cond_end = cond_start + 6;

    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
    // FIXME: ideally this `inverted` calculation would go through the external
    // assembler, but for now it's done manually.
    let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
    sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);

    emit_jcc_no_offset(sink, cc);

    debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
    debug_assert_eq!(sink.cur_offset(), cond_end);
}

fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
    // Note that the disassembler matches Capstone, which doesn't match the
    // `CC` enum directly, as Intel has multiple mnemonics that use the same
    // encoding.
    let inst: AsmInst = match cc {
        CC::Z => asm::inst::je_d32::new(0).into(),   // jz == je
        CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne
        CC::B => asm::inst::jb_d32::new(0).into(),
        CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae
        CC::BE => asm::inst::jbe_d32::new(0).into(),
        CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja
        CC::L => asm::inst::jl_d32::new(0).into(),
        CC::LE => asm::inst::jle_d32::new(0).into(),
        CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge
        CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg
        CC::O => asm::inst::jo_d32::new(0).into(),
        CC::NO => asm::inst::jno_d32::new(0).into(),
        CC::P => asm::inst::jp_d32::new(0).into(),
        CC::NP => asm::inst::jnp_d32::new(0).into(),
        CC::S => asm::inst::js_d32::new(0).into(),
        CC::NS => asm::inst::jns_d32::new(0).into(),
    };
    inst.encode(&mut external::AsmCodeSink {
        sink,
        incoming_arg_offset: 0,
        slot_offset: 0,
    });
}
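
// Encoding note for the helpers above and below: a `jcc rel32` is `0F 8x`
// followed by a 4-byte displacement (6 bytes total, displacement at offset 2),
// while a `jmp rel32` is `E9` plus a 4-byte displacement (5 bytes total,
// displacement at offset 1); the `cur_offset() + N` arithmetic in these
// helpers mirrors those layouts.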

/// Emits an unconditional branch.
fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
    let uncond_start = sink.cur_offset();
    let uncond_disp_off = uncond_start + 1;
    let uncond_end = uncond_start + 5;

    sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);
    sink.add_uncond_branch(uncond_start, uncond_end, label);

    asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {
        sink,
        incoming_arg_offset: 0,
        slot_offset: 0,
    });
    debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);
    debug_assert_eq!(sink.cur_offset(), uncond_end);
}

/// Emits a relocation, attaching the current source location as well.
fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
    sink.add_reloc(kind, name, addend);
}

/// The top-level emit function.
///
/// Important! Do not add improved (shortened) encoding cases to existing
/// instructions without also adding tests for those improved encodings. That
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
/// code.
///
/// For all instructions, make sure to have test coverage for all of the
/// following situations. Do this by creating the cross product resulting from
/// applying the following rules to each operand:
///
/// (1) for any insn that mentions a register: one test using a register from
///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
///     This helps detect incorrect REX prefix construction.
///
/// (2) for any insn that mentions a byte register: one test for each of the
///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
///     [r8b .. r11b] and [r12b .. r15b]. This checks that
///     apparently-redundant REX prefixes are retained when required.
///
/// (3) for any insn that contains an immediate field, check the following
///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
///     instructions that require a 32-bit immediate have a short-form encoding
///     when the imm is in simm8 range.
///
/// Rules (1), (2) and (3) don't apply for registers within address expressions
/// (`Addr`s). Those are already pretty well tested, and the registers in them
/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
///
/// When choosing registers for a test, avoid using registers with the same
/// offset within a given group. For example, don't use rax and r8, since they
/// both have the lowest 3 bits as 000, and so the test won't detect errors
/// where those 3-bit register sub-fields are confused by the emitter. Instead
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
/// and bpl since they have the same offset in their group; use instead (eg) cl
/// and sil.
///
/// For all instructions, also add a test that uses only low-half registers
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
/// prefixes are correctly omitted. This low-half restriction must apply to
/// _all_ registers in the insn, even those in address expressions.
///
/// Following these rules creates large numbers of test cases, but it's the
/// only way to make the emitter reliable.
///
/// Known possible improvements:
///
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
///   care?)
pub(crate) fn emit(
    inst: &Inst,
    sink: &mut MachBuffer<Inst>,
    info: &EmitInfo,
    state: &mut EmitState,
) {
    if !inst.is_available(&info) {
        let features = if let Inst::External { inst } = inst {
            inst.features().to_string()
        } else {
            "see `is_available` source for feature term".to_string()
        };
        panic!(
            "Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"
        );
    }

    match inst {
        Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
            // Validate that the register constraints of the dividend and the
            // destination are all as expected.
            let (dst, size) = match inst {
                Inst::CheckedSRemSeq {
                    dividend_lo,
                    dividend_hi,
                    dst_quotient,
                    dst_remainder,
                    size,
                    ..
                } => {
                    let dividend_lo = dividend_lo.to_reg();
                    let dividend_hi = dividend_hi.to_reg();
                    let dst_quotient = dst_quotient.to_reg().to_reg();
                    let dst_remainder = dst_remainder.to_reg().to_reg();
                    debug_assert_eq!(dividend_lo, regs::rax());
                    debug_assert_eq!(dividend_hi, regs::rdx());
                    debug_assert_eq!(dst_quotient, regs::rax());
                    debug_assert_eq!(dst_remainder, regs::rdx());
                    (regs::rdx(), *size)
                }
                Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
                    let dividend = dividend.to_reg();
                    let dst = dst.to_reg().to_reg();
                    debug_assert_eq!(dividend, regs::rax());
                    debug_assert_eq!(dst, regs::rax());
                    (regs::rax(), OperandSize::Size8)
                }
                _ => unreachable!(),
            };

            // Generates the following code sequence:
            //
            // cmp -1 %divisor
            // jnz $do_op
            //
            // ;; for srem, result is 0
            // mov #0, %dst
            // j $done
            //
            // $do_op:
            // idiv %divisor
            //
            // $done:
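
            // Why the special case: hardware `idiv` faults not only on a zero
            // divisor but also when the quotient overflows (INT_MIN / -1),
            // and srem's remainder in that case is 0, so branching on a -1
            // divisor both avoids the fault and produces the correct result.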

            let do_op = sink.get_label();
            let done_label = sink.get_label();

            // Check if the divisor is -1, and if it isn't then immediately
            // go to the `idiv`.
            let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
            inst.emit(sink, info, state);
            one_way_jmp(sink, CC::NZ, do_op);

            // ... otherwise the divisor is -1 and the result is always 0. This
            // is written to the destination register, which will be %rax for
            // 8-bit srem and %rdx otherwise.
            //
            // Note that for 16-to-64-bit srem operations this leaves the
            // second destination, %rax, unchanged. This isn't semantically
            // correct if a lowering actually tries to use the `dst_quotient`
            // output, but for srem only the `dst_remainder` output is used for
            // now.
            let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
            inst.emit(sink, info, state);
            let inst = Inst::jmp_known(done_label);
            inst.emit(sink, info, state);

            // Here the `idiv` is executed, which differs depending on the
            // size.
            sink.bind_label(do_op, state.ctrl_plane_mut());
            let rax = Gpr::RAX;
            let rdx = Gpr::RDX;
            let writable_rax = Writable::from_reg(rax);
            let writable_rdx = Writable::from_reg(rdx);
            let inst: AsmInst = match size {
                OperandSize::Size8 => asm::inst::idivb_m::new(
                    PairedGpr::from(writable_rax),
                    *divisor,
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
                )
                .into(),

                OperandSize::Size16 => asm::inst::idivw_m::new(
                    PairedGpr::from(writable_rax),
                    PairedGpr::from(writable_rdx),
                    *divisor,
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
                )
                .into(),

                OperandSize::Size32 => asm::inst::idivl_m::new(
                    PairedGpr::from(writable_rax),
                    PairedGpr::from(writable_rdx),
                    *divisor,
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
                )
                .into(),

                OperandSize::Size64 => asm::inst::idivq_m::new(
                    PairedGpr::from(writable_rax),
                    PairedGpr::from(writable_rdx),
                    *divisor,
                    TrapCode::INTEGER_DIVISION_BY_ZERO,
                )
                .into(),
            };
            inst.emit(sink, info, state);

            sink.bind_label(done_label, state.ctrl_plane_mut());
        }

        Inst::MovFromPReg { src, dst } => {
            let src: Reg = (*src).into();
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
            asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);
        }

        Inst::MovToPReg { src, dst } => {
            let dst: Reg = (*dst).into();
            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
            asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);
        }

        Inst::XmmCmove {
            ty,
            cc,
            consequent,
            alternative,
            dst,
        } => {
            let alternative = *alternative;
            let dst = *dst;
            debug_assert_eq!(alternative, dst.to_reg());
            let consequent = *consequent;

            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
            // this doesn't clobber flags. Make sure to not do so here.
            let next = sink.get_label();

            // Jump if cc is *not* set.
            one_way_jmp(sink, cc.invert(), next);
            Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
                .emit(sink, info, state);

            sink.bind_label(next, state.ctrl_plane_mut());
        }

        Inst::StackProbeLoop {
            tmp,
            frame_size,
            guard_size,
        } => {
            assert!(info.flags.enable_probestack());
            assert!(guard_size.is_power_of_two());

            let tmp = *tmp;

            // Number of probes that we need to perform
            let probe_count = align_to(*frame_size, *guard_size) / guard_size;
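
            // For example, with `frame_size = 0x11000` and a 4 KiB
            // `guard_size`, `align_to` rounds up to 0x11000 and this performs
            // 17 probes, one per page.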

            // The inline stack probe loop has 3 phases:
            //
            // We generate the "guard area" register, which is essentially the frame_size aligned to
            // guard_size. We copy the stack pointer and subtract the guard area from it. This
            // gets us a register that we can use to compare against while looping.
            //
            // After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
            // distance at a time and then touch the stack by writing anything to it. We use the previously
            // created "guard area" register to know when to stop looping.
            //
            // When we have touched all the pages that we need, we have to restore the stack pointer
            // to where it was before.
            //
            // Generate the following code:
            // mov tmp_reg, rsp
            // sub tmp_reg, guard_size * probe_count
            // .loop_start:
            //     sub rsp, guard_size
            //     mov [rsp], 0
            //     cmp rsp, tmp_reg
            //     jne .loop_start
            // add rsp, guard_size * probe_count

            // Create the guard bound register
            // mov tmp_reg, rsp
            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
            inst.emit(sink, info, state);

            // sub tmp_reg, GUARD_SIZE * probe_count
            let guard_plus_count = i32::try_from(guard_size * probe_count)
                .expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");
            Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);

            // Emit the main loop!
            let loop_start = sink.get_label();
            sink.bind_label(loop_start, state.ctrl_plane_mut());

            // sub rsp, GUARD_SIZE
            let rsp = Writable::from_reg(regs::rsp());
            let guard_size_ = i32::try_from(*guard_size)
                .expect("`guard_size` is too large to fit in a 32-bit immediate");
            Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);

            // Touch the current page by storing an immediate zero.
            // mov [rsp], 0
            asm::inst::movl_mi::new(Amode::imm_reg(0, regs::rsp()), 0i32.cast_unsigned())
                .emit(sink, info, state);

            // Compare and jump if we are not done yet.
            // cmp rsp, tmp_reg
            let tmp = Gpr::unwrap_new(tmp.to_reg());
            asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);

            // jne .loop_start
            // TODO: Encoding the conditional jump as a short jump
            // could save us 4 bytes here.
            one_way_jmp(sink, CC::NZ, loop_start);
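
            // (A `jcc rel8` encodes in 2 bytes versus the 6-byte `jcc rel32`
            // that `one_way_jmp` emits, which is where the 4-byte saving in
            // the TODO above would come from.)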

            // The regular prologue code is going to emit a `sub` after this, so we need to
            // reset the stack pointer.
            //
            // TODO: It would be better if we could avoid the `add` + `sub` that is generated here
            // and in the stack adj portion of the prologue
            //
            // add rsp, GUARD_SIZE * probe_count
            Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);
        }

        Inst::CallKnown { info: call_info } => {
            let start = sink.cur_offset();
            let stack_map = state.take_stack_map();

            asm::inst::callq_d::new(0).emit(sink, info, state);

            // The last 4 bytes of `callq` are the relative displacement to where
            // we're calling, so that's where the reloc is registered.
            //
            // The addend adjusts for the difference between the end of the
            // instruction and the beginning of the immediate field.
            let len = sink.cur_offset();
            sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);

            if let Some(s) = stack_map {
                sink.push_user_stack_map(state, len, s);
            }

            if let Some(try_call) = call_info.try_call_info.as_ref() {
                sink.add_try_call_site(
                    Some(state.frame_layout().sp_to_fp()),
                    try_call.exception_handlers(&state.frame_layout()),
                );
            } else {
                sink.add_call_site();
            }

            // Reclaim the outgoing argument area that was released by the
            // callee, to ensure that StackAMode values are always computed from
            // a consistent SP.
            if call_info.callee_pop_size > 0 {
                let rsp = Writable::from_reg(regs::rsp());
                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
            }

            if call_info.patchable {
                sink.add_patchable_call_site(sink.cur_offset() - start);
            } else {
                // Load any stack-carried return values.
                call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
                    state.frame_layout().stackslots_size,
                    |inst| inst.emit(sink, info, state),
                    |_space_needed| None,
                );
            }

            // If this is a try-call, jump to the continuation
            // (normal-return) block.
            if let Some(try_call) = call_info.try_call_info.as_ref() {
                let jmp = Inst::JmpKnown {
                    dst: try_call.continuation,
                };
                jmp.emit(sink, info, state);
            }
        }

        Inst::ReturnCallKnown { info: call_info } => {
            emit_return_call_common_sequence(sink, info, state, &call_info);

            // Finally, jump to the callee!
            //
            // Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
            // different metadata in this case: we don't have a label for the
            // target, but rather a function relocation.
            asm::inst::jmp_d32::new(0).emit(sink, info, state);
            let offset = sink.cur_offset();
            // The addend adjusts for the difference between the end of the instruction and the
            // beginning of the immediate field.
            sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);
            sink.add_call_site();
        }

        Inst::ReturnCallUnknown { info: call_info } => {
            let callee = call_info.dest;

            emit_return_call_common_sequence(sink, info, state, &call_info);

            asm::inst::jmpq_m::new(callee).emit(sink, info, state);
            sink.add_call_site();
        }

        Inst::CallUnknown {
            info: call_info, ..
        } => {
            let stack_map = state.take_stack_map();

            let dest = match call_info.dest.clone() {
                RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),
                RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
            };

            asm::inst::callq_m::new(dest).emit(sink, info, state);

            if let Some(s) = stack_map {
                let offset = sink.cur_offset();
                sink.push_user_stack_map(state, offset, s);
            }

            if let Some(try_call) = call_info.try_call_info.as_ref() {
                sink.add_try_call_site(
                    Some(state.frame_layout().sp_to_fp()),
                    try_call.exception_handlers(&state.frame_layout()),
                );
            } else {
                sink.add_call_site();
            }

            // Reclaim the outgoing argument area that was released by the callee, to ensure that
            // StackAMode values are always computed from a consistent SP.
            if call_info.callee_pop_size > 0 {
                let rsp = Writable::from_reg(regs::rsp());
                let callee_pop_size = i32::try_from(call_info.callee_pop_size)
                    .expect("`callee_pop_size` is too large to fit in a 32-bit immediate");
                Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);
            }

            // Load any stack-carried return values.
            call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(
                state.frame_layout().stackslots_size,
                |inst| inst.emit(sink, info, state),
                |_space_needed| None,
            );

            if let Some(try_call) = call_info.try_call_info.as_ref() {
                let jmp = Inst::JmpKnown {
                    dst: try_call.continuation,
                };
                jmp.emit(sink, info, state);
            }
        }
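
        // `Args` and `Rets` are pseudo-instructions: they exist only to
        // communicate the ABI's argument and return-value constraints to
        // register allocation, so no machine code is emitted for them.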
        Inst::Args { .. } => {}
        Inst::Rets { .. } => {}

        Inst::StackSwitchBasic {
            store_context_ptr,
            load_context_ptr,
            in_payload0,
            out_payload0,
        } => {
            // Note that we do not emit anything for preserving and restoring
            // ordinary registers here: That's taken care of by regalloc for us,
            // since we marked this instruction as clobbering all registers.
            //
            // Also note that we do nothing about passing the single payload
            // value: We've informed regalloc that it is sent and received via
            // the fixed register given by [stack_switch::payload_register].

            let (tmp1, tmp2) = {
                // Ideally we would just ask regalloc for two temporary registers.
                // However, adding any early defs to the constraints on StackSwitch
                // causes TooManyLiveRegs. Fortunately, we can manually find tmp
                // registers without regalloc: Since our instruction clobbers all
                // registers, we can simply pick any register that is not assigned
                // to the operands.

                let all = crate::isa::x64::abi::ALL_CLOBBERS;

                let used_regs = [
                    **load_context_ptr,
                    **store_context_ptr,
                    **in_payload0,
                    *out_payload0.to_reg(),
                ];

                let mut tmps = all.into_iter().filter_map(|preg| {
                    let reg: Reg = preg.into();
                    if !used_regs.contains(&reg) {
                        WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
                    } else {
                        None
                    }
                });
                (tmps.next().unwrap(), tmps.next().unwrap())
            };
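
            // The two `unwrap`s above can't fail: the clobber set includes
            // every GPR and at most four of them are excluded as operands, so
            // at least two temporaries always remain.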

            let layout = stack_switch::control_context_layout();
            let rsp_offset = layout.stack_pointer_offset as i32;
            let pc_offset = layout.ip_offset as i32;
            let rbp_offset = layout.frame_pointer_offset as i32;

            // Location to which someone switching back to this stack will
            // jump: right behind the `StackSwitch` instruction.
            let resume = sink.get_label();

            //
            // For RBP and RSP we do the following:
            // - Load new value for register from `load_context_ptr` +
            //   corresponding offset.
            // - Store previous (!) value of register at `store_context_ptr` +
            //   corresponding offset.
            //
            // Since `load_context_ptr` and `store_context_ptr` are allowed to be
            // equal, we need to use a temporary register here.
            //

            let mut exchange = |offset, reg| {
                let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));
                asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);

                asm::inst::movq_mr::new(
                    Amode::imm_reg(offset, **store_context_ptr),
                    Gpr::new(reg).unwrap(),
                )
                .emit(sink, info, state);

                let dst = Writable::from_reg(reg);
                asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())
                    .emit(sink, info, state);
            };

            exchange(rsp_offset, regs::rsp());
            exchange(rbp_offset, regs::rbp());

            //
            // Load target PC, store resume PC, jump to target PC
            //

            let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));
            asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);

            let amode = Amode::RipRelative { target: resume };
            asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);

            asm::inst::movq_mr::new(
                Amode::imm_reg(pc_offset, **store_context_ptr),
                tmp2.to_reg(),
            )
            .emit(sink, info, state);

            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);

            sink.bind_label(resume, state.ctrl_plane_mut());
        }

        Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),

        Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),

        Inst::JmpCond {
            cc,
            taken,
            not_taken,
        } => {
            cond_jmp(sink, *cc, *taken);
            uncond_jmp(sink, *not_taken);
        }
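
        // `cond_jmp`/`uncond_jmp` register their branches with `MachBuffer`,
        // which lets its branch simplification rewrite or drop them; e.g. the
        // `not_taken` jump above disappears entirely when `not_taken` is the
        // fallthrough block.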

        Inst::JmpCondOr {
            cc1,
            cc2,
            taken,
            not_taken,
        } => {
            // Emit:
            //   jcc1 taken
            //   jcc2 taken
            //   jmp not_taken
            //
            // Note that we enroll both conditionals in the
            // branch-chomping mechanism because MachBuffer
            // simplification can continue upward as long as it keeps
            // chomping branches. In the best case, if taken ==
            // not_taken and that one block is the fallthrough block,
            // all three branches can disappear.

            cond_jmp(sink, *cc1, *taken);
            cond_jmp(sink, *cc2, *taken);
            uncond_jmp(sink, *not_taken);
        }

        &Inst::JmpTableSeq {
            idx,
            tmp1,
            tmp2,
            ref targets,
            ref default_target,
            ..
        } => {
            // This sequence is *one* instruction in the vcode, and is expanded only here at
            // emission time, because we cannot allow the regalloc to insert spills/reloads in
            // the middle; we depend on hardcoded PC-rel addressing below.
            //
            // We don't have to worry about emitting islands, because the only label-use type has a
            // maximum range of 2 GB. If we later consider using shorter-range label references,
            // this will need to be revisited.

            // We generate the following sequence. Note that the only read of %idx is before the
            // write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
            // if you change this.
            //   lea start_of_jump_table_offset(%rip), %tmp1
            //   movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
            //   addq %tmp2, %tmp1
            //   j *%tmp1
            // $start_of_jump_table:
            // -- jump table entries

            // Load base address of jump table.
            let start_of_jumptable = sink.get_label();
            asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))
                .emit(sink, info, state);

            // Load value out of the jump table. It's a relative offset to the target block, so it
            // might be negative; use a sign-extension.
            let inst = Inst::movsx_rm_r(
                ExtMode::LQ,
                RegMem::mem(Amode::imm_reg_reg_shift(
                    0,
                    Gpr::unwrap_new(tmp1.to_reg()),
                    Gpr::unwrap_new(idx),
                    2,
                )),
                tmp2,
            );
            inst.emit(sink, info, state);

            // Add base of jump table to jump-table-sourced block offset.
            asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);

            // Branch to computed address.
            asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);

            // Emit jump table (table of 32-bit offsets).
            sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
            let jt_off = sink.cur_offset();
            for &target in targets.iter().chain(core::iter::once(default_target)) {
                let word_off = sink.cur_offset();
                // off_into_table is an addend here embedded in the label to be later patched at
                // the end of codegen. The offset is initially relative to this jump table entry;
                // with the extra addend, it'll be relative to the jump table's start, after
                // patching.
                let off_into_table = word_off - jt_off;
                sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
                sink.put4(off_into_table);
            }
        }

        Inst::TrapIf { cc, trap_code } => {
            let trap_label = sink.defer_trap(*trap_code);
            one_way_jmp(sink, *cc, trap_label);
        }

        Inst::TrapIfAnd {
            cc1,
            cc2,
            trap_code,
        } => {
            let trap_label = sink.defer_trap(*trap_code);
            let else_label = sink.get_label();

            // Jump to the end if the first condition isn't true, and then if
            // the second condition is true go to the trap.
            one_way_jmp(sink, cc1.invert(), else_label);
            one_way_jmp(sink, *cc2, trap_label);

            sink.bind_label(else_label, state.ctrl_plane_mut());
        }

        Inst::TrapIfOr {
            cc1,
            cc2,
            trap_code,
        } => {
            let trap_label = sink.defer_trap(*trap_code);

            // Emit two jumps to the same trap if either condition code is true.
            one_way_jmp(sink, *cc1, trap_label);
            one_way_jmp(sink, *cc2, trap_label);
        }

        Inst::XmmMinMaxSeq {
            size,
            is_min,
            lhs,
            rhs,
            dst,
        } => {
            let rhs = rhs.to_reg();
            let lhs = lhs.to_reg();
            let dst = dst.to_writable_reg();
            debug_assert_eq!(rhs, dst.to_reg());

            // Generates the following sequence:
            // cmpss/cmpsd %lhs, %rhs_dst
            // jnz do_min_max
            // jp propagate_nan
            //
            // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
            // {and,or}{ss,sd} %lhs, %rhs_dst
            // j done
            //
            // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
            // ;; NaN value is returned), we add both inputs.
            // propagate_nan:
            // add{ss,sd} %lhs, %rhs_dst
            // j done
            //
            // do_min_max:
            // {min,max}{ss,sd} %lhs, %rhs_dst
            //
            // done:
            let done = sink.get_label();
            let propagate_nan = sink.get_label();
            let do_min_max = sink.get_label();
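
            // Flag semantics driving the jumps below: `ucomis{s,d}` sets ZF
            // and PF for unordered (NaN) operands, ZF alone for equal
            // operands, and clears ZF when they differ, so NZ dispatches to
            // the real min/max, P to NaN propagation, and the fall-through
            // handles the ordered-equal (±0) case.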

            let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
                OperandSize::Size32 => (
                    asm::inst::addss_a::new(dst, lhs).into(),
                    asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),
                    asm::inst::andps_a::new(dst, lhs).into(),
                    asm::inst::orps_a::new(dst, lhs).into(),
                    if *is_min {
                        asm::inst::minss_a::new(dst, lhs).into()
                    } else {
                        asm::inst::maxss_a::new(dst, lhs).into()
                    },
                ),
                OperandSize::Size64 => (
                    asm::inst::addsd_a::new(dst, lhs).into(),
                    asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),
                    asm::inst::andpd_a::new(dst, lhs).into(),
                    asm::inst::orpd_a::new(dst, lhs).into(),
                    if *is_min {
                        asm::inst::minsd_a::new(dst, lhs).into()
                    } else {
                        asm::inst::maxsd_a::new(dst, lhs).into()
                    },
                ),
                _ => unreachable!(),
            };
            let add_op: AsmInst = add_op;
            let or_op: AsmInst = or_op;
            let min_max_op: AsmInst = min_max_op;
            let cmp_op: AsmInst = cmp_op;

            cmp_op.emit(sink, info, state);

            one_way_jmp(sink, CC::NZ, do_min_max);
            one_way_jmp(sink, CC::P, propagate_nan);

            // Ordered and equal. The operands are bit-identical unless they are zero
            // and negative zero. These instructions merge the sign bits in that
            // case, and are no-ops otherwise.
            let inst: AsmInst = if *is_min { or_op } else { and_op };
            inst.emit(sink, info, state);

            let inst = Inst::jmp_known(done);
            inst.emit(sink, info, state);

            // x86's min/max are not symmetric; if either operand is a NaN, they return the
            // read-only operand: perform an addition between the two operands, which has the
            // desired NaN propagation effects.
            sink.bind_label(propagate_nan, state.ctrl_plane_mut());
            add_op.emit(sink, info, state);

            one_way_jmp(sink, CC::P, done);

            sink.bind_label(do_min_max, state.ctrl_plane_mut());
            min_max_op.emit(sink, info, state);

            sink.bind_label(done, state.ctrl_plane_mut());
        }

        Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {
            // These instruction formats only exist to declare a register as a
            // `def`; no code is emitted. This is always immediately followed by
            // an instruction, such as `xor <tmp>, <tmp>`, that semantically
            // reads this undefined value but arithmetically produces the same
            // result regardless of its value.
        }

        Inst::CvtUint64ToFloatSeq {
            dst_size,
            src,
            dst,
            tmp_gpr1,
            tmp_gpr2,
        } => {
            let src = src.to_reg();
            let dst = dst.to_writable_reg();
            let tmp_gpr1 = tmp_gpr1.to_writable_reg();
            let tmp_gpr2 = tmp_gpr2.to_writable_reg();

            // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
            // different sequence.
            //
            // Emit the following sequence:
            //
            //  cmp 0, %src
            //  jl handle_negative
            //
            //  ;; handle positive, which can't overflow
            //  cvtsi2sd/cvtsi2ss %src, %dst
            //  j done
            //
            //  ;; handle negative: see below for an explanation of what it's doing.
            //  handle_negative:
            //  mov %src, %tmp_gpr1
            //  shr $1, %tmp_gpr1
            //  mov %src, %tmp_gpr2
            //  and $1, %tmp_gpr2
            //  or %tmp_gpr1, %tmp_gpr2
            //  cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
            //  addsd/addss %dst, %dst
            //
            //  done:

            assert_ne!(src, tmp_gpr1.to_reg());
            assert_ne!(src, tmp_gpr2.to_reg());

            let handle_negative = sink.get_label();
            let done = sink.get_label();

            // If x seen as a signed int64 is not negative, a signed-conversion will do the right
            // thing.
            // TODO use tst src, src here.
            asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);

            one_way_jmp(sink, CC::L, handle_negative);

            // Handle a positive int64, which is the "easy" case: a signed conversion will do the
            // right thing.
            emit_signed_cvt(
                sink,
                info,
                state,
                src,
                dst,
                *dst_size == OperandSize::Size64,
            );

            let inst = Inst::jmp_known(done);
            inst.emit(sink, info, state);

            sink.bind_label(handle_negative, state.ctrl_plane_mut());

            // Divide x by two to get it in range for the signed conversion, keep the LSB, and
            // scale it back up on the FP side.
            let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
            inst.emit(sink, info, state);

            // tmp_gpr1 := src >> 1
            asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);

            let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
            inst.emit(sink, info, state);
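
            // `tmp_gpr2` becomes `(src >> 1) | (src & 1)` below: halving
            // brings the value into signed range, and OR-ing the shifted-out
            // bit back in acts as a sticky bit, so the final doubling rounds
            // the same way a direct u64-to-float conversion would.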

            asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);

            asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);

            emit_signed_cvt(
                sink,
                info,
                state,
                tmp_gpr2.to_reg(),
                dst,
                *dst_size == OperandSize::Size64,
            );

            let inst: AsmInst = match *dst_size {
                OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),
                OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),
                _ => unreachable!(),
            };
            inst.emit(sink, info, state);

            sink.bind_label(done, state.ctrl_plane_mut());
        }

        Inst::CvtFloatToSintSeq {
            src_size,
            dst_size,
            is_saturating,
            src,
            dst,
            tmp_gpr,
            tmp_xmm,
        } => {
            use OperandSize::*;

            let src = src.to_reg();
            let dst = dst.to_writable_reg();
            let tmp_gpr = tmp_gpr.to_writable_reg();
            let tmp_xmm = tmp_xmm.to_writable_reg();

            // Emits the following common sequence:
            //
            // cvttss2si/cvttsd2si %src, %dst
            // cmp %dst, 1
            // jno done
            //
            // Then, for saturating conversions:
            //
            // ;; check for NaN
            // cmpss/cmpsd %src, %src
            // jnp not_nan
            // xor %dst, %dst
            //
            // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
            // ;; already in %dst.
            // xorpd %tmp_xmm, %tmp_xmm
            // cmpss/cmpsd %src, %tmp_xmm
            // jnb done
            // mov/movaps $INT_MAX, %dst
            //
            // done:
            //
            // Then, for non-saturating conversions:
            //
            // ;; check for NaN
            // cmpss/cmpsd %src, %src
            // jnp not_nan
            // ud2 trap BadConversionToInteger
            //
            // ;; check if INT_MIN was the correct result, against a magic constant:
            // not_nan:
            // movaps/mov $magic, %tmp_gpr
            // movq/movd %tmp_gpr, %tmp_xmm
            // cmpss/cmpsd %tmp_xmm, %src
            // jnb/jnbe $check_positive
            // ud2 trap IntegerOverflow
            //
            // ;; if positive, it was a real overflow
            // check_positive:
            // xorpd %tmp_xmm, %tmp_xmm
            // cmpss/cmpsd %src, %tmp_xmm
            // jnb done
            // ud2 trap IntegerOverflow
            //
            // done:

            let cmp_op: AsmInst = match src_size {
                Size64 => asm::inst::ucomisd_a::new(src, src).into(),
                Size32 => asm::inst::ucomiss_a::new(src, src).into(),
                _ => unreachable!(),
            };

            let cvtt_op = |dst, src| Inst::External {
                inst: match (*src_size, *dst_size) {
                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
                    _ => unreachable!(),
                },
            };

            let done = sink.get_label();

            // The truncation.
            cvtt_op(dst, src).emit(sink, info, state);

            // Compare against 1, in case of overflow the dst operand was INT_MIN.
            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
            inst.emit(sink, info, state);

            one_way_jmp(sink, CC::NO, done); // no overflow => done

            // Check for NaN.
            cmp_op.emit(sink, info, state);

            if *is_saturating {
                let not_nan = sink.get_label();
                one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN

                // For NaN, emit 0.
                let inst: AsmInst = match *dst_size {
                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
                    _ => unreachable!(),
                };
                inst.emit(sink, info, state);

                let inst = Inst::jmp_known(done);
                inst.emit(sink, info, state);

                sink.bind_label(not_nan, state.ctrl_plane_mut());
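
                // At this point `dst` still holds INT_MIN, the "integer
                // indefinite" value that cvtt* produces on overflow; that is
                // already the saturated result for negative inputs, so only
                // positive overflow needs fixing up below.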

                // If the input was positive, saturate to INT_MAX.

                // Zero out tmp_xmm.
                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);

                let inst: AsmInst = match src_size {
                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
                    _ => unreachable!(),
                };
                inst.emit(sink, info, state);

                // Jump if >= to done.
                one_way_jmp(sink, CC::NB, done);

                // Otherwise, put INT_MAX.
                if *dst_size == OperandSize::Size64 {
                    let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
                    inst.emit(sink, info, state);
                } else {
                    let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
                    inst.emit(sink, info, state);
                }
            } else {
                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
                inst.emit(sink, info, state);

                // Check if INT_MIN was the correct result: determine the smallest floating point
                // number that would convert to INT_MIN, put it in a temporary register, and compare
                // against the src register.
                // If the src register is less than (or in some cases, less than or equal to) the
                // threshold, trap!

                let mut no_overflow_cc = CC::NB; // >=
                let output_bits = dst_size.to_bits();
                match *src_size {
                    OperandSize::Size32 => {
                        let cst = (-Ieee32::pow2(output_bits - 1)).bits();
                        let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
                        inst.emit(sink, info, state);
                    }
                    OperandSize::Size64 => {
                        // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
                        // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
                        let cst = if output_bits < 64 {
                            no_overflow_cc = CC::NBE; // >
                            Ieee64::fcvt_to_sint_negative_overflow(output_bits)
                        } else {
                            -Ieee64::pow2(output_bits - 1)
                        };
                        let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
                        inst.emit(sink, info, state);
                    }
                    _ => unreachable!(),
                }

                let inst: AsmInst = {
                    let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
                    match src_size {
                        Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
                        Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
                        _ => unreachable!(),
                    }
                };
                inst.emit(sink, info, state);

                let inst: AsmInst = match src_size {
                    Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
                    Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
                    _ => unreachable!(),
                };
                inst.emit(sink, info, state);

                // no trap if src >= or > threshold
                let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
                inst.emit(sink, info, state);

                // If positive, it was a real overflow.

                // Zero out the tmp_xmm register.
                asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);

                let inst: AsmInst = match src_size {
                    Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),
                    Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),
                    _ => unreachable!(),
                };
                inst.emit(sink, info, state);

                // no trap if 0 >= src
                let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
                inst.emit(sink, info, state);
            }

            sink.bind_label(done, state.ctrl_plane_mut());
        }

        Inst::CvtFloatToUintSeq {
            src_size,
            dst_size,
            is_saturating,
            src,
            dst,
            tmp_gpr,
            tmp_xmm,
            tmp_xmm2,
        } => {
            use OperandSize::*;

            let src = src.to_reg();
            let dst = dst.to_writable_reg();
            let tmp_gpr = tmp_gpr.to_writable_reg();
            let tmp_xmm = tmp_xmm.to_writable_reg();
            let tmp_xmm2 = tmp_xmm2.to_writable_reg();
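
            // Strategy: x86 only provides signed truncation, so inputs below
            // 2**(width - 1) are converted directly, while larger inputs
            // first have 2**(width - 1) subtracted in the float domain and
            // then get that bit added back as an integer afterwards.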

            // The only difference in behavior between saturating and non-saturating is how we
            // handle errors. Emits the following sequence:
            //
            // movaps/mov 2**(int_width - 1), %tmp_gpr
            // movq/movd %tmp_gpr, %tmp_xmm
            // cmpss/cmpsd %tmp_xmm, %src
            // jnb is_large
            //
            // ;; check for NaN inputs
            // jnp not_nan
            // -- non-saturating: ud2 trap BadConversionToInteger
            // -- saturating: xor %dst, %dst; j done
            //
            // not_nan:
            // cvttss2si/cvttsd2si %src, %dst
            // cmp 0, %dst
            // jnl done
            // -- non-saturating: ud2 trap IntegerOverflow
            // -- saturating: xor %dst, %dst; j done
            //
            // is_large:
            // mov %src, %tmp_xmm2
            // subss/subsd %tmp_xmm, %tmp_xmm2
            // cvttss2si/cvttsd2si %tmp_xmm2, %dst
            // cmp 0, %dst
            // jnl next_is_large
            // -- non-saturating: ud2 trap IntegerOverflow
            // -- saturating: movaps $UINT_MAX, %dst; j done
            //
            // next_is_large:
            // add 2**(int_width - 1), %dst ;; 2 instructions for 64-bit integers
            //
            // done:

            assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");

            let xor_op = |dst, src| Inst::External {
                inst: match *dst_size {
                    Size32 => asm::inst::xorl_rm::new(dst, src).into(),
                    Size64 => asm::inst::xorq_rm::new(dst, src).into(),
                    _ => unreachable!(),
                },
            };

            let subs_op = |dst, src| Inst::External {
                inst: match *src_size {
                    Size32 => asm::inst::subss_a::new(dst, src).into(),
                    Size64 => asm::inst::subsd_a::new(dst, src).into(),
                    _ => unreachable!(),
                },
            };

            let cvtt_op = |dst, src| Inst::External {
                inst: match (*src_size, *dst_size) {
                    (Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),
                    (Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),
                    (Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),
                    (Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),
                    _ => unreachable!(),
                },
            };

            let done = sink.get_label();

            let cst = match src_size {
                OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
                OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
                _ => unreachable!(),
            };

            let inst = Inst::imm(*src_size, cst, tmp_gpr);
            inst.emit(sink, info, state);

            let inst: AsmInst = {
                let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());
                match src_size {
                    Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),
                    Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),
                    _ => unreachable!(),
                }
            };
            inst.emit(sink, info, state);

            let inst: AsmInst = match src_size {
                Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),
                Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),
                _ => unreachable!(),
            };
            inst.emit(sink, info, state);

            let handle_large = sink.get_label();
            one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold

            if *is_saturating {
                // If not NaN, jump over this 0-return; otherwise return 0.
                let not_nan = sink.get_label();
                one_way_jmp(sink, CC::NP, not_nan);

                xor_op(dst, dst).emit(sink, info, state);

                let inst = Inst::jmp_known(done);
                inst.emit(sink, info, state);
                sink.bind_label(not_nan, state.ctrl_plane_mut());
            } else {
                // Trap.
                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
                inst.emit(sink, info, state);
            }
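
            // Both NaN checks above reuse the flags from the `ucomis*`
            // against the threshold: an unordered comparison sets PF, so no
            // separate self-comparison of `src` is needed.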

            // Actual truncation for small inputs: if the result is not positive, then we had an
            // overflow.

            cvtt_op(dst, src).emit(sink, info, state);

            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
            inst.emit(sink, info, state);

            one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done

            if *is_saturating {
                // The input was "small" (< 2**(width - 1)), so the only way to get an integer
                // overflow is because the input was too small: saturate to the min value, i.e. 0.
                let inst: AsmInst = match *dst_size {
                    OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),
                    OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),
                    _ => unreachable!(),
                };
                inst.emit(sink, info, state);

                let inst = Inst::jmp_known(done);
                inst.emit(sink, info, state);
            } else {
                // Trap.
                asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);
            }

            // Now handle large inputs.

            sink.bind_label(handle_large, state.ctrl_plane_mut());

            let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
            inst.emit(sink, info, state);

            subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);

            cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);

            let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
            inst.emit(sink, info, state);

            if *is_saturating {
                let next_is_large = sink.get_label();
                one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large

                // The input was "large" (>= 2**(width - 1)), so the only way to get an integer
                // overflow is because the input was too large: saturate to the max value.
                let inst = Inst::imm(
                    OperandSize::Size64,
                    if *dst_size == OperandSize::Size64 {
                        u64::max_value()
                    } else {
                        u32::max_value() as u64
                    },
                    dst,
                );
                inst.emit(sink, info, state);

                let inst = Inst::jmp_known(done);
                inst.emit(sink, info, state);
                sink.bind_label(next_is_large, state.ctrl_plane_mut());
            } else {
                let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
                inst.emit(sink, info, state);
            }

            if *dst_size == OperandSize::Size64 {
                let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
                inst.emit(sink, info, state);

                asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);
            } else {
                asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);
            }

            sink.bind_label(done, state.ctrl_plane_mut());
        }

        Inst::LoadExtName {
            dst,
            name,
            offset,
            distance,
        } => {
            let name = &**name;
            let riprel = asm::Amode::RipRelative {
                target: asm::DeferredTarget::None,
            };
            if info.flags.is_pic() {
                // Generates: movq symbol@GOTPCREL(%rip), %dst
                asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);
                let cur = sink.cur_offset();
                sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);

                // The offset in the relocation above applies to the address of the
                // *GOT entry*, not the loaded address; so we emit a separate
                // add instruction if needed.
                let offset = i32::try_from(*offset).unwrap();
                if offset != 0 {
                    asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)
                        .emit(sink, info, state);
                }
            } else if distance == &RelocDistance::Near {
                // If we know the distance to the name is within 2GB (e.g., a
                // module-local function), we can generate a RIP-relative
                // address, with a relocation.
                asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);
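
                // As with calls, the `-4` in the addend below compensates for
                // RIP-relative displacements being resolved relative to the
                // end of the 4-byte field; the symbol's `offset` is folded
                // into the same relocation.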
                let cur = sink.cur_offset();
                sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);
            } else {
                // The full address can be encoded in the register, with a
                // relocation.
                asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);
                let cur = sink.cur_offset();
                sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);
            }
        }

        Inst::AtomicRmwSeq {
            ty,
            op,
            mem,
            operand,
            temp,
            dst_old,
        } => {
            let operand = *operand;
            let temp = *temp;
            let temp_r = temp.map(|r| *r);
            let dst_old = *dst_old;
            let dst_old_r = dst_old.map(|r| *r);
            debug_assert_eq!(dst_old.to_reg(), regs::rax());
            let mem = mem.finalize(state.frame_layout(), sink).clone();

            // Emit this:
            //   mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
            // again:
            //   movq %rax, %r_temp // rax = old value, r_temp = old value
            //   `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
            //   lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
            //   jnz again // If this is taken, rax will have a "revised" old value
            //
            // Operand conventions: IN: %r_address, %r_operand OUT: %rax (old
            // value), %r_temp (trashed), %rflags (trashed)
            let again_label = sink.get_label();

            // mov{zbq,zwq,zlq,q} (%r_address), %rax
            // No need to call `add_trap` here, since the `i1` emit will do that.
            let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);
            i1.emit(sink, info, state);

            // again:
            sink.bind_label(again_label, state.ctrl_plane_mut());

            // movq %rax, %r_temp
            asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);

            use AtomicRmwSeqOp as RmwOp;
            match op {
                RmwOp::Nand => {
                    // andq %r_operand, %r_temp
                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);

                    // notq %r_temp
                    asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);
                }
                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
                    // cmp %r_temp, %r_operand
                    let temp = temp.to_reg();
                    match *ty {
                        types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),
                        types::I16 => {
                            asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)
                        }
                        types::I32 => {
                            asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)
                        }
                        types::I64 => {
                            asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)
                        }
                        _ => unreachable!(),
                    }

                    // cmovcc %r_operand, %r_temp
                    match op {
                        RmwOp::Umin => {
                            asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)
                        }
                        RmwOp::Umax => {
                            asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)
                        }
                        RmwOp::Smin => {
                            asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)
                        }
                        RmwOp::Smax => {
                            asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)
                        }
                        _ => unreachable!(),
                    }
                }
                RmwOp::And => {
                    // andq %r_operand, %r_temp
                    asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);
                }
                RmwOp::Or => {
                    // orq %r_operand, %r_temp
                    asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);
                }
                RmwOp::Xor => {
                    // xorq %r_operand, %r_temp
                    asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);
                }
            }

            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
            // No need to call `add_trap` here, since the `i4` emit will do that.
            let temp = temp.to_reg();
            let dst_old = PairedGpr::from(dst_old);
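
            // `lock cmpxchg` compares %rax against the memory operand and, on
            // failure, reloads %rax with the value currently in memory, so
            // each retry of the loop starts from a fresh old value; that
            // read/write pairing is why `dst_old` is a `PairedGpr`.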
            let inst: AsmInst = match *ty {
                types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),
                types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),
                types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),
                types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),
                _ => unreachable!(),
            };
            inst.emit(sink, info, state);

            // jnz again
            one_way_jmp(sink, CC::NZ, again_label);
        }

        Inst::Atomic128RmwSeq {
            op,
            mem,
            operand_low,
            operand_high,
            temp_low,
            temp_high,
            dst_old_low,
            dst_old_high,
        } => {
            let operand_low = *operand_low;
            let operand_high = *operand_high;
            let temp_low = *temp_low;
            let temp_high = *temp_high;
            let dst_old_low = *dst_old_low;
            let dst_old_high = *dst_old_high;
            debug_assert_eq!(temp_low.to_reg(), regs::rbx());
            debug_assert_eq!(temp_high.to_reg(), regs::rcx());
            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
            let mem = mem.finalize(state.frame_layout(), sink).clone();

            let again_label = sink.get_label();

            // Load the initial value.
            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);

            // again:
            sink.bind_label(again_label, state.ctrl_plane_mut());

            // Move old value to temp registers.
            asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);
            asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);

            // Perform the operation.
            use Atomic128RmwSeqOp as RmwOp;
            match op {
                RmwOp::Nand => {
                    // temp &= operand
                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);

                    // temp = !temp
                    asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);
                    asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);
                }
                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
                    // Do a comparison with LHS temp and RHS operand.
                    // Note the opposite argument orders.
                    asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);
                    // This will clobber `temp_high`
                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
                    // Restore the clobbered value
                    asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())
                        .emit(sink, info, state);
                    match op {
                        RmwOp::Umin => {
                            asm::inst::cmovaeq_rm::new(temp_low, operand_low)
                                .emit(sink, info, state);
                            asm::inst::cmovaeq_rm::new(temp_high, operand_high)
                                .emit(sink, info, state);
                        }
                        RmwOp::Umax => {
                            asm::inst::cmovbq_rm::new(temp_low, operand_low)
                                .emit(sink, info, state);
                            asm::inst::cmovbq_rm::new(temp_high, operand_high)
                                .emit(sink, info, state);
                        }
                        RmwOp::Smin => {
                            asm::inst::cmovgeq_rm::new(temp_low, operand_low)
                                .emit(sink, info, state);
                            asm::inst::cmovgeq_rm::new(temp_high, operand_high)
                                .emit(sink, info, state);
                        }
                        RmwOp::Smax => {
                            asm::inst::cmovlq_rm::new(temp_low, operand_low)
                                .emit(sink, info, state);
                            asm::inst::cmovlq_rm::new(temp_high, operand_high)
                                .emit(sink, info, state);
                        }
                        _ => unreachable!(),
                    }
                }
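
                // For the remaining ops, `add`/`adc` and `sub`/`sbb` chain
                // the carry or borrow from the low limb into the high limb to
                // form full 128-bit arithmetic.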
                RmwOp::Add => {
                    asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);
                }
                RmwOp::Sub => {
                    asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);
                }
                RmwOp::And => {
                    asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);
                }
                RmwOp::Or => {
                    asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);
                }
                RmwOp::Xor => {
                    asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);
                    asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);
                }
            }

            // cmpxchg16b (mem)
            asm::inst::lock_cmpxchg16b_m::new(
                PairedGpr::from(dst_old_low),
                PairedGpr::from(dst_old_high),
                temp_low.to_reg(),
                temp_high.to_reg(),
                mem,
            )
            .emit(sink, info, state);

            // jnz again
            one_way_jmp(sink, CC::NZ, again_label);
        }

        Inst::Atomic128XchgSeq {
            mem,
            operand_low,
            operand_high,
            dst_old_low,
            dst_old_high,
        } => {
            let operand_low = *operand_low;
            let operand_high = *operand_high;
            let dst_old_low = *dst_old_low;
            let dst_old_high = *dst_old_high;
            debug_assert_eq!(operand_low, regs::rbx());
            debug_assert_eq!(operand_high, regs::rcx());
            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
            let mem = mem.finalize(state.frame_layout(), sink).clone();

            let again_label = sink.get_label();

            // Load the initial value.
            asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);
            asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);

            // again:
            sink.bind_label(again_label, state.ctrl_plane_mut());

            // cmpxchg16b (mem)
            asm::inst::lock_cmpxchg16b_m::new(
                PairedGpr::from(dst_old_low),
                PairedGpr::from(dst_old_high),
                operand_low,
                operand_high,
                mem,
            )
            .emit(sink, info, state);

            // jnz again
            one_way_jmp(sink, CC::NZ, again_label);
        }

        Inst::ElfTlsGetAddr { symbol, dst } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());

            // N.B.: Must be exactly this byte sequence; the linker requires it,
            // because it must know how to rewrite the bytes.

            // data16 lea gv@tlsgd(%rip),%rdi
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0x8d); // LEA
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
            sink.put4(0); // offset

            // data16 data16 callq __tls_get_addr-4
            sink.put1(0x66); // data16
            sink.put1(0x66); // data16
            sink.put1(0b01001000); // REX.W
            sink.put1(0xe8); // CALL
            emit_reloc(
                sink,
                Reloc::X86CallPLTRel4,
                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
                -4,
            );
            sink.put4(0); // offset
        }

        Inst::MachOTlsGetAddr { symbol, dst } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());

            // movq gv@tlv(%rip), %rdi
            sink.put1(0x48); // REX.w
            sink.put1(0x8b); // MOV
            sink.put1(0x3d); // ModRM byte
            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
            sink.put4(0); // offset

            asm::inst::callq_m::new(asm::Amode::ImmReg {
                base: Gpr::RDI,
                simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,
                trap: None,
            })
            .emit(sink, info, state);
        }

        Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
            let dst = dst.to_reg().to_reg();
            debug_assert_eq!(dst, regs::rax());
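
            // The destination is pinned to %rax (and the temporary to %rcx
            // below) because the hand-assembled byte sequence that follows
            // hard-codes those registers.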

            // tmp is used below directly as %rcx
            let tmp = tmp.to_reg().to_reg();
            debug_assert_eq!(tmp, regs::rcx());

            // See: https://gcc.godbolt.org/z/M8or9x6ss
            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282

            // Emit the following sequence
            // movl (%rip), %eax ; IMAGE_REL_AMD64_REL32 _tls_index
            // movq %gs:88, %rcx
            // movq (%rcx,%rax,8), %rax
            // leaq (%rax), %rax ; Reloc: IMAGE_REL_AMD64_SECREL symbol

            // Load TLS index for current thread
            // movl (%rip), %eax
            sink.put1(0x8b); // mov
            sink.put1(0x05);
            emit_reloc(
                sink,
                Reloc::X86PCRel4,
                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
                -4,
            );
            sink.put4(0); // offset

            // movq %gs:88, %rcx
            // Load the TLS Storage Array pointer
            // The gs segment register refers to the base address of the TEB on x64.
            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
            sink.put_data(&[
                0x65, 0x48, // REX.W
                0x8b, // MOV
                0x0c, 0x25, 0x58, // 0x58 - ThreadLocalStoragePointer offset
                0x00, 0x00, 0x00,
            ]);

            // movq (%rcx,%rax,8), %rax
            // Load the actual TLS entry for this thread.
            // Computes ThreadLocalStoragePointer + _tls_index*8
            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);

            // leaq (%rax), %rax
            sink.put1(0x48);
            sink.put1(0x8d);
            sink.put1(0x80);
            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
            sink.put4(0); // offset
        }

        Inst::Unwind { inst } => {
            sink.add_unwind(inst.clone());
        }

        Inst::DummyUse { .. } => {
            // Nothing.
        }

        Inst::LabelAddress { dst, label } => {
            // Emit an LEA with a LabelUse given this label.
            asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);
        }

        Inst::SequencePoint { .. } => {
            // Nothing.
        }

        Inst::External { inst } => {
            let frame = state.frame_layout();
            emit_maybe_shrink(
                inst,
                &mut external::AsmCodeSink {
                    sink,

                    // These values are transcribed from what is happening in
                    // `SyntheticAmode::finalize`. This, plus the `Into` logic
        Inst::External { inst } => {
            let frame = state.frame_layout();
            emit_maybe_shrink(
                inst,
                &mut external::AsmCodeSink {
                    sink,

                    // These values are transcribed from what is happening in
                    // `SyntheticAmode::finalize`. This, plus the `Into` logic
                    // converting a `SyntheticAmode` to its external counterpart,
                    // is necessary to communicate Cranelift's internal offsets
                    // to the assembler; Cranelift only determines these offsets
                    // late in compilation, so this happens here during emission.
                    incoming_arg_offset: i32::try_from(
                        frame.tail_args_size + frame.setup_area_size,
                    )
                    .unwrap(),
                    slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),
                },
            );
        }
    }

    state.clear_post_insn();
}

/// Emit the common sequence used for both direct and indirect tail calls:
///
/// * Copy the new frame's stack arguments over the top of our current frame.
///
/// * Restore the old frame pointer.
///
/// * Initialize the tail callee's stack pointer (simultaneously deallocating
///   the temporary stack space we allocated when creating the new frame's
///   stack arguments).
///
/// * Move the return address into its stack slot.
fn emit_return_call_common_sequence<T>(
    sink: &mut MachBuffer<Inst>,
    info: &EmitInfo,
    state: &mut EmitState,
    call_info: &ReturnCallInfo<T>,
) {
    assert!(
        info.flags.preserve_frame_pointers(),
        "frame pointers aren't fundamentally required for tail calls, \
         but the current implementation relies on them being present"
    );

    let tmp = call_info.tmp.to_writable_reg();

    for inst in
        X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
    {
        inst.emit(sink, info, state);
    }

    for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
        CallConv::Tail,
        &info.flags,
        &info.isa_flags,
        state.frame_layout(),
    ) {
        inst.emit(sink, info, state);
    }

    let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
    if incoming_args_diff > 0 {
        // Move the saved return address up by `incoming_args_diff`. For
        // example, if this frame reserved 32 bytes of tail-argument space but
        // the callee only needs 16, the return address slides up 16 bytes and
        // `rsp` is incremented by 16 below.
        let addr = Amode::imm_reg(0, regs::rsp());
        asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);
        asm::inst::movq_mr::new(
            Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
            Gpr::unwrap_new(tmp.to_reg()),
        )
        .emit(sink, info, state);

        // Increment the stack pointer to shrink the argument area for the new
        // call.
        let rsp = Writable::from_reg(regs::rsp());
        let incoming_args_diff = i32::try_from(incoming_args_diff)
            .expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");
        Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);
    }
}

/// Convenience trait providing an `emit` method on all `asm::inst::*` variants.
trait ExternalEmit {
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);
}

impl<I> ExternalEmit for I
where
    I: Into<asm::inst::Inst<CraneliftRegisters>>,
{
    fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {
        Inst::External { inst: self.into() }.emit(sink, info, state)
    }
}

/// Attempt to "shrink" the provided `inst`.
///
/// This function inspects `inst` and, where possible, emits a semantically
/// equivalent instruction with a smaller binary encoding. This is only done
/// for opportunities which require register allocation to have already
/// happened; for example, shrinking immediates should be done during
/// instruction selection, not at this point.
///
/// An example of this optimization is the `AND` instruction. The Intel manual
/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8,
/// imm8`. Instructions are matched against here, and if regalloc state
/// indicates that a smaller variant is available, that variant is emitted
/// instead.
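///
/// As a concrete illustration of the size win (the operand values here are
/// hypothetical, chosen only to show the encodings):
///
/// ```text
/// and $0x12345678, %eax    ; 25 78 56 34 12      (5 bytes, EAX short form)
/// and $0x12345678, %ecx    ; 81 e1 78 56 34 12   (6 bytes, generic r/m form)
/// ```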
fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {
    use cranelift_assembler_x64::GprMem;
    use cranelift_assembler_x64::inst::*;

    type R = CraneliftRegisters;
    const RAX: PairedGpr = PairedGpr {
        read: Gpr::RAX,
        write: Writable::from_reg(Gpr::RAX),
    };
    const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);

    match *inst {
        // and
        Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::andw_mi(andw_mi {
            rm16: RAX_RM,
            imm16,
        }) => andw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::andl_mi(andl_mi {
            rm32: RAX_RM,
            imm32,
        }) => andl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::andq_mi_sxl(andq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // or
        Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::orw_mi(orw_mi {
            rm16: RAX_RM,
            imm16,
        }) => orw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::orl_mi(orl_mi {
            rm32: RAX_RM,
            imm32,
        }) => orl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::orq_mi_sxl(orq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // xor
        Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::xorw_mi(xorw_mi {
            rm16: RAX_RM,
            imm16,
        }) => xorw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::xorl_mi(xorl_mi {
            rm32: RAX_RM,
            imm32,
        }) => xorl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::xorq_mi_sxl(xorq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // add
        Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::addw_mi(addw_mi {
            rm16: RAX_RM,
            imm16,
        }) => addw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::addl_mi(addl_mi {
            rm32: RAX_RM,
            imm32,
        }) => addl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::addq_mi_sxl(addq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // adc
        Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::adcw_mi(adcw_mi {
            rm16: RAX_RM,
            imm16,
        }) => adcw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::adcl_mi(adcl_mi {
            rm32: RAX_RM,
            imm32,
        }) => adcl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::adcq_mi_sxl(adcq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // sub
        Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::subw_mi(subw_mi {
            rm16: RAX_RM,
            imm16,
        }) => subw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::subl_mi(subl_mi {
            rm32: RAX_RM,
            imm32,
        }) => subl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::subq_mi_sxl(subq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),

        // sbb
        Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),
        Inst::sbbw_mi(sbbw_mi {
            rm16: RAX_RM,
            imm16,
        }) => sbbw_i::<R>::new(RAX, imm16).encode(sink),
        Inst::sbbl_mi(sbbl_mi {
            rm32: RAX_RM,
            imm32,
        }) => sbbl_i::<R>::new(RAX, imm32).encode(sink),
        Inst::sbbq_mi_sxl(sbbq_mi_sxl {
            rm64: RAX_RM,
            imm32,
        }) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),
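
        // Unlike the read-modify-write instructions above, `cmp` and `test`
        // only read their register operand, so the shrunk forms below take a
        // plain `Gpr` rather than a read/write `PairedGpr`.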

        // cmp
        Inst::cmpb_mi(cmpb_mi {
            rm8: GprMem::Gpr(Gpr::RAX),
            imm8,
        }) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
        Inst::cmpw_mi(cmpw_mi {
            rm16: GprMem::Gpr(Gpr::RAX),
            imm16,
        }) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
        Inst::cmpl_mi(cmpl_mi {
            rm32: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
        Inst::cmpq_mi(cmpq_mi {
            rm64: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

        // test
        Inst::testb_mi(testb_mi {
            rm8: GprMem::Gpr(Gpr::RAX),
            imm8,
        }) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),
        Inst::testw_mi(testw_mi {
            rm16: GprMem::Gpr(Gpr::RAX),
            imm16,
        }) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),
        Inst::testl_mi(testl_mi {
            rm32: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),
        Inst::testq_mi(testq_mi {
            rm64: GprMem::Gpr(Gpr::RAX),
            imm32,
        }) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),

        // lea
        Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(
            r32,
            m32,
            sink,
            |dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),
            |dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),
            |dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),
        ),
        Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(
            r64,
            m64,
            sink,
            |dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),
            |dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),
            |dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),
        ),

        // All other instructions cannot be shrunk; emit them as usual.
        _ => inst.encode(sink),
    }
}

/// If `lea` can actually be encoded as an `add`, then do that instead.
///
/// Currently all candidate `iadd`s become an `lea` pseudo-instruction, but
/// maximizing the use of `lea` is not necessarily optimal. The `lea`
/// instruction goes through dedicated address-generation units on many cores,
/// which are finite and disjoint from the general ALUs, so if everything uses
/// `lea` those units can saturate while the ALUs sit idle.
///
/// To help make use of more parts of a CPU, this attempts to use `add` when
/// it's semantically equivalent to `lea`, namely when the `dst` register is
/// the same as the `base` or `index` register.
///
/// FIXME: ideally regalloc would be informed of this constraint. Register
/// allocation of `lea` should "attempt" to put the `base` in the same register
/// as `dst`, but not at the expense of generating a `mov` instruction.
/// Currently that's not possible, but perhaps one day it will be worth it.
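///
/// For illustration, the rewrites performed below are (operands hypothetical):
///
/// ```text
/// lea 0x10(%rax), %rax     =>  add $0x10, %rax
/// lea (%rax,%rbx,1), %rax  =>  add %rbx, %rax
/// lea (%rax,%rbx,1), %rbx  =>  add %rax, %rbx
/// ```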
fn emit_lea<S>(
    dst: asm::Gpr<WritableGpr>,
    addr: asm::Amode<Gpr>,
    sink: &mut S,
    lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),
    add_mi: fn(PairedGpr, i32, &mut S),
    add_rm: fn(PairedGpr, Gpr, &mut S),
) where
    S: asm::CodeSink,
{
    match addr {
        // If `base == dst` then this is `add dst, $imm`, so encode that
        // instead.
        asm::Amode::ImmReg {
            base,
            simm32:
                asm::AmodeOffsetPlusKnownOffset {
                    simm32,
                    offset: None,
                },
            trap: None,
        } if dst.as_ref().to_reg() == base => add_mi(
            PairedGpr {
                read: base,
                write: *dst.as_ref(),
            },
            simm32.value(),
            sink,
        ),

        // If the offset is 0 and the shift is a scale of 1, then:
        //
        // * If `base == dst`, then this is `add dst, index`
        // * If `index == dst`, then this is `add dst, base`
        asm::Amode::ImmRegRegShift {
            base,
            index,
            scale: asm::Scale::One,
            simm32: asm::AmodeOffset::ZERO,
            trap: None,
        } => {
            if dst.as_ref().to_reg() == base {
                add_rm(
                    PairedGpr {
                        read: base,
                        write: *dst.as_ref(),
                    },
                    *index.as_ref(),
                    sink,
                )
            } else if dst.as_ref().to_reg() == *index.as_ref() {
                add_rm(
                    PairedGpr {
                        read: *index.as_ref(),
                        write: *dst.as_ref(),
                    },
                    base,
                    sink,
                )
            } else {
                lea(*dst.as_ref(), addr, sink)
            }
        }

        _ => lea(*dst.as_ref(), addr, sink),
    }
}