Path: blob/main/cranelift/codegen/src/isa/x64/inst/emit.rs
1693 views
use crate::ir::KnownSymbol;1use crate::ir::immediates::{Ieee32, Ieee64};2use crate::isa::x64::external::{AsmInst, CraneliftRegisters, PairedGpr};3use crate::isa::x64::inst::args::*;4use crate::isa::x64::inst::*;5use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};6use cranelift_assembler_x64 as asm;78/// A small helper to generate a signed conversion instruction.9fn emit_signed_cvt(10sink: &mut MachBuffer<Inst>,11info: &EmitInfo,12state: &mut EmitState,13src: Reg,14dst: Writable<Reg>,15to_f64: bool,16) {17assert!(src.is_real());18assert!(dst.to_reg().is_real());1920// Handle an unsigned int, which is the "easy" case: a signed conversion21// will do the right thing.22let dst = WritableXmm::from_writable_reg(dst).unwrap();23if to_f64 {24asm::inst::cvtsi2sdq_a::new(dst, src).emit(sink, info, state);25} else {26asm::inst::cvtsi2ssq_a::new(dst, src).emit(sink, info, state);27}28}2930/// Emits a one way conditional jump if CC is set (true).31fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {32let cond_start = sink.cur_offset();33let cond_disp_off = cond_start + 2;34sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);35emit_jcc_no_offset(sink, cc);36debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);37}3839/// Like `one_way_jmp` above emitting a conditional jump, but also using40/// `MachBuffer::add_cond_branch`.41fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {42let cond_start = sink.cur_offset();43let cond_disp_off = cond_start + 2;44let cond_end = cond_start + 6;4546sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);47// FIXME: ideally this `inverted` calculation would go through the external48// assembler, but for now it's left done manually.49let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];50sink.add_cond_branch(cond_start, cond_end, label, &inverted[..]);5152emit_jcc_no_offset(sink, cc);5354debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);55debug_assert_eq!(sink.cur_offset(), cond_end);56}5758fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {59// Note that the disassembler matches Capstone which doesn't match the `CC`60// enum directly as Intel has multiple mnemonics use the same encoding.61let inst: AsmInst = match cc {62CC::Z => asm::inst::je_d32::new(0).into(), // jz == je63CC::NZ => asm::inst::jne_d32::new(0).into(), // jnz == jne64CC::B => asm::inst::jb_d32::new(0).into(),65CC::NB => asm::inst::jae_d32::new(0).into(), // jnb == jae66CC::BE => asm::inst::jbe_d32::new(0).into(),67CC::NBE => asm::inst::ja_d32::new(0).into(), // jnbe == ja68CC::L => asm::inst::jl_d32::new(0).into(),69CC::LE => asm::inst::jle_d32::new(0).into(),70CC::NL => asm::inst::jge_d32::new(0).into(), // jnl == jge71CC::NLE => asm::inst::jg_d32::new(0).into(), // jnle == jg72CC::O => asm::inst::jo_d32::new(0).into(),73CC::NO => asm::inst::jno_d32::new(0).into(),74CC::P => asm::inst::jp_d32::new(0).into(),75CC::NP => asm::inst::jnp_d32::new(0).into(),76CC::S => asm::inst::js_d32::new(0).into(),77CC::NS => asm::inst::jns_d32::new(0).into(),78};79inst.encode(&mut external::AsmCodeSink {80sink,81incoming_arg_offset: 0,82slot_offset: 0,83});84}8586/// Emits an unconditional branch.87fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {88let uncond_start = sink.cur_offset();89let uncond_disp_off = uncond_start + 1;90let uncond_end = uncond_start + 5;9192sink.use_label_at_offset(uncond_disp_off, label, LabelUse::JmpRel32);93sink.add_uncond_branch(uncond_start, uncond_end, label);9495asm::inst::jmp_d32::new(0).encode(&mut external::AsmCodeSink {96sink,97incoming_arg_offset: 0,98slot_offset: 0,99});100debug_assert_eq!(sink.cur_offset(), uncond_disp_off + 4);101debug_assert_eq!(sink.cur_offset(), uncond_end);102}103104/// Emits a relocation, attaching the current source location as well.105fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {106sink.add_reloc(kind, name, addend);107}108109/// The top-level emit function.110///111/// Important! Do not add improved (shortened) encoding cases to existing112/// instructions without also adding tests for those improved encodings. That113/// is a dangerous game that leads to hard-to-track-down errors in the emitted114/// code.115///116/// For all instructions, make sure to have test coverage for all of the117/// following situations. Do this by creating the cross product resulting from118/// applying the following rules to each operand:119///120/// (1) for any insn that mentions a register: one test using a register from121/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one122/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].123/// This helps detect incorrect REX prefix construction.124///125/// (2) for any insn that mentions a byte register: one test for each of the126/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],127/// [r8b .. r11b] and [r12b .. r15b]. This checks that128/// apparently-redundant REX prefixes are retained when required.129///130/// (3) for any insn that contains an immediate field, check the following131/// cases: field is zero, field is in simm8 range (-128 .. 127), field is132/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some133/// instructions that require a 32-bit immediate have a short-form encoding134/// when the imm is in simm8 range.135///136/// Rules (1), (2) and (3) don't apply for registers within address expressions137/// (`Addr`s). Those are already pretty well tested, and the registers in them138/// don't have any effect on the containing instruction (apart from possibly139/// require REX prefix bits).140///141/// When choosing registers for a test, avoid using registers with the same142/// offset within a given group. For example, don't use rax and r8, since they143/// both have the lowest 3 bits as 000, and so the test won't detect errors144/// where those 3-bit register sub-fields are confused by the emitter. Instead145/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl146/// and bpl since they have the same offset in their group; use instead (eg) cl147/// and sil.148///149/// For all instructions, also add a test that uses only low-half registers150/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX151/// prefixes are correctly omitted. This low-half restriction must apply to152/// _all_ registers in the insn, even those in address expressions.153///154/// Following these rules creates large numbers of test cases, but it's the155/// only way to make the emitter reliable.156///157/// Known possible improvements:158///159/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we160/// care?)161pub(crate) fn emit(162inst: &Inst,163sink: &mut MachBuffer<Inst>,164info: &EmitInfo,165state: &mut EmitState,166) {167if !inst.is_available(&info) {168let features = if let Inst::External { inst } = inst {169inst.features().to_string()170} else {171"see `is_available` source for feature term".to_string()172};173panic!(174"Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {features}"175);176}177178match inst {179Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {180// Validate that the register constraints of the dividend and the181// destination are all as expected.182let (dst, size) = match inst {183Inst::CheckedSRemSeq {184dividend_lo,185dividend_hi,186dst_quotient,187dst_remainder,188size,189..190} => {191let dividend_lo = dividend_lo.to_reg();192let dividend_hi = dividend_hi.to_reg();193let dst_quotient = dst_quotient.to_reg().to_reg();194let dst_remainder = dst_remainder.to_reg().to_reg();195debug_assert_eq!(dividend_lo, regs::rax());196debug_assert_eq!(dividend_hi, regs::rdx());197debug_assert_eq!(dst_quotient, regs::rax());198debug_assert_eq!(dst_remainder, regs::rdx());199(regs::rdx(), *size)200}201Inst::CheckedSRemSeq8 { dividend, dst, .. } => {202let dividend = dividend.to_reg();203let dst = dst.to_reg().to_reg();204debug_assert_eq!(dividend, regs::rax());205debug_assert_eq!(dst, regs::rax());206(regs::rax(), OperandSize::Size8)207}208_ => unreachable!(),209};210211// Generates the following code sequence:212//213// cmp -1 %divisor214// jnz $do_op215//216// ;; for srem, result is 0217// mov #0, %dst218// j $done219//220// $do_op:221// idiv %divisor222//223// $done:224225let do_op = sink.get_label();226let done_label = sink.get_label();227228// Check if the divisor is -1, and if it isn't then immediately229// go to the `idiv`.230let inst = Inst::cmp_mi_sxb(size, *divisor, -1);231inst.emit(sink, info, state);232one_way_jmp(sink, CC::NZ, do_op);233234// ... otherwise the divisor is -1 and the result is always 0. This235// is written to the destination register which will be %rax for236// 8-bit srem and %rdx otherwise.237//238// Note that for 16-to-64-bit srem operations this leaves the239// second destination, %rax, unchanged. This isn't semantically240// correct if a lowering actually tries to use the `dst_quotient`241// output but for srem only the `dst_remainder` output is used for242// now.243let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));244inst.emit(sink, info, state);245let inst = Inst::jmp_known(done_label);246inst.emit(sink, info, state);247248// Here the `idiv` is executed, which is different depending on the249// size250sink.bind_label(do_op, state.ctrl_plane_mut());251let rax = Gpr::RAX;252let rdx = Gpr::RDX;253let writable_rax = Writable::from_reg(rax);254let writable_rdx = Writable::from_reg(rdx);255let inst: AsmInst = match size {256OperandSize::Size8 => asm::inst::idivb_m::new(257PairedGpr::from(writable_rax),258*divisor,259TrapCode::INTEGER_DIVISION_BY_ZERO,260)261.into(),262263OperandSize::Size16 => asm::inst::idivw_m::new(264PairedGpr::from(writable_rax),265PairedGpr::from(writable_rdx),266*divisor,267TrapCode::INTEGER_DIVISION_BY_ZERO,268)269.into(),270271OperandSize::Size32 => asm::inst::idivl_m::new(272PairedGpr::from(writable_rax),273PairedGpr::from(writable_rdx),274*divisor,275TrapCode::INTEGER_DIVISION_BY_ZERO,276)277.into(),278279OperandSize::Size64 => asm::inst::idivq_m::new(280PairedGpr::from(writable_rax),281PairedGpr::from(writable_rdx),282*divisor,283TrapCode::INTEGER_DIVISION_BY_ZERO,284)285.into(),286};287inst.emit(sink, info, state);288289sink.bind_label(done_label, state.ctrl_plane_mut());290}291292Inst::MovFromPReg { src, dst } => {293let src: Reg = (*src).into();294debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));295asm::inst::movq_mr::new(*dst, Gpr::unwrap_new(src)).emit(sink, info, state);296}297298Inst::MovToPReg { src, dst } => {299let dst: Reg = (*dst).into();300debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));301let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();302asm::inst::movq_mr::new(dst, *src).emit(sink, info, state);303}304305Inst::XmmCmove {306ty,307cc,308consequent,309alternative,310dst,311} => {312let alternative = *alternative;313let dst = *dst;314debug_assert_eq!(alternative, dst.to_reg());315let consequent = *consequent;316317// Lowering of the Select IR opcode when the input is an fcmp relies on the fact that318// this doesn't clobber flags. Make sure to not do so here.319let next = sink.get_label();320321// Jump if cc is *not* set.322one_way_jmp(sink, cc.invert(), next);323Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)324.emit(sink, info, state);325326sink.bind_label(next, state.ctrl_plane_mut());327}328329Inst::StackProbeLoop {330tmp,331frame_size,332guard_size,333} => {334assert!(info.flags.enable_probestack());335assert!(guard_size.is_power_of_two());336337let tmp = *tmp;338339// Number of probes that we need to perform340let probe_count = align_to(*frame_size, *guard_size) / guard_size;341342// The inline stack probe loop has 3 phases:343//344// We generate the "guard area" register which is essentially the frame_size aligned to345// guard_size. We copy the stack pointer and subtract the guard area from it. This346// gets us a register that we can use to compare when looping.347//348// After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd349// distance at a time and then touch the stack by writing anything to it. We use the previously350// created "guard area" register to know when to stop looping.351//352// When we have touched all the pages that we need, we have to restore the stack pointer353// to where it was before.354//355// Generate the following code:356// mov tmp_reg, rsp357// sub tmp_reg, guard_size * probe_count358// .loop_start:359// sub rsp, guard_size360// mov [rsp], rsp361// cmp rsp, tmp_reg362// jne .loop_start363// add rsp, guard_size * probe_count364365// Create the guard bound register366// mov tmp_reg, rsp367let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);368inst.emit(sink, info, state);369370// sub tmp_reg, GUARD_SIZE * probe_count371let guard_plus_count = i32::try_from(guard_size * probe_count)372.expect("`guard_size * probe_count` is too large to fit in a 32-bit immediate");373Inst::subq_mi(tmp, guard_plus_count).emit(sink, info, state);374375// Emit the main loop!376let loop_start = sink.get_label();377sink.bind_label(loop_start, state.ctrl_plane_mut());378379// sub rsp, GUARD_SIZE380let rsp = Writable::from_reg(regs::rsp());381let guard_size_ = i32::try_from(*guard_size)382.expect("`guard_size` is too large to fit in a 32-bit immediate");383Inst::subq_mi(rsp, guard_size_).emit(sink, info, state);384385// TODO: `mov [rsp], 0` would be better, but we don't have that instruction386// Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable387// instruction size.388// mov [rsp], rsp389asm::inst::movl_mr::new(Amode::imm_reg(0, regs::rsp()), Gpr::RSP)390.emit(sink, info, state);391392// Compare and jump if we are not done yet393// cmp rsp, tmp_reg394let tmp = Gpr::unwrap_new(tmp.to_reg());395asm::inst::cmpq_rm::new(tmp, Gpr::RSP).emit(sink, info, state);396397// jne .loop_start398// TODO: Encoding the conditional jump as a short jump399// could save us us 4 bytes here.400one_way_jmp(sink, CC::NZ, loop_start);401402// The regular prologue code is going to emit a `sub` after this, so we need to403// reset the stack pointer404//405// TODO: It would be better if we could avoid the `add` + `sub` that is generated here406// and in the stack adj portion of the prologue407//408// add rsp, GUARD_SIZE * probe_count409Inst::addq_mi(rsp, guard_plus_count).emit(sink, info, state);410}411412Inst::CallKnown { info: call_info } => {413let stack_map = state.take_stack_map();414415asm::inst::callq_d::new(0).emit(sink, info, state);416417// The last 4 bytes of `callq` is the relative displacement to where418// we're calling, so that's where the reloc is registered.419//420// The addend adjusts for the difference between the end of the421// instruction and the beginning of the immediate field.422let len = sink.cur_offset();423sink.add_reloc_at_offset(len - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);424425if let Some(s) = stack_map {426sink.push_user_stack_map(state, len, s);427}428429if let Some(try_call) = call_info.try_call_info.as_ref() {430sink.add_try_call_site(431Some(state.frame_layout().sp_to_fp()),432try_call.exception_handlers(&state.frame_layout()),433);434} else {435sink.add_call_site();436}437438// Reclaim the outgoing argument area that was released by the439// callee, to ensure that StackAMode values are always computed from440// a consistent SP.441if call_info.callee_pop_size > 0 {442let rsp = Writable::from_reg(regs::rsp());443let callee_pop_size = i32::try_from(call_info.callee_pop_size)444.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");445Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);446}447448// Load any stack-carried return values.449call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(450state.frame_layout().stackslots_size,451|inst| inst.emit(sink, info, state),452|_space_needed| None,453);454455// If this is a try-call, jump to the continuation456// (normal-return) block.457if let Some(try_call) = call_info.try_call_info.as_ref() {458let jmp = Inst::JmpKnown {459dst: try_call.continuation,460};461jmp.emit(sink, info, state);462}463}464465Inst::ReturnCallKnown { info: call_info } => {466emit_return_call_common_sequence(sink, info, state, &call_info);467468// Finally, jump to the callee!469//470// Note: this is not `Inst::Jmp { .. }.emit(..)` because we have471// different metadata in this case: we don't have a label for the472// target, but rather a function relocation.473asm::inst::jmp_d32::new(0).emit(sink, info, state);474let offset = sink.cur_offset();475// The addend adjusts for the difference between the end of the instruction and the476// beginning of the immediate field.477sink.add_reloc_at_offset(offset - 4, Reloc::X86CallPCRel4, &call_info.dest, -4);478sink.add_call_site();479}480481Inst::ReturnCallUnknown { info: call_info } => {482let callee = call_info.dest;483484emit_return_call_common_sequence(sink, info, state, &call_info);485486asm::inst::jmpq_m::new(callee).emit(sink, info, state);487sink.add_call_site();488}489490Inst::CallUnknown {491info: call_info, ..492} => {493let stack_map = state.take_stack_map();494495let dest = match call_info.dest.clone() {496RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::unwrap_new(reg)),497RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),498};499500asm::inst::callq_m::new(dest).emit(sink, info, state);501502if let Some(s) = stack_map {503let offset = sink.cur_offset();504sink.push_user_stack_map(state, offset, s);505}506507if let Some(try_call) = call_info.try_call_info.as_ref() {508sink.add_try_call_site(509Some(state.frame_layout().sp_to_fp()),510try_call.exception_handlers(&state.frame_layout()),511);512} else {513sink.add_call_site();514}515516// Reclaim the outgoing argument area that was released by the callee, to ensure that517// StackAMode values are always computed from a consistent SP.518if call_info.callee_pop_size > 0 {519let rsp = Writable::from_reg(regs::rsp());520let callee_pop_size = i32::try_from(call_info.callee_pop_size)521.expect("`callee_pop_size` is too large to fit in a 32-bit immediate");522Inst::subq_mi(rsp, callee_pop_size).emit(sink, info, state);523}524525// Load any stack-carried return values.526call_info.emit_retval_loads::<X64ABIMachineSpec, _, _>(527state.frame_layout().stackslots_size,528|inst| inst.emit(sink, info, state),529|_space_needed| None,530);531532if let Some(try_call) = call_info.try_call_info.as_ref() {533let jmp = Inst::JmpKnown {534dst: try_call.continuation,535};536jmp.emit(sink, info, state);537}538}539540Inst::Args { .. } => {}541Inst::Rets { .. } => {}542543Inst::StackSwitchBasic {544store_context_ptr,545load_context_ptr,546in_payload0,547out_payload0,548} => {549// Note that we do not emit anything for preserving and restoring550// ordinary registers here: That's taken care of by regalloc for us,551// since we marked this instruction as clobbering all registers.552//553// Also note that we do nothing about passing the single payload554// value: We've informed regalloc that it is sent and received via555// the fixed register given by [stack_switch::payload_register]556557let (tmp1, tmp2) = {558// Ideally we would just ask regalloc for two temporary registers.559// However, adding any early defs to the constraints on StackSwitch560// causes TooManyLiveRegs. Fortunately, we can manually find tmp561// registers without regalloc: Since our instruction clobbers all562// registers, we can simply pick any register that is not assigned563// to the operands.564565let all = crate::isa::x64::abi::ALL_CLOBBERS;566567let used_regs = [568**load_context_ptr,569**store_context_ptr,570**in_payload0,571*out_payload0.to_reg(),572];573574let mut tmps = all.into_iter().filter_map(|preg| {575let reg: Reg = preg.into();576if !used_regs.contains(®) {577WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))578} else {579None580}581});582(tmps.next().unwrap(), tmps.next().unwrap())583};584585let layout = stack_switch::control_context_layout();586let rsp_offset = layout.stack_pointer_offset as i32;587let pc_offset = layout.ip_offset as i32;588let rbp_offset = layout.frame_pointer_offset as i32;589590// Location to which someone switch-ing back to this stack will jump591// to: Right behind the `StackSwitch` instruction592let resume = sink.get_label();593594//595// For RBP and RSP we do the following:596// - Load new value for register from `load_context_ptr` +597// corresponding offset.598// - Store previous (!) value of register at `store_context_ptr` +599// corresponding offset.600//601// Since `load_context_ptr` and `store_context_ptr` are allowed to be602// equal, we need to use a temporary register here.603//604605let mut exchange = |offset, reg| {606let addr = SyntheticAmode::real(Amode::imm_reg(offset, **load_context_ptr));607asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);608609asm::inst::movq_mr::new(610Amode::imm_reg(offset, **store_context_ptr),611Gpr::new(reg).unwrap(),612)613.emit(sink, info, state);614615let dst = Writable::from_reg(reg);616asm::inst::movq_mr::new(dst.map(Gpr::unwrap_new), tmp1.to_reg())617.emit(sink, info, state);618};619620exchange(rsp_offset, regs::rsp());621exchange(rbp_offset, regs::rbp());622623//624// Load target PC, store resume PC, jump to target PC625//626627let addr = SyntheticAmode::real(Amode::imm_reg(pc_offset, **load_context_ptr));628asm::inst::movq_rm::new(tmp1, addr).emit(sink, info, state);629630let amode = Amode::RipRelative { target: resume };631asm::inst::leaq_rm::new(tmp2, amode).emit(sink, info, state);632633asm::inst::movq_mr::new(634Amode::imm_reg(pc_offset, **store_context_ptr),635tmp2.to_reg(),636)637.emit(sink, info, state);638639asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);640641sink.bind_label(resume, state.ctrl_plane_mut());642}643644Inst::JmpKnown { dst } => uncond_jmp(sink, *dst),645646Inst::WinchJmpIf { cc, taken } => one_way_jmp(sink, *cc, *taken),647648Inst::JmpCond {649cc,650taken,651not_taken,652} => {653cond_jmp(sink, *cc, *taken);654uncond_jmp(sink, *not_taken);655}656657Inst::JmpCondOr {658cc1,659cc2,660taken,661not_taken,662} => {663// Emit:664// jcc1 taken665// jcc2 taken666// jmp not_taken667//668// Note that we enroll both conditionals in the669// branch-chomping mechanism because MachBuffer670// simplification can continue upward as long as it keeps671// chomping branches. In the best case, if taken ==672// not_taken and that one block is the fallthrough block,673// all three branches can disappear.674675cond_jmp(sink, *cc1, *taken);676cond_jmp(sink, *cc2, *taken);677uncond_jmp(sink, *not_taken);678}679680&Inst::JmpTableSeq {681idx,682tmp1,683tmp2,684ref targets,685ref default_target,686..687} => {688// This sequence is *one* instruction in the vcode, and is expanded only here at689// emission time, because we cannot allow the regalloc to insert spills/reloads in690// the middle; we depend on hardcoded PC-rel addressing below.691//692// We don't have to worry about emitting islands, because the only label-use type has a693// maximum range of 2 GB. If we later consider using shorter-range label references,694// this will need to be revisited.695696// We generate the following sequence. Note that the only read of %idx is before the697// write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs698// if you change this.699// lea start_of_jump_table_offset(%rip), %tmp1700// movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4701// addq %tmp2, %tmp1702// j *%tmp1703// $start_of_jump_table:704// -- jump table entries705706// Load base address of jump table.707let start_of_jumptable = sink.get_label();708asm::inst::leaq_rm::new(tmp1, Amode::rip_relative(start_of_jumptable))709.emit(sink, info, state);710711// Load value out of the jump table. It's a relative offset to the target block, so it712// might be negative; use a sign-extension.713let inst = Inst::movsx_rm_r(714ExtMode::LQ,715RegMem::mem(Amode::imm_reg_reg_shift(7160,717Gpr::unwrap_new(tmp1.to_reg()),718Gpr::unwrap_new(idx),7192,720)),721tmp2,722);723inst.emit(sink, info, state);724725// Add base of jump table to jump-table-sourced block offset.726asm::inst::addq_rm::new(tmp1, tmp2).emit(sink, info, state);727728// Branch to computed address.729asm::inst::jmpq_m::new(tmp1.to_reg()).emit(sink, info, state);730731// Emit jump table (table of 32-bit offsets).732sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());733let jt_off = sink.cur_offset();734for &target in targets.iter().chain(std::iter::once(default_target)) {735let word_off = sink.cur_offset();736// off_into_table is an addend here embedded in the label to be later patched at737// the end of codegen. The offset is initially relative to this jump table entry;738// with the extra addend, it'll be relative to the jump table's start, after739// patching.740let off_into_table = word_off - jt_off;741sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);742sink.put4(off_into_table);743}744}745746Inst::TrapIf { cc, trap_code } => {747let trap_label = sink.defer_trap(*trap_code);748one_way_jmp(sink, *cc, trap_label);749}750751Inst::TrapIfAnd {752cc1,753cc2,754trap_code,755} => {756let trap_label = sink.defer_trap(*trap_code);757let else_label = sink.get_label();758759// Jump to the end if the first condition isn't true, and then if760// the second condition is true go to the trap.761one_way_jmp(sink, cc1.invert(), else_label);762one_way_jmp(sink, *cc2, trap_label);763764sink.bind_label(else_label, state.ctrl_plane_mut());765}766767Inst::TrapIfOr {768cc1,769cc2,770trap_code,771} => {772let trap_label = sink.defer_trap(*trap_code);773774// Emit two jumps to the same trap if either condition code is true.775one_way_jmp(sink, *cc1, trap_label);776one_way_jmp(sink, *cc2, trap_label);777}778779Inst::XmmMinMaxSeq {780size,781is_min,782lhs,783rhs,784dst,785} => {786let rhs = rhs.to_reg();787let lhs = lhs.to_reg();788let dst = dst.to_writable_reg();789debug_assert_eq!(rhs, dst.to_reg());790791// Generates the following sequence:792// cmpss/cmpsd %lhs, %rhs_dst793// jnz do_min_max794// jp propagate_nan795//796// ;; ordered and equal: propagate the sign bit (for -0 vs 0):797// {and,or}{ss,sd} %lhs, %rhs_dst798// j done799//800// ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the801// ;; NaN value is returned), we add both inputs.802// propagate_nan:803// add{ss,sd} %lhs, %rhs_dst804// j done805//806// do_min_max:807// {min,max}{ss,sd} %lhs, %rhs_dst808//809// done:810let done = sink.get_label();811let propagate_nan = sink.get_label();812let do_min_max = sink.get_label();813814let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {815OperandSize::Size32 => (816asm::inst::addss_a::new(dst, lhs).into(),817asm::inst::ucomiss_a::new(dst.to_reg(), lhs).into(),818asm::inst::andps_a::new(dst, lhs).into(),819asm::inst::orps_a::new(dst, lhs).into(),820if *is_min {821asm::inst::minss_a::new(dst, lhs).into()822} else {823asm::inst::maxss_a::new(dst, lhs).into()824},825),826OperandSize::Size64 => (827asm::inst::addsd_a::new(dst, lhs).into(),828asm::inst::ucomisd_a::new(dst.to_reg(), lhs).into(),829asm::inst::andpd_a::new(dst, lhs).into(),830asm::inst::orpd_a::new(dst, lhs).into(),831if *is_min {832asm::inst::minsd_a::new(dst, lhs).into()833} else {834asm::inst::maxsd_a::new(dst, lhs).into()835},836),837_ => unreachable!(),838};839let add_op: AsmInst = add_op;840let or_op: AsmInst = or_op;841let min_max_op: AsmInst = min_max_op;842let cmp_op: AsmInst = cmp_op;843844cmp_op.emit(sink, info, state);845846one_way_jmp(sink, CC::NZ, do_min_max);847one_way_jmp(sink, CC::P, propagate_nan);848849// Ordered and equal. The operands are bit-identical unless they are zero850// and negative zero. These instructions merge the sign bits in that851// case, and are no-ops otherwise.852let inst: AsmInst = if *is_min { or_op } else { and_op };853inst.emit(sink, info, state);854855let inst = Inst::jmp_known(done);856inst.emit(sink, info, state);857858// x86's min/max are not symmetric; if either operand is a NaN, they return the859// read-only operand: perform an addition between the two operands, which has the860// desired NaN propagation effects.861sink.bind_label(propagate_nan, state.ctrl_plane_mut());862add_op.emit(sink, info, state);863864one_way_jmp(sink, CC::P, done);865866sink.bind_label(do_min_max, state.ctrl_plane_mut());867min_max_op.emit(sink, info, state);868869sink.bind_label(done, state.ctrl_plane_mut());870}871872Inst::XmmUninitializedValue { .. } | Inst::GprUninitializedValue { .. } => {873// These instruction formats only exist to declare a register as a874// `def`; no code is emitted. This is always immediately followed by875// an instruction, such as `xor <tmp>, <tmp>`, that semantically876// reads this undefined value but arithmetically produces the same877// result regardless of its value.878}879880Inst::CvtUint64ToFloatSeq {881dst_size,882src,883dst,884tmp_gpr1,885tmp_gpr2,886} => {887let src = src.to_reg();888let dst = dst.to_writable_reg();889let tmp_gpr1 = tmp_gpr1.to_writable_reg();890let tmp_gpr2 = tmp_gpr2.to_writable_reg();891892// Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a893// different sequence.894//895// Emit the following sequence:896//897// cmp 0, %src898// jl handle_negative899//900// ;; handle positive, which can't overflow901// cvtsi2sd/cvtsi2ss %src, %dst902// j done903//904// ;; handle negative: see below for an explanation of what it's doing.905// handle_negative:906// mov %src, %tmp_gpr1907// shr $1, %tmp_gpr1908// mov %src, %tmp_gpr2909// and $1, %tmp_gpr2910// or %tmp_gpr1, %tmp_gpr2911// cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst912// addsd/addss %dst, %dst913//914// done:915916assert_ne!(src, tmp_gpr1.to_reg());917assert_ne!(src, tmp_gpr2.to_reg());918919let handle_negative = sink.get_label();920let done = sink.get_label();921922// If x seen as a signed int64 is not negative, a signed-conversion will do the right923// thing.924// TODO use tst src, src here.925asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);926927one_way_jmp(sink, CC::L, handle_negative);928929// Handle a positive int64, which is the "easy" case: a signed conversion will do the930// right thing.931emit_signed_cvt(932sink,933info,934state,935src,936dst,937*dst_size == OperandSize::Size64,938);939940let inst = Inst::jmp_known(done);941inst.emit(sink, info, state);942943sink.bind_label(handle_negative, state.ctrl_plane_mut());944945// Divide x by two to get it in range for the signed conversion, keep the LSB, and946// scale it back up on the FP side.947let inst = Inst::gen_move(tmp_gpr1, src, types::I64);948inst.emit(sink, info, state);949950// tmp_gpr1 := src >> 1951asm::inst::shrq_mi::new(tmp_gpr1, 1).emit(sink, info, state);952953let inst = Inst::gen_move(tmp_gpr2, src, types::I64);954inst.emit(sink, info, state);955956asm::inst::andq_mi_sxb::new(tmp_gpr2, 1).emit(sink, info, state);957958asm::inst::orq_rm::new(tmp_gpr2, tmp_gpr1).emit(sink, info, state);959960emit_signed_cvt(961sink,962info,963state,964tmp_gpr2.to_reg(),965dst,966*dst_size == OperandSize::Size64,967);968969let inst: AsmInst = match *dst_size {970OperandSize::Size64 => asm::inst::addsd_a::new(dst, dst.to_reg()).into(),971OperandSize::Size32 => asm::inst::addss_a::new(dst, dst.to_reg()).into(),972_ => unreachable!(),973};974inst.emit(sink, info, state);975976sink.bind_label(done, state.ctrl_plane_mut());977}978979Inst::CvtFloatToSintSeq {980src_size,981dst_size,982is_saturating,983src,984dst,985tmp_gpr,986tmp_xmm,987} => {988use OperandSize::*;989990let src = src.to_reg();991let dst = dst.to_writable_reg();992let tmp_gpr = tmp_gpr.to_writable_reg();993let tmp_xmm = tmp_xmm.to_writable_reg();994995// Emits the following common sequence:996//997// cvttss2si/cvttsd2si %src, %dst998// cmp %dst, 1999// jno done1000//1001// Then, for saturating conversions:1002//1003// ;; check for NaN1004// cmpss/cmpsd %src, %src1005// jnp not_nan1006// xor %dst, %dst1007//1008// ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is1009// ;; already in %dst.1010// xorpd %tmp_xmm, %tmp_xmm1011// cmpss/cmpsd %src, %tmp_xmm1012// jnb done1013// mov/movaps $INT_MAX, %dst1014//1015// done:1016//1017// Then, for non-saturating conversions:1018//1019// ;; check for NaN1020// cmpss/cmpsd %src, %src1021// jnp not_nan1022// ud2 trap BadConversionToInteger1023//1024// ;; check if INT_MIN was the correct result, against a magic constant:1025// not_nan:1026// movaps/mov $magic, %tmp_gpr1027// movq/movd %tmp_gpr, %tmp_xmm1028// cmpss/cmpsd %tmp_xmm, %src1029// jnb/jnbe $check_positive1030// ud2 trap IntegerOverflow1031//1032// ;; if positive, it was a real overflow1033// check_positive:1034// xorpd %tmp_xmm, %tmp_xmm1035// cmpss/cmpsd %src, %tmp_xmm1036// jnb done1037// ud2 trap IntegerOverflow1038//1039// done:10401041let cmp_op: AsmInst = match src_size {1042Size64 => asm::inst::ucomisd_a::new(src, src).into(),1043Size32 => asm::inst::ucomiss_a::new(src, src).into(),1044_ => unreachable!(),1045};10461047let cvtt_op = |dst, src| Inst::External {1048inst: match (*src_size, *dst_size) {1049(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),1050(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),1051(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),1052(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),1053_ => unreachable!(),1054},1055};10561057let done = sink.get_label();10581059// The truncation.1060cvtt_op(dst, src).emit(sink, info, state);10611062// Compare against 1, in case of overflow the dst operand was INT_MIN.1063let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);1064inst.emit(sink, info, state);10651066one_way_jmp(sink, CC::NO, done); // no overflow => done10671068// Check for NaN.1069cmp_op.emit(sink, info, state);10701071if *is_saturating {1072let not_nan = sink.get_label();1073one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN10741075// For NaN, emit 0.1076let inst: AsmInst = match *dst_size {1077OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),1078OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),1079_ => unreachable!(),1080};1081inst.emit(sink, info, state);10821083let inst = Inst::jmp_known(done);1084inst.emit(sink, info, state);10851086sink.bind_label(not_nan, state.ctrl_plane_mut());10871088// If the input was positive, saturate to INT_MAX.10891090// Zero out tmp_xmm.1091asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);10921093let inst: AsmInst = match src_size {1094Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),1095Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),1096_ => unreachable!(),1097};1098inst.emit(sink, info, state);10991100// Jump if >= to done.1101one_way_jmp(sink, CC::NB, done);11021103// Otherwise, put INT_MAX.1104if *dst_size == OperandSize::Size64 {1105let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);1106inst.emit(sink, info, state);1107} else {1108let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);1109inst.emit(sink, info, state);1110}1111} else {1112let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);1113inst.emit(sink, info, state);11141115// Check if INT_MIN was the correct result: determine the smallest floating point1116// number that would convert to INT_MIN, put it in a temporary register, and compare1117// against the src register.1118// If the src register is less (or in some cases, less-or-equal) than the threshold,1119// trap!11201121let mut no_overflow_cc = CC::NB; // >=1122let output_bits = dst_size.to_bits();1123match *src_size {1124OperandSize::Size32 => {1125let cst = (-Ieee32::pow2(output_bits - 1)).bits();1126let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);1127inst.emit(sink, info, state);1128}1129OperandSize::Size64 => {1130// An f64 can represent `i32::min_value() - 1` exactly with precision to spare,1131// so there are values less than -2^(N-1) that convert correctly to INT_MIN.1132let cst = if output_bits < 64 {1133no_overflow_cc = CC::NBE; // >1134Ieee64::fcvt_to_sint_negative_overflow(output_bits)1135} else {1136-Ieee64::pow2(output_bits - 1)1137};1138let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);1139inst.emit(sink, info, state);1140}1141_ => unreachable!(),1142}11431144let inst: AsmInst = {1145let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());1146match src_size {1147Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),1148Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),1149_ => unreachable!(),1150}1151};1152inst.emit(sink, info, state);11531154let inst: AsmInst = match src_size {1155Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),1156Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),1157_ => unreachable!(),1158};1159inst.emit(sink, info, state);11601161// no trap if src >= or > threshold1162let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);1163inst.emit(sink, info, state);11641165// If positive, it was a real overflow.11661167// Zero out the tmp_xmm register.1168asm::inst::xorpd_a::new(tmp_xmm, tmp_xmm.to_reg()).emit(sink, info, state);11691170let inst: AsmInst = match src_size {1171Size64 => asm::inst::ucomisd_a::new(tmp_xmm.to_reg(), src).into(),1172Size32 => asm::inst::ucomiss_a::new(tmp_xmm.to_reg(), src).into(),1173_ => unreachable!(),1174};1175inst.emit(sink, info, state);11761177// no trap if 0 >= src1178let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);1179inst.emit(sink, info, state);1180}11811182sink.bind_label(done, state.ctrl_plane_mut());1183}11841185Inst::CvtFloatToUintSeq {1186src_size,1187dst_size,1188is_saturating,1189src,1190dst,1191tmp_gpr,1192tmp_xmm,1193tmp_xmm2,1194} => {1195use OperandSize::*;11961197let src = src.to_reg();1198let dst = dst.to_writable_reg();1199let tmp_gpr = tmp_gpr.to_writable_reg();1200let tmp_xmm = tmp_xmm.to_writable_reg();1201let tmp_xmm2 = tmp_xmm2.to_writable_reg();12021203// The only difference in behavior between saturating and non-saturating is how we1204// handle errors. Emits the following sequence:1205//1206// movaps/mov 2**(int_width - 1), %tmp_gpr1207// movq/movd %tmp_gpr, %tmp_xmm1208// cmpss/cmpsd %tmp_xmm, %src1209// jnb is_large1210//1211// ;; check for NaN inputs1212// jnp not_nan1213// -- non-saturating: ud2 trap BadConversionToInteger1214// -- saturating: xor %dst, %dst; j done1215//1216// not_nan:1217// cvttss2si/cvttsd2si %src, %dst1218// cmp 0, %dst1219// jnl done1220// -- non-saturating: ud2 trap IntegerOverflow1221// -- saturating: xor %dst, %dst; j done1222//1223// is_large:1224// mov %src, %tmp_xmm21225// subss/subsd %tmp_xmm, %tmp_xmm21226// cvttss2si/cvttss2sd %tmp_x, %dst1227// cmp 0, %dst1228// jnl next_is_large1229// -- non-saturating: ud2 trap IntegerOverflow1230// -- saturating: movaps $UINT_MAX, %dst; j done1231//1232// next_is_large:1233// add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers1234//1235// done:12361237assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");12381239let xor_op = |dst, src| Inst::External {1240inst: match *dst_size {1241Size32 => asm::inst::xorl_rm::new(dst, src).into(),1242Size64 => asm::inst::xorq_rm::new(dst, src).into(),1243_ => unreachable!(),1244},1245};12461247let subs_op = |dst, src| Inst::External {1248inst: match *src_size {1249Size32 => asm::inst::subss_a::new(dst, src).into(),1250Size64 => asm::inst::subsd_a::new(dst, src).into(),1251_ => unreachable!(),1252},1253};12541255let cvtt_op = |dst, src| Inst::External {1256inst: match (*src_size, *dst_size) {1257(Size32, Size32) => asm::inst::cvttss2si_a::new(dst, src).into(),1258(Size32, Size64) => asm::inst::cvttss2si_aq::new(dst, src).into(),1259(Size64, Size32) => asm::inst::cvttsd2si_a::new(dst, src).into(),1260(Size64, Size64) => asm::inst::cvttsd2si_aq::new(dst, src).into(),1261_ => unreachable!(),1262},1263};12641265let done = sink.get_label();12661267let cst = match src_size {1268OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,1269OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),1270_ => unreachable!(),1271};12721273let inst = Inst::imm(*src_size, cst, tmp_gpr);1274inst.emit(sink, info, state);12751276let inst: AsmInst = {1277let tmp_xmm: WritableXmm = tmp_xmm.map(|r| Xmm::new(r).unwrap());1278match src_size {1279Size32 => asm::inst::movd_a::new(tmp_xmm, tmp_gpr).into(),1280Size64 => asm::inst::movq_a::new(tmp_xmm, tmp_gpr).into(),1281_ => unreachable!(),1282}1283};1284inst.emit(sink, info, state);12851286let inst: AsmInst = match src_size {1287Size64 => asm::inst::ucomisd_a::new(src, tmp_xmm.to_reg()).into(),1288Size32 => asm::inst::ucomiss_a::new(src, tmp_xmm.to_reg()).into(),1289_ => unreachable!(),1290};1291inst.emit(sink, info, state);12921293let handle_large = sink.get_label();1294one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold12951296if *is_saturating {1297// If not NaN jump over this 0-return, otherwise return 01298let not_nan = sink.get_label();1299one_way_jmp(sink, CC::NP, not_nan);13001301xor_op(dst, dst).emit(sink, info, state);13021303let inst = Inst::jmp_known(done);1304inst.emit(sink, info, state);1305sink.bind_label(not_nan, state.ctrl_plane_mut());1306} else {1307// Trap.1308let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);1309inst.emit(sink, info, state);1310}13111312// Actual truncation for small inputs: if the result is not positive, then we had an1313// overflow.13141315cvtt_op(dst, src).emit(sink, info, state);13161317let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);1318inst.emit(sink, info, state);13191320one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done13211322if *is_saturating {1323// The input was "small" (< 2**(width -1)), so the only way to get an integer1324// overflow is because the input was too small: saturate to the min value, i.e. 0.1325let inst: AsmInst = match *dst_size {1326OperandSize::Size32 => asm::inst::xorl_rm::new(dst, dst).into(),1327OperandSize::Size64 => asm::inst::xorq_rm::new(dst, dst).into(),1328_ => unreachable!(),1329};1330inst.emit(sink, info, state);13311332let inst = Inst::jmp_known(done);1333inst.emit(sink, info, state);1334} else {1335// Trap.1336asm::inst::ud2_zo::new(TrapCode::INTEGER_OVERFLOW).emit(sink, info, state);1337}13381339// Now handle large inputs.13401341sink.bind_label(handle_large, state.ctrl_plane_mut());13421343let inst = Inst::gen_move(tmp_xmm2, src, types::F64);1344inst.emit(sink, info, state);13451346subs_op(tmp_xmm2, tmp_xmm.to_reg()).emit(sink, info, state);13471348cvtt_op(dst, tmp_xmm2.to_reg()).emit(sink, info, state);13491350let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);1351inst.emit(sink, info, state);13521353if *is_saturating {1354let next_is_large = sink.get_label();1355one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large13561357// The input was "large" (>= 2**(width -1)), so the only way to get an integer1358// overflow is because the input was too large: saturate to the max value.1359let inst = Inst::imm(1360OperandSize::Size64,1361if *dst_size == OperandSize::Size64 {1362u64::max_value()1363} else {1364u32::max_value() as u641365},1366dst,1367);1368inst.emit(sink, info, state);13691370let inst = Inst::jmp_known(done);1371inst.emit(sink, info, state);1372sink.bind_label(next_is_large, state.ctrl_plane_mut());1373} else {1374let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);1375inst.emit(sink, info, state);1376}13771378if *dst_size == OperandSize::Size64 {1379let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);1380inst.emit(sink, info, state);13811382asm::inst::addq_rm::new(dst, tmp_gpr).emit(sink, info, state);1383} else {1384asm::inst::addl_mi::new(dst, asm::Imm32::new(1 << 31)).emit(sink, info, state);1385}13861387sink.bind_label(done, state.ctrl_plane_mut());1388}13891390Inst::LoadExtName {1391dst,1392name,1393offset,1394distance,1395} => {1396let name = &**name;1397let riprel = asm::Amode::RipRelative {1398target: asm::DeferredTarget::None,1399};1400if info.flags.is_pic() {1401// Generates: movq symbol@GOTPCREL(%rip), %dst1402asm::inst::movq_rm::new(*dst, riprel).emit(sink, info, state);1403let cur = sink.cur_offset();1404sink.add_reloc_at_offset(cur - 4, Reloc::X86GOTPCRel4, name, -4);14051406// Offset in the relocation above applies to the address of the1407// *GOT entry*, not the loaded address; so we emit a separate1408// add instruction if needed.1409let offset = i32::try_from(*offset).unwrap();1410if offset != 0 {1411asm::inst::addq_mi_sxl::new(PairedGpr::from(*dst), offset)1412.emit(sink, info, state);1413}1414} else if distance == &RelocDistance::Near {1415// If we know the distance to the name is within 2GB (e.g., a1416// module-local function), we can generate a RIP-relative1417// address, with a relocation.1418asm::inst::leaq_rm::new(*dst, riprel).emit(sink, info, state);1419let cur = sink.cur_offset();1420sink.add_reloc_at_offset(cur - 4, Reloc::X86CallPCRel4, name, *offset - 4);1421} else {1422// The full address can be encoded in the register, with a1423// relocation.1424asm::inst::movabsq_oi::new(*dst, 0).emit(sink, info, state);1425let cur = sink.cur_offset();1426sink.add_reloc_at_offset(cur - 8, Reloc::Abs8, name, *offset);1427}1428}14291430Inst::AtomicRmwSeq {1431ty,1432op,1433mem,1434operand,1435temp,1436dst_old,1437} => {1438let operand = *operand;1439let temp = *temp;1440let temp_r = temp.map(|r| *r);1441let dst_old = *dst_old;1442let dst_old_r = dst_old.map(|r| *r);1443debug_assert_eq!(dst_old.to_reg(), regs::rax());1444let mem = mem.finalize(state.frame_layout(), sink).clone();14451446// Emit this:1447// mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value1448// again:1449// movq %rax, %r_temp // rax = old value, r_temp = old value1450// `op`q %r_operand, %r_temp // rax = old value, r_temp = new value1451// lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value1452// jnz again // If this is taken, rax will have a "revised" old value1453//1454// Operand conventions: IN: %r_address, %r_operand OUT: %rax (old1455// value), %r_temp (trashed), %rflags (trashed)1456let again_label = sink.get_label();14571458// mov{zbq,zwq,zlq,q} (%r_address), %rax1459// No need to call `add_trap` here, since the `i1` emit will do that.1460let i1 = Inst::load(*ty, mem.clone(), dst_old_r, ExtKind::ZeroExtend);1461i1.emit(sink, info, state);14621463// again:1464sink.bind_label(again_label, state.ctrl_plane_mut());14651466// movq %rax, %r_temp1467asm::inst::movq_mr::new(temp, dst_old.to_reg()).emit(sink, info, state);14681469use AtomicRmwSeqOp as RmwOp;1470match op {1471RmwOp::Nand => {1472// andq %r_operand, %r_temp1473asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);14741475// notq %r_temp1476asm::inst::notq_m::new(PairedGpr::from(temp)).emit(sink, info, state);1477}1478RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {1479// cmp %r_temp, %r_operand1480let temp = temp.to_reg();1481match *ty {1482types::I8 => asm::inst::cmpb_mr::new(operand, temp).emit(sink, info, state),1483types::I16 => {1484asm::inst::cmpw_mr::new(operand, temp).emit(sink, info, state)1485}1486types::I32 => {1487asm::inst::cmpl_mr::new(operand, temp).emit(sink, info, state)1488}1489types::I64 => {1490asm::inst::cmpq_mr::new(operand, temp).emit(sink, info, state)1491}1492_ => unreachable!(),1493}14941495// cmovcc %r_operand, %r_temp1496match op {1497RmwOp::Umin => {1498asm::inst::cmovbeq_rm::new(temp_r, *operand).emit(sink, info, state)1499}1500RmwOp::Umax => {1501asm::inst::cmovaeq_rm::new(temp_r, *operand).emit(sink, info, state)1502}1503RmwOp::Smin => {1504asm::inst::cmovleq_rm::new(temp_r, *operand).emit(sink, info, state)1505}1506RmwOp::Smax => {1507asm::inst::cmovgeq_rm::new(temp_r, *operand).emit(sink, info, state)1508}1509_ => unreachable!(),1510}1511}1512RmwOp::And => {1513// andq %r_operand, %r_temp1514asm::inst::andq_rm::new(temp, operand).emit(sink, info, state);1515}1516RmwOp::Or => {1517// orq %r_operand, %r_temp1518asm::inst::orq_rm::new(temp, operand).emit(sink, info, state);1519}1520RmwOp::Xor => {1521// xorq %r_operand, %r_temp1522asm::inst::xorq_rm::new(temp, operand).emit(sink, info, state);1523}1524}15251526// lock cmpxchg{b,w,l,q} %r_temp, (%r_address)1527// No need to call `add_trap` here, since the `i4` emit will do that.1528let temp = temp.to_reg();1529let dst_old = PairedGpr::from(dst_old);1530let inst: AsmInst = match *ty {1531types::I8 => asm::inst::lock_cmpxchgb_mr::new(mem, temp, dst_old).into(),1532types::I16 => asm::inst::lock_cmpxchgw_mr::new(mem, temp, dst_old).into(),1533types::I32 => asm::inst::lock_cmpxchgl_mr::new(mem, temp, dst_old).into(),1534types::I64 => asm::inst::lock_cmpxchgq_mr::new(mem, temp, dst_old).into(),1535_ => unreachable!(),1536};1537inst.emit(sink, info, state);15381539// jnz again1540one_way_jmp(sink, CC::NZ, again_label);1541}15421543Inst::Atomic128RmwSeq {1544op,1545mem,1546operand_low,1547operand_high,1548temp_low,1549temp_high,1550dst_old_low,1551dst_old_high,1552} => {1553let operand_low = *operand_low;1554let operand_high = *operand_high;1555let temp_low = *temp_low;1556let temp_high = *temp_high;1557let dst_old_low = *dst_old_low;1558let dst_old_high = *dst_old_high;1559debug_assert_eq!(temp_low.to_reg(), regs::rbx());1560debug_assert_eq!(temp_high.to_reg(), regs::rcx());1561debug_assert_eq!(dst_old_low.to_reg(), regs::rax());1562debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());1563let mem = mem.finalize(state.frame_layout(), sink).clone();15641565let again_label = sink.get_label();15661567// Load the initial value.1568asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);1569asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);15701571// again:1572sink.bind_label(again_label, state.ctrl_plane_mut());15731574// Move old value to temp registers.1575asm::inst::movq_mr::new(temp_low, dst_old_low.to_reg()).emit(sink, info, state);1576asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg()).emit(sink, info, state);15771578// Perform the operation.1579use Atomic128RmwSeqOp as RmwOp;1580match op {1581RmwOp::Nand => {1582// temp &= operand1583asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);1584asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);15851586// temp = !temp1587asm::inst::notq_m::new(PairedGpr::from(temp_low)).emit(sink, info, state);1588asm::inst::notq_m::new(PairedGpr::from(temp_high)).emit(sink, info, state);1589}1590RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {1591// Do a comparison with LHS temp and RHS operand.1592// Note the opposite argument orders.1593asm::inst::cmpq_mr::new(temp_low.to_reg(), operand_low).emit(sink, info, state);1594// This will clobber `temp_high`1595asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);1596// Restore the clobbered value1597asm::inst::movq_mr::new(temp_high, dst_old_high.to_reg())1598.emit(sink, info, state);1599match op {1600RmwOp::Umin => {1601asm::inst::cmovaeq_rm::new(temp_low, operand_low)1602.emit(sink, info, state);1603asm::inst::cmovaeq_rm::new(temp_high, operand_high)1604.emit(sink, info, state);1605}1606RmwOp::Umax => {1607asm::inst::cmovbq_rm::new(temp_low, operand_low)1608.emit(sink, info, state);1609asm::inst::cmovbq_rm::new(temp_high, operand_high)1610.emit(sink, info, state);1611}1612RmwOp::Smin => {1613asm::inst::cmovgeq_rm::new(temp_low, operand_low)1614.emit(sink, info, state);1615asm::inst::cmovgeq_rm::new(temp_high, operand_high)1616.emit(sink, info, state);1617}1618RmwOp::Smax => {1619asm::inst::cmovlq_rm::new(temp_low, operand_low)1620.emit(sink, info, state);1621asm::inst::cmovlq_rm::new(temp_high, operand_high)1622.emit(sink, info, state);1623}1624_ => unreachable!(),1625}1626}1627RmwOp::Add => {1628asm::inst::addq_rm::new(temp_low, operand_low).emit(sink, info, state);1629asm::inst::adcq_rm::new(temp_high, operand_high).emit(sink, info, state);1630}1631RmwOp::Sub => {1632asm::inst::subq_rm::new(temp_low, operand_low).emit(sink, info, state);1633asm::inst::sbbq_rm::new(temp_high, operand_high).emit(sink, info, state);1634}1635RmwOp::And => {1636asm::inst::andq_rm::new(temp_low, operand_low).emit(sink, info, state);1637asm::inst::andq_rm::new(temp_high, operand_high).emit(sink, info, state);1638}1639RmwOp::Or => {1640asm::inst::orq_rm::new(temp_low, operand_low).emit(sink, info, state);1641asm::inst::orq_rm::new(temp_high, operand_high).emit(sink, info, state);1642}1643RmwOp::Xor => {1644asm::inst::xorq_rm::new(temp_low, operand_low).emit(sink, info, state);1645asm::inst::xorq_rm::new(temp_high, operand_high).emit(sink, info, state);1646}1647}16481649// cmpxchg16b (mem)1650asm::inst::lock_cmpxchg16b_m::new(1651PairedGpr::from(dst_old_low),1652PairedGpr::from(dst_old_high),1653temp_low.to_reg(),1654temp_high.to_reg(),1655mem,1656)1657.emit(sink, info, state);16581659// jnz again1660one_way_jmp(sink, CC::NZ, again_label);1661}16621663Inst::Atomic128XchgSeq {1664mem,1665operand_low,1666operand_high,1667dst_old_low,1668dst_old_high,1669} => {1670let operand_low = *operand_low;1671let operand_high = *operand_high;1672let dst_old_low = *dst_old_low;1673let dst_old_high = *dst_old_high;1674debug_assert_eq!(operand_low, regs::rbx());1675debug_assert_eq!(operand_high, regs::rcx());1676debug_assert_eq!(dst_old_low.to_reg(), regs::rax());1677debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());1678let mem = mem.finalize(state.frame_layout(), sink).clone();16791680let again_label = sink.get_label();16811682// Load the initial value.1683asm::inst::movq_rm::new(dst_old_low, mem.clone()).emit(sink, info, state);1684asm::inst::movq_rm::new(dst_old_high, mem.offset(8)).emit(sink, info, state);16851686// again:1687sink.bind_label(again_label, state.ctrl_plane_mut());16881689// cmpxchg16b (mem)1690asm::inst::lock_cmpxchg16b_m::new(1691PairedGpr::from(dst_old_low),1692PairedGpr::from(dst_old_high),1693operand_low,1694operand_high,1695mem,1696)1697.emit(sink, info, state);16981699// jnz again1700one_way_jmp(sink, CC::NZ, again_label);1701}17021703Inst::ElfTlsGetAddr { symbol, dst } => {1704let dst = dst.to_reg().to_reg();1705debug_assert_eq!(dst, regs::rax());17061707// N.B.: Must be exactly this byte sequence; the linker requires it,1708// because it must know how to rewrite the bytes.17091710// data16 lea gv@tlsgd(%rip),%rdi1711sink.put1(0x66); // data161712sink.put1(0b01001000); // REX.W1713sink.put1(0x8d); // LEA1714sink.put1(0x3d); // ModRM byte1715emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);1716sink.put4(0); // offset17171718// data16 data16 callq __tls_get_addr-41719sink.put1(0x66); // data161720sink.put1(0x66); // data161721sink.put1(0b01001000); // REX.W1722sink.put1(0xe8); // CALL1723emit_reloc(1724sink,1725Reloc::X86CallPLTRel4,1726&ExternalName::LibCall(LibCall::ElfTlsGetAddr),1727-4,1728);1729sink.put4(0); // offset1730}17311732Inst::MachOTlsGetAddr { symbol, dst } => {1733let dst = dst.to_reg().to_reg();1734debug_assert_eq!(dst, regs::rax());17351736// movq gv@tlv(%rip), %rdi1737sink.put1(0x48); // REX.w1738sink.put1(0x8b); // MOV1739sink.put1(0x3d); // ModRM byte1740emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);1741sink.put4(0); // offset17421743asm::inst::callq_m::new(asm::Amode::ImmReg {1744base: Gpr::RDI,1745simm32: asm::AmodeOffsetPlusKnownOffset::ZERO,1746trap: None,1747})1748.emit(sink, info, state);1749}17501751Inst::CoffTlsGetAddr { symbol, dst, tmp } => {1752let dst = dst.to_reg().to_reg();1753debug_assert_eq!(dst, regs::rax());17541755// tmp is used below directly as %rcx1756let tmp = tmp.to_reg().to_reg();1757debug_assert_eq!(tmp, regs::rcx());17581759// See: https://gcc.godbolt.org/z/M8or9x6ss1760// And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-53293028217611762// Emit the following sequence1763// movl (%rip), %eax ; IMAGE_REL_AMD64_REL32 _tls_index1764// movq %gs:88, %rcx1765// movq (%rcx,%rax,8), %rax1766// leaq (%rax), %rax ; Reloc: IMAGE_REL_AMD64_SECREL symbol17671768// Load TLS index for current thread1769// movl (%rip), %eax1770sink.put1(0x8b); // mov1771sink.put1(0x05);1772emit_reloc(1773sink,1774Reloc::X86PCRel4,1775&ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),1776-4,1777);1778sink.put4(0); // offset17791780// movq %gs:88, %rcx1781// Load the TLS Storage Array pointer1782// The gs segment register refers to the base address of the TEB on x64.1783// 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:1784sink.put_data(&[17850x65, 0x48, // REX.W17860x8b, // MOV17870x0c, 0x25, 0x58, // 0x58 - ThreadLocalStoragePointer offset17880x00, 0x00, 0x00,1789]);17901791// movq (%rcx,%rax,8), %rax1792// Load the actual TLS entry for this thread.1793// Computes ThreadLocalStoragePointer + _tls_index*81794sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);17951796// leaq (%rax), %rax1797sink.put1(0x48);1798sink.put1(0x8d);1799sink.put1(0x80);1800emit_reloc(sink, Reloc::X86SecRel, symbol, 0);1801sink.put4(0); // offset1802}18031804Inst::Unwind { inst } => {1805sink.add_unwind(inst.clone());1806}18071808Inst::DummyUse { .. } => {1809// Nothing.1810}18111812Inst::LabelAddress { dst, label } => {1813// Emit an LEA with a LabelUse given this label.1814asm::inst::leaq_rm::new(*dst, Amode::rip_relative(*label)).emit(sink, info, state);1815}18161817Inst::External { inst } => {1818let frame = state.frame_layout();1819emit_maybe_shrink(1820inst,1821&mut external::AsmCodeSink {1822sink,18231824// These values are transcribed from what is happening in1825// `SyntheticAmode::finalize`. This, plus the `Into` logic1826// converting a `SyntheticAmode` to its external counterpart, are1827// necessary to communicate Cranelift's internal offsets to the1828// assembler; due to when Cranelift determines these offsets, this1829// happens quite late (i.e., here during emission).1830incoming_arg_offset: i32::try_from(1831frame.tail_args_size + frame.setup_area_size,1832)1833.unwrap(),1834slot_offset: i32::try_from(frame.outgoing_args_size).unwrap(),1835},1836);1837}1838}18391840state.clear_post_insn();1841}18421843/// Emit the common sequence used for both direct and indirect tail calls:1844///1845/// * Copy the new frame's stack arguments over the top of our current frame.1846///1847/// * Restore the old frame pointer.1848///1849/// * Initialize the tail callee's stack pointer (simultaneously deallocating1850/// the temporary stack space we allocated when creating the new frame's stack1851/// arguments).1852///1853/// * Move the return address into its stack slot.1854fn emit_return_call_common_sequence<T>(1855sink: &mut MachBuffer<Inst>,1856info: &EmitInfo,1857state: &mut EmitState,1858call_info: &ReturnCallInfo<T>,1859) {1860assert!(1861info.flags.preserve_frame_pointers(),1862"frame pointers aren't fundamentally required for tail calls, \1863but the current implementation relies on them being present"1864);18651866let tmp = call_info.tmp.to_writable_reg();18671868for inst in1869X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())1870{1871inst.emit(sink, info, state);1872}18731874for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(1875CallConv::Tail,1876&info.flags,1877&info.isa_flags,1878state.frame_layout(),1879) {1880inst.emit(sink, info, state);1881}18821883let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;1884if incoming_args_diff > 0 {1885// Move the saved return address up by `incoming_args_diff`.1886let addr = Amode::imm_reg(0, regs::rsp());1887asm::inst::movq_rm::new(tmp, addr).emit(sink, info, state);1888asm::inst::movq_mr::new(1889Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),1890Gpr::unwrap_new(tmp.to_reg()),1891)1892.emit(sink, info, state);18931894// Increment the stack pointer to shrink the argument area for the new1895// call.1896let rsp = Writable::from_reg(regs::rsp());1897let incoming_args_diff = i32::try_from(incoming_args_diff)1898.expect("`incoming_args_diff` is too large to fit in a 32-bit signed immediate");1899Inst::addq_mi(rsp, incoming_args_diff).emit(sink, info, state);1900}1901}19021903/// Conveniene trait to have an `emit` method on all `asm::inst::*` variants.1904trait ExternalEmit {1905fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState);1906}19071908impl<I> ExternalEmit for I1909where1910I: Into<asm::inst::Inst<CraneliftRegisters>>,1911{1912fn emit(self, sink: &mut MachBuffer<Inst>, info: &EmitInfo, state: &mut EmitState) {1913Inst::External { inst: self.into() }.emit(sink, info, state)1914}1915}19161917/// Attempt to "shrink" the provided `inst`.1918///1919/// This function will inspect `inst` and attempt to return a new instruction1920/// which is equivalent semantically but will encode to a smaller binary1921/// representation. This is only done for instructions which require register1922/// allocation to have already happened, for example shrinking immediates should1923/// be done during instruction selection not at this point.1924///1925/// An example of this optimization is the `AND` instruction. The Intel manual1926/// has a smaller encoding for `AND AL, imm8` than it does for `AND r/m8, imm8`.1927/// Here the instructions are matched against and if regalloc state indicates1928/// that a smaller variant is available then that's swapped to instead.1929fn emit_maybe_shrink(inst: &AsmInst, sink: &mut impl asm::CodeSink) {1930use cranelift_assembler_x64::GprMem;1931use cranelift_assembler_x64::inst::*;19321933type R = CraneliftRegisters;1934const RAX: PairedGpr = PairedGpr {1935read: Gpr::RAX,1936write: Writable::from_reg(Gpr::RAX),1937};1938const RAX_RM: GprMem<PairedGpr, Gpr> = GprMem::Gpr(RAX);19391940match *inst {1941// and1942Inst::andb_mi(andb_mi { rm8: RAX_RM, imm8 }) => andb_i::<R>::new(RAX, imm8).encode(sink),1943Inst::andw_mi(andw_mi {1944rm16: RAX_RM,1945imm16,1946}) => andw_i::<R>::new(RAX, imm16).encode(sink),1947Inst::andl_mi(andl_mi {1948rm32: RAX_RM,1949imm32,1950}) => andl_i::<R>::new(RAX, imm32).encode(sink),1951Inst::andq_mi_sxl(andq_mi_sxl {1952rm64: RAX_RM,1953imm32,1954}) => andq_i_sxl::<R>::new(RAX, imm32).encode(sink),19551956// or1957Inst::orb_mi(orb_mi { rm8: RAX_RM, imm8 }) => orb_i::<R>::new(RAX, imm8).encode(sink),1958Inst::orw_mi(orw_mi {1959rm16: RAX_RM,1960imm16,1961}) => orw_i::<R>::new(RAX, imm16).encode(sink),1962Inst::orl_mi(orl_mi {1963rm32: RAX_RM,1964imm32,1965}) => orl_i::<R>::new(RAX, imm32).encode(sink),1966Inst::orq_mi_sxl(orq_mi_sxl {1967rm64: RAX_RM,1968imm32,1969}) => orq_i_sxl::<R>::new(RAX, imm32).encode(sink),19701971// xor1972Inst::xorb_mi(xorb_mi { rm8: RAX_RM, imm8 }) => xorb_i::<R>::new(RAX, imm8).encode(sink),1973Inst::xorw_mi(xorw_mi {1974rm16: RAX_RM,1975imm16,1976}) => xorw_i::<R>::new(RAX, imm16).encode(sink),1977Inst::xorl_mi(xorl_mi {1978rm32: RAX_RM,1979imm32,1980}) => xorl_i::<R>::new(RAX, imm32).encode(sink),1981Inst::xorq_mi_sxl(xorq_mi_sxl {1982rm64: RAX_RM,1983imm32,1984}) => xorq_i_sxl::<R>::new(RAX, imm32).encode(sink),19851986// add1987Inst::addb_mi(addb_mi { rm8: RAX_RM, imm8 }) => addb_i::<R>::new(RAX, imm8).encode(sink),1988Inst::addw_mi(addw_mi {1989rm16: RAX_RM,1990imm16,1991}) => addw_i::<R>::new(RAX, imm16).encode(sink),1992Inst::addl_mi(addl_mi {1993rm32: RAX_RM,1994imm32,1995}) => addl_i::<R>::new(RAX, imm32).encode(sink),1996Inst::addq_mi_sxl(addq_mi_sxl {1997rm64: RAX_RM,1998imm32,1999}) => addq_i_sxl::<R>::new(RAX, imm32).encode(sink),20002001// adc2002Inst::adcb_mi(adcb_mi { rm8: RAX_RM, imm8 }) => adcb_i::<R>::new(RAX, imm8).encode(sink),2003Inst::adcw_mi(adcw_mi {2004rm16: RAX_RM,2005imm16,2006}) => adcw_i::<R>::new(RAX, imm16).encode(sink),2007Inst::adcl_mi(adcl_mi {2008rm32: RAX_RM,2009imm32,2010}) => adcl_i::<R>::new(RAX, imm32).encode(sink),2011Inst::adcq_mi_sxl(adcq_mi_sxl {2012rm64: RAX_RM,2013imm32,2014}) => adcq_i_sxl::<R>::new(RAX, imm32).encode(sink),20152016// sub2017Inst::subb_mi(subb_mi { rm8: RAX_RM, imm8 }) => subb_i::<R>::new(RAX, imm8).encode(sink),2018Inst::subw_mi(subw_mi {2019rm16: RAX_RM,2020imm16,2021}) => subw_i::<R>::new(RAX, imm16).encode(sink),2022Inst::subl_mi(subl_mi {2023rm32: RAX_RM,2024imm32,2025}) => subl_i::<R>::new(RAX, imm32).encode(sink),2026Inst::subq_mi_sxl(subq_mi_sxl {2027rm64: RAX_RM,2028imm32,2029}) => subq_i_sxl::<R>::new(RAX, imm32).encode(sink),20302031// sbb2032Inst::sbbb_mi(sbbb_mi { rm8: RAX_RM, imm8 }) => sbbb_i::<R>::new(RAX, imm8).encode(sink),2033Inst::sbbw_mi(sbbw_mi {2034rm16: RAX_RM,2035imm16,2036}) => sbbw_i::<R>::new(RAX, imm16).encode(sink),2037Inst::sbbl_mi(sbbl_mi {2038rm32: RAX_RM,2039imm32,2040}) => sbbl_i::<R>::new(RAX, imm32).encode(sink),2041Inst::sbbq_mi_sxl(sbbq_mi_sxl {2042rm64: RAX_RM,2043imm32,2044}) => sbbq_i_sxl::<R>::new(RAX, imm32).encode(sink),20452046// cmp2047Inst::cmpb_mi(cmpb_mi {2048rm8: GprMem::Gpr(Gpr::RAX),2049imm8,2050}) => cmpb_i::<R>::new(Gpr::RAX, imm8).encode(sink),2051Inst::cmpw_mi(cmpw_mi {2052rm16: GprMem::Gpr(Gpr::RAX),2053imm16,2054}) => cmpw_i::<R>::new(Gpr::RAX, imm16).encode(sink),2055Inst::cmpl_mi(cmpl_mi {2056rm32: GprMem::Gpr(Gpr::RAX),2057imm32,2058}) => cmpl_i::<R>::new(Gpr::RAX, imm32).encode(sink),2059Inst::cmpq_mi(cmpq_mi {2060rm64: GprMem::Gpr(Gpr::RAX),2061imm32,2062}) => cmpq_i::<R>::new(Gpr::RAX, imm32).encode(sink),20632064// test2065Inst::testb_mi(testb_mi {2066rm8: GprMem::Gpr(Gpr::RAX),2067imm8,2068}) => testb_i::<R>::new(Gpr::RAX, imm8).encode(sink),2069Inst::testw_mi(testw_mi {2070rm16: GprMem::Gpr(Gpr::RAX),2071imm16,2072}) => testw_i::<R>::new(Gpr::RAX, imm16).encode(sink),2073Inst::testl_mi(testl_mi {2074rm32: GprMem::Gpr(Gpr::RAX),2075imm32,2076}) => testl_i::<R>::new(Gpr::RAX, imm32).encode(sink),2077Inst::testq_mi(testq_mi {2078rm64: GprMem::Gpr(Gpr::RAX),2079imm32,2080}) => testq_i::<R>::new(Gpr::RAX, imm32).encode(sink),20812082// lea2083Inst::leal_rm(leal_rm { r32, m32 }) => emit_lea(2084r32,2085m32,2086sink,2087|dst, amode, s| leal_rm::<R>::new(dst, amode).encode(s),2088|dst, simm32, s| addl_mi::<R>::new(dst, simm32.cast_unsigned()).encode(s),2089|dst, reg, s| addl_rm::<R>::new(dst, reg).encode(s),2090),2091Inst::leaq_rm(leaq_rm { r64, m64 }) => emit_lea(2092r64,2093m64,2094sink,2095|dst, amode, s| leaq_rm::<R>::new(dst, amode).encode(s),2096|dst, simm32, s| addq_mi_sxl::<R>::new(dst, simm32).encode(s),2097|dst, reg, s| addq_rm::<R>::new(dst, reg).encode(s),2098),20992100// All other instructions fall through to here and cannot be shrunk, so2101// return `false` to emit them as usual.2102_ => inst.encode(sink),2103}2104}21052106/// If `lea` can actually get encoded as an `add` then do that instead.2107/// Currently all candidate `iadd`s become an `lea` pseudo-instruction here but2108/// maximizing the use of `lea` is not necessarily optimal. The `lea`2109/// instruction goes through dedicated address units on cores which are finite2110/// and disjoint from the general ALU, so if everything uses `lea` then those2111/// units can get saturated while leaving the ALU idle.2112///2113/// To help make use of more parts of a CPU, this attempts to use `add` when2114/// it's semantically equivalent to `lea`, or otherwise when the `dst` register2115/// is the same as the `base` or `index` register.2116///2117/// FIXME: ideally regalloc is informed of this constraint. Register allocation2118/// of `lea` should "attempt" to put the `base` in the same register as `dst`2119/// but not at the expense of generating a `mov` instruction. Currently that's2120/// not possible but perhaps one day it may be worth it.2121fn emit_lea<S>(2122dst: asm::Gpr<WritableGpr>,2123addr: asm::Amode<Gpr>,2124sink: &mut S,2125lea: fn(WritableGpr, asm::Amode<Gpr>, &mut S),2126add_mi: fn(PairedGpr, i32, &mut S),2127add_rm: fn(PairedGpr, Gpr, &mut S),2128) where2129S: asm::CodeSink,2130{2131match addr {2132// If `base == dst` then this is `add dst, $imm`, so encode that2133// instead.2134asm::Amode::ImmReg {2135base,2136simm32:2137asm::AmodeOffsetPlusKnownOffset {2138simm32,2139offset: None,2140},2141trap: None,2142} if dst.as_ref().to_reg() == base => add_mi(2143PairedGpr {2144read: base,2145write: *dst.as_ref(),2146},2147simm32.value(),2148sink,2149),21502151// If the offset is 0 and the shift is a scale of 1, then:2152//2153// * If `base == dst`, then this is `addq dst, index`2154// * If `index == dst`, then this is `addq dst, base`2155asm::Amode::ImmRegRegShift {2156base,2157index,2158scale: asm::Scale::One,2159simm32: asm::AmodeOffset::ZERO,2160trap: None,2161} => {2162if dst.as_ref().to_reg() == base {2163add_rm(2164PairedGpr {2165read: base,2166write: *dst.as_ref(),2167},2168*index.as_ref(),2169sink,2170)2171} else if dst.as_ref().to_reg() == *index.as_ref() {2172add_rm(2173PairedGpr {2174read: *index.as_ref(),2175write: *dst.as_ref(),2176},2177base,2178sink,2179)2180} else {2181lea(*dst.as_ref(), addr, sink)2182}2183}21842185_ => lea(*dst.as_ref(), addr, sink),2186}2187}218821892190