Path: blob/main/cranelift/codegen/src/isa/x64/lower/isle.rs
//! ISLE integration glue code for x64 lowering.

// Pull in the ISLE generated code.
pub(crate) mod generated_code;
use crate::{ir::AtomicRmwOp, ir::types};
use generated_code::{AssemblerOutputs, Context, MInst, RegisterClass};

// Types that the generated ISLE code uses via `use super::*`.
use super::external::{CraneliftRegisters, PairedGpr, PairedXmm, isle_assembler_methods};
use super::{MergeableLoadSize, is_int_or_ref_ty, is_mergeable_load, lower_to_amode};
use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::immediates::*;
use crate::ir::types::*;
use crate::ir::{
    BlockCall, Inst, InstructionData, LibCall, MemFlags, Opcode, TrapCode, Value, ValueList,
};
use crate::isa::x64::X64Backend;
use crate::isa::x64::inst::{ReturnCallInfo, args::*, regs};
use crate::isa::x64::lower::{InsnInput, emit_vm_call};
use crate::machinst::isle::*;
use crate::machinst::{
    ArgPair, CallArgList, CallInfo, CallRetList, InstOutput, MachInst, VCodeConstant,
    VCodeConstantData,
};
use alloc::boxed::Box;
use alloc::vec::Vec;
use cranelift_assembler_x64 as asm;
use regalloc2::PReg;

/// Type representing out-of-line data for calls. This type is optional because
/// the call instruction is also used by Winch to emit calls, but the
/// `Box<CallInfo>` field is not used there; it's only used by Cranelift. By
/// making it optional, we reduce the number of heap allocations in Winch.
type BoxCallInfo = Box<CallInfo<ExternalName>>;
type BoxCallIndInfo = Box<CallInfo<RegMem>>;
type BoxReturnCallInfo = Box<ReturnCallInfo<ExternalName>>;
type BoxReturnCallIndInfo = Box<ReturnCallInfo<Reg>>;
type VecArgPair = Vec<ArgPair>;
type BoxSyntheticAmode = Box<SyntheticAmode>;

/// When interacting with the external assembler (see `external.rs`), we
/// need to fix the types we'll use.
type AssemblerInst = asm::Inst<CraneliftRegisters>;

pub struct SinkableLoad {
    inst: Inst,
    addr_input: InsnInput,
    offset: i32,
}

/// The main entry point for lowering with ISLE.
pub(crate) fn lower(
    lower_ctx: &mut Lower<MInst>,
    backend: &X64Backend,
    inst: Inst,
) -> Option<InstOutput> {
    // TODO: reuse the ISLE context across lowerings so we can reuse its
    // internal heap allocations.
    let mut isle_ctx = IsleContext { lower_ctx, backend };
    generated_code::constructor_lower(&mut isle_ctx, inst)
}

pub(crate) fn lower_branch(
    lower_ctx: &mut Lower<MInst>,
    backend: &X64Backend,
    branch: Inst,
    targets: &[MachLabel],
) -> Option<()> {
    // TODO: reuse the ISLE context across lowerings so we can reuse its
    // internal heap allocations.
    let mut isle_ctx = IsleContext { lower_ctx, backend };
    generated_code::constructor_lower_branch(&mut isle_ctx, branch, &targets)
}

impl Context for IsleContext<'_, '_, MInst, X64Backend> {
    isle_lower_prelude_methods!();
    isle_assembler_methods!();

    fn gen_call_info(
        &mut self,
        sig: Sig,
        dest: ExternalName,
        uses: CallArgList,
        defs: CallRetList,
        try_call_info: Option<TryCallInfo>,
        patchable: bool,
    ) -> BoxCallInfo {
        let stack_ret_space = self.lower_ctx.sigs()[sig].sized_stack_ret_space();
        let stack_arg_space = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);

        Box::new(
            self.lower_ctx
                .gen_call_info(sig, dest, uses, defs, try_call_info, patchable),
        )
    }

    fn gen_call_ind_info(
        &mut self,
        sig: Sig,
        dest: &RegMem,
        uses: CallArgList,
        defs: CallRetList,
        try_call_info: Option<TryCallInfo>,
    ) -> BoxCallIndInfo {
        let stack_ret_space = self.lower_ctx.sigs()[sig].sized_stack_ret_space();
        let stack_arg_space = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);

        Box::new(
            self.lower_ctx
                .gen_call_info(sig, dest.clone(), uses, defs, try_call_info, false),
        )
    }

    fn gen_return_call_info(
        &mut self,
        sig: Sig,
        dest: ExternalName,
        uses: CallArgList,
    ) -> BoxReturnCallInfo {
        let new_stack_arg_size = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_tail_args_size(new_stack_arg_size);

        Box::new(ReturnCallInfo {
            dest,
            uses,
            tmp: self.lower_ctx.temp_writable_gpr(),
            new_stack_arg_size,
        })
    }

    fn gen_return_call_ind_info(
        &mut self,
        sig: Sig,
        dest: Reg,
        uses: CallArgList,
    ) -> BoxReturnCallIndInfo {
        let new_stack_arg_size = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_tail_args_size(new_stack_arg_size);

        Box::new(ReturnCallInfo {
            dest,
            uses,
            tmp: self.lower_ctx.temp_writable_gpr(),
            new_stack_arg_size,
        })
    }

    #[inline]
    fn operand_size_of_type_32_64(&mut self, ty: Type) -> OperandSize {
        if ty.bits() == 64 {
            OperandSize::Size64
        } else {
            OperandSize::Size32
        }
    }

    #[inline]
    fn raw_operand_size_of_type(&mut self, ty: Type) -> OperandSize {
        OperandSize::from_ty(ty)
    }

    fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
        if let Some(imm) = self.i64_from_iconst(val) {
            if let Ok(imm) = i32::try_from(imm) {
                return RegMemImm::Imm {
                    simm32: imm.cast_unsigned(),
                };
            }
        }

        self.put_in_reg_mem(val).into()
    }

    fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm {
        if let Some(imm) = self.i64_from_iconst(val) {
            if let Ok(imm) = i32::try_from(imm) {
                return XmmMemImm::unwrap_new(RegMemImm::Imm {
                    simm32: imm.cast_unsigned(),
                });
            }
        }

        let res = match self.put_in_xmm_mem(val).to_reg_mem() {
            RegMem::Reg { reg } => RegMemImm::Reg { reg },
            RegMem::Mem { addr } => RegMemImm::Mem { addr },
        };

        XmmMemImm::unwrap_new(res)
    }

    fn put_in_xmm_mem(&mut self, val: Value) -> XmmMem {
        let inputs = self.lower_ctx.get_value_as_source_or_const(val);

        if let Some(c) = inputs.constant {
            // A load from the constant pool is better than a rematerialization into a register,
            // because it reduces register pressure.
            //
            // NOTE: this is where behavior differs from `put_in_reg_mem`, as we always force
            // constants to be 16 bytes when a constant will be used in place of an xmm register.
            let vcode_constant = self.emit_u128_le_const(c as u128);
            return XmmMem::unwrap_new(RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant)));
        }

        XmmMem::unwrap_new(self.put_in_reg_mem(val))
    }

    fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
        let inputs = self.lower_ctx.get_value_as_source_or_const(val);

        if let Some(c) = inputs.constant {
            // A load from the constant pool is better than a
            // rematerialization into a register, because it reduces
            // register pressure.
            let vcode_constant = self.emit_u64_le_const(c);
            return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant));
        }

        if let Some(load) = self.sinkable_load(val) {
            return RegMem::Mem {
                addr: self.sink_load(&load),
            };
        }

        RegMem::reg(self.put_in_reg(val))
    }

    #[inline]
    fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
        imm.encode()
    }

    #[inline]
    fn encode_round_imm(&mut self, imm: &RoundImm) -> u8 {
        imm.encode()
    }

    #[inline]
    fn has_avx(&mut self) -> bool {
        self.backend.x64_flags.has_avx()
    }

    #[inline]
    fn use_avx2(&mut self) -> bool {
        self.backend.x64_flags.has_avx() && self.backend.x64_flags.has_avx2()
    }

    #[inline]
    fn has_avx512vl(&mut self) -> bool {
        self.backend.x64_flags.has_avx512vl()
    }

    #[inline]
    fn has_avx512dq(&mut self) -> bool {
        self.backend.x64_flags.has_avx512dq()
    }

    #[inline]
    fn has_avx512f(&mut self) -> bool {
        self.backend.x64_flags.has_avx512f()
    }

    #[inline]
    fn has_avx512bitalg(&mut self) -> bool {
        self.backend.x64_flags.has_avx512bitalg()
    }

    #[inline]
    fn has_avx512vbmi(&mut self) -> bool {
        self.backend.x64_flags.has_avx512vbmi()
    }

    #[inline]
    fn has_lzcnt(&mut self) -> bool {
        self.backend.x64_flags.has_lzcnt()
    }

    #[inline]
    fn has_bmi1(&mut self) -> bool {
        self.backend.x64_flags.has_bmi1()
    }

    #[inline]
    fn has_bmi2(&mut self) -> bool {
        self.backend.x64_flags.has_bmi2()
    }

    #[inline]
    fn use_popcnt(&mut self) -> bool {
        self.backend.x64_flags.has_popcnt() && self.backend.x64_flags.has_sse42()
    }

    #[inline]
    fn use_fma(&mut self) -> bool {
        self.backend.x64_flags.has_avx() && self.backend.x64_flags.has_fma()
    }

    #[inline]
    fn has_sse3(&mut self) -> bool {
        self.backend.x64_flags.has_sse3()
    }

    #[inline]
    fn has_ssse3(&mut self) -> bool {
        self.backend.x64_flags.has_ssse3()
    }

    #[inline]
    fn has_sse41(&mut self) -> bool {
        self.backend.x64_flags.has_sse41()
    }

    #[inline]
    fn use_sse42(&mut self) -> bool {
        self.backend.x64_flags.has_sse41() && self.backend.x64_flags.has_sse42()
    }

    #[inline]
    fn has_cmpxchg16b(&mut self) -> bool {
        self.backend.x64_flags.has_cmpxchg16b()
    }

    #[inline]
    fn shift_mask(&mut self, ty: Type) -> u8 {
        debug_assert!(ty.lane_bits().is_power_of_two());

        (ty.lane_bits() - 1) as u8
    }

    fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u8 {
        (val.bits() as u8) & self.shift_mask(ty)
    }

    #[inline]
    fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
        let imm = self.i64_from_iconst(val)?;
        Some(GprMemImm::unwrap_new(RegMemImm::Imm {
            simm32: i32::try_from(imm).ok()?.cast_unsigned(),
        }))
    }

    fn sinkable_load(&mut self, val: Value) -> Option<SinkableLoad> {
        if let Some(inst) = self.is_sinkable_inst(val) {
            if let Some((addr_input, offset)) =
                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Min32)
            {
                return Some(SinkableLoad {
                    inst,
                    addr_input,
                    offset,
                });
            }
        }
        None
    }

    fn sinkable_load_exact(&mut self, val: Value) -> Option<SinkableLoad> {
        if let Some(inst) = self.is_sinkable_inst(val) {
            if let Some((addr_input, offset)) =
                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Exact)
            {
                return Some(SinkableLoad {
                    inst,
                    addr_input,
                    offset,
                });
            }
        }
        None
    }

    fn sink_load(&mut self, load: &SinkableLoad) -> SyntheticAmode {
        self.lower_ctx.sink_inst(load.inst);
        let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset);
        SyntheticAmode::Real(addr)
    }

    #[inline]
    fn ext_mode(&mut self, from_bits: u16, to_bits: u16) -> ExtMode {
        ExtMode::new(from_bits, to_bits).unwrap()
    }

    fn emit(&mut self, inst: &MInst) -> Unit {
        self.lower_ctx.emit(inst.clone());
    }

    #[inline]
    fn sse_insertps_lane_imm(&mut self, lane: u8) -> u8 {
        // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
        // shifted into bits 5:6).
        0b00_00_00_00 | lane << 4
    }

    #[inline]
    fn synthetic_amode_to_reg_mem(&mut self, addr: &SyntheticAmode) -> RegMem {
        RegMem::mem(addr.clone())
    }

    #[inline]
    fn amode_to_synthetic_amode(&mut self, amode: &Amode) -> SyntheticAmode {
        amode.clone().into()
    }

    #[inline]
    fn synthetic_amode_slot(&mut self, offset: i32) -> SyntheticAmode {
        SyntheticAmode::SlotOffset { simm32: offset }
    }

    #[inline]
    fn const_to_synthetic_amode(&mut self, c: VCodeConstant) -> SyntheticAmode {
        SyntheticAmode::ConstantOffset(c)
    }

    #[inline]
    fn writable_gpr_to_reg(&mut self, r: WritableGpr) -> WritableReg {
        r.to_writable_reg()
    }

    #[inline]
    fn writable_xmm_to_reg(&mut self, r: WritableXmm) -> WritableReg {
        r.to_writable_reg()
    }

    fn ishl_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
        // When the shift amount is known, we can statically (i.e. at compile
        // time) determine the mask to use and only emit that.
        debug_assert!(amt < 8);
        let mask_offset = amt as usize * 16;
        let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
            &I8X16_ISHL_MASKS[mask_offset..mask_offset + 16],
        ));
        SyntheticAmode::ConstantOffset(mask_constant)
    }

    fn ishl_i8x16_mask_table(&mut self) -> SyntheticAmode {
        let mask_table = self
            .lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&I8X16_ISHL_MASKS));
        SyntheticAmode::ConstantOffset(mask_table)
    }

    fn ushr_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
        // When the shift amount is known, we can statically (i.e. at compile
        // time) determine the mask to use and only emit that.
        debug_assert!(amt < 8);
        let mask_offset = amt as usize * 16;
        let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
            &I8X16_USHR_MASKS[mask_offset..mask_offset + 16],
        ));
        SyntheticAmode::ConstantOffset(mask_constant)
    }

    fn ushr_i8x16_mask_table(&mut self) -> SyntheticAmode {
        let mask_table = self
            .lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&I8X16_USHR_MASKS));
        SyntheticAmode::ConstantOffset(mask_table)
    }

    #[inline]
    fn writable_reg_to_xmm(&mut self, r: WritableReg) -> WritableXmm {
        Writable::from_reg(Xmm::unwrap_new(r.to_reg()))
    }

    #[inline]
    fn writable_xmm_to_xmm(&mut self, r: WritableXmm) -> Xmm {
        r.to_reg()
    }

    #[inline]
    fn writable_gpr_to_gpr(&mut self, r: WritableGpr) -> Gpr {
        r.to_reg()
    }

    #[inline]
    fn gpr_to_reg(&mut self, r: Gpr) -> Reg {
        r.into()
    }

    #[inline]
    fn xmm_to_reg(&mut self, r: Xmm) -> Reg {
        r.into()
    }

    #[inline]
    fn xmm_to_xmm_mem_imm(&mut self, r: Xmm) -> XmmMemImm {
        r.into()
    }

    #[inline]
    fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm {
        XmmMemImm::unwrap_new(r.clone().to_reg_mem().into())
    }

    #[inline]
    fn temp_writable_gpr(&mut self) -> WritableGpr {
        self.lower_ctx.temp_writable_gpr()
    }

    #[inline]
    fn temp_writable_xmm(&mut self) -> WritableXmm {
        self.lower_ctx.temp_writable_xmm()
    }

    #[inline]
    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
        RegMemImm::Reg { reg }
    }

    #[inline]
    fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
        XmmMem::unwrap_new(rm.clone())
    }

    #[inline]
    fn gpr_mem_imm_new(&mut self, rmi: &RegMemImm) -> GprMemImm {
        GprMemImm::unwrap_new(rmi.clone())
    }

    #[inline]
    fn xmm_mem_imm_new(&mut self, rmi: &RegMemImm) -> XmmMemImm {
        XmmMemImm::unwrap_new(rmi.clone())
    }

    #[inline]
    fn xmm_to_xmm_mem(&mut self, r: Xmm) -> XmmMem {
        r.into()
    }

    #[inline]
    fn xmm_mem_to_reg_mem(&mut self, xm: &XmmMem) -> RegMem {
        xm.clone().into()
    }

    #[inline]
    fn gpr_mem_to_reg_mem(&mut self, gm: &GprMem) -> RegMem {
        gm.clone().into()
    }

    #[inline]
    fn xmm_new(&mut self, r: Reg) -> Xmm {
        Xmm::unwrap_new(r)
    }

    #[inline]
    fn gpr_new(&mut self, r: Reg) -> Gpr {
        Gpr::unwrap_new(r)
    }

    #[inline]
    fn reg_mem_to_gpr_mem(&mut self, rm: &RegMem) -> GprMem {
        GprMem::unwrap_new(rm.clone())
    }

    #[inline]
    fn reg_to_gpr_mem(&mut self, r: Reg) -> GprMem {
        GprMem::unwrap_new(RegMem::reg(r))
    }

    #[inline]
    fn gpr_to_gpr_mem(&mut self, gpr: Gpr) -> GprMem {
        GprMem::from(gpr)
    }

    #[inline]
    fn gpr_to_gpr_mem_imm(&mut self, gpr: Gpr) -> GprMemImm {
        GprMemImm::from(gpr)
    }

    #[inline]
    fn type_register_class(&mut self, ty: Type) -> Option<RegisterClass> {
        if is_int_or_ref_ty(ty) || ty == I128 {
            Some(RegisterClass::Gpr {
                single_register: ty != I128,
            })
        } else if ty.is_float() || (ty.is_vector() && ty.bits() <= 128) {
            Some(RegisterClass::Xmm)
        } else {
            None
        }
    }

    #[inline]
    fn ty_int_bool_or_ref(&mut self, ty: Type) -> Option<()> {
        match ty {
            types::I8 | types::I16 | types::I32 | types::I64 => Some(()),
            _ => None,
        }
    }

    #[inline]
    fn intcc_to_cc(&mut self, intcc: &IntCC) -> CC {
        CC::from_intcc(*intcc)
    }

    #[inline]
    fn cc_invert(&mut self, cc: &CC) -> CC {
        cc.invert()
    }

    #[inline]
    fn cc_nz_or_z(&mut self, cc: &CC) -> Option<CC> {
        match cc {
            CC::Z => Some(*cc),
            CC::NZ => Some(*cc),
            _ => None,
        }
    }

    #[inline]
    fn sum_extend_fits_in_32_bits(
        &mut self,
        extend_from_ty: Type,
        constant_value: Imm64,
        offset: Offset32,
    ) -> Option<u32> {
        let offset: i64 = offset.into();
        let constant_value: u64 = constant_value.bits() as u64;
        // If necessary, zero extend `constant_value` up to 64 bits.
        let shift = 64 - extend_from_ty.bits();
        let zero_extended_constant_value = (constant_value << shift) >> shift;
        // Sum up the two operands.
        let sum = offset.wrapping_add(zero_extended_constant_value as i64);
        // Check that the sum will fit in 32-bits.
        if sum == ((sum << 32) >> 32) {
            Some(sum as u32)
        } else {
            None
        }
    }

    #[inline]
    fn amode_offset(&mut self, addr: &SyntheticAmode, offset: i32) -> SyntheticAmode {
        addr.offset(offset)
    }

    #[inline]
    fn zero_offset(&mut self) -> Offset32 {
        Offset32::new(0)
    }

    #[inline]
    fn preg_rbp(&mut self) -> PReg {
        regs::rbp().to_real_reg().unwrap().into()
    }

    #[inline]
    fn preg_rsp(&mut self) -> PReg {
        regs::rsp().to_real_reg().unwrap().into()
    }

    #[inline]
    fn preg_pinned(&mut self) -> PReg {
        regs::pinned_reg().to_real_reg().unwrap().into()
    }

    fn libcall_1(&mut self, libcall: &LibCall, a: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a), ValueRegs::one(b)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a), ValueRegs::one(b), ValueRegs::one(c)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    #[inline]
    fn vconst_all_ones_or_all_zeros(&mut self, constant: Constant) -> Option<()> {
        let const_data = self.lower_ctx.get_constant_data(constant);
        if const_data.iter().all(|&b| b == 0 || b == 0xFF) {
            return Some(());
        }
        None
    }

    #[inline]
    fn shuffle_0_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
            .map(|b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn shuffle_0_15_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn shuffle_16_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| b.wrapping_sub(16))
            .map(|b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn perm_from_mask_with_zeros(
        &mut self,
        mask: &VecMask,
    ) -> Option<(VCodeConstant, VCodeConstant)> {
        if !mask.iter().any(|&b| b > 31) {
            return None;
        }

        let zeros = mask
            .iter()
            .map(|&b| if b > 31 { 0x00 } else { 0xff })
            .collect();

        Some((
            self.perm_from_mask(mask),
            self.lower_ctx
                .use_constant(VCodeConstantData::Generated(zeros)),
        ))
    }

    #[inline]
    fn perm_from_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask.iter().cloned().collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    fn xmm_mem_to_xmm_mem_aligned(&mut self, arg: &XmmMem) -> XmmMemAligned {
        match XmmMemAligned::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMem::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }

    fn xmm_mem_imm_to_xmm_mem_aligned_imm(&mut self, arg: &XmmMemImm) -> XmmMemAlignedImm {
        match XmmMemAlignedImm::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMemImm::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }

    fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        // When selecting from the right-hand-side, subtract these all by 4
        // which will bail out if anything is less than 4.
        // Afterwards the check is the same as `pshufd_lhs_imm` above.
        let a = a.checked_sub(4)?;
        let b = b.checked_sub(4)?;
        let c = c.checked_sub(4)?;
        let d = d.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn shufps_imm(&mut self, imm: Immediate) -> Option<u8> {
        // The `shufps` instruction selects the first two elements from the
        // first vector and the second two elements from the second vector, so
        // offset the third/fourth selectors by 4 and then make sure everything
        // fits in 32-bits.
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        let c = c.checked_sub(4)?;
        let d = d.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn shufps_rev_imm(&mut self, imm: Immediate) -> Option<u8> {
        // This is almost the same as `shufps_imm` except the elements that are
        // subtracted are reversed. This handles the case where the `shufps`
        // instruction can be emitted if the order of the operands is swapped.
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        let a = a.checked_sub(4)?;
        let b = b.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Similar to `shufps` except this operates over 16-bit values so four
        // of them must be fixed and the other four must be in-range to encode
        // in the immediate.
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let a = a.checked_sub(8)?;
        let b = b.checked_sub(8)?;
        let c = c.checked_sub(8)?;
        let d = d.checked_sub(8)?;
        let e = e.checked_sub(8)?;
        let f = f.checked_sub(8)?;
        let g = g.checked_sub(8)?;
        let h = h.checked_sub(8)?;
        if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Similar to `pshuflw` except that the first four operands must be
        // fixed and the second four are offset by an extra 4 and tested to
        // make sure they're all in the range [4, 8).
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let e = e.checked_sub(4)?;
        let f = f.checked_sub(4)?;
        let g = g.checked_sub(4)?;
        let h = h.checked_sub(4)?;
        if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] {
            Some(e | (f << 2) | (g << 4) | (h << 6))
        } else {
            None
        }
    }

    fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Note that everything here is offset by at least 8 and the upper
        // bits are offset by 12 to test they're in the range of [12, 16).
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let a = a.checked_sub(8)?;
        let b = b.checked_sub(8)?;
        let c = c.checked_sub(8)?;
        let d = d.checked_sub(8)?;
        let e = e.checked_sub(12)?;
        let f = f.checked_sub(12)?;
        let g = g.checked_sub(12)?;
        let h = h.checked_sub(12)?;
        if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] {
            Some(e | (f << 2) | (g << 4) | (h << 6))
        } else {
            None
        }
    }

    fn palignr_imm_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
        let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();

        if bytes.windows(2).all(|a| a[0] + 1 == a[1]) {
            Some(bytes[0])
        } else {
            None
        }
    }

    fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
        // First make sure that the shuffle immediate is selecting 16-bit lanes.
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;

        // Next build up an 8-bit mask from each of the bits of the selected
        // lanes above. This instruction can only be used when each lane
        // selector chooses from the corresponding lane in either of the two
        // operands, meaning the Nth lane selection must satisfy `lane % 8 ==
        // N`.
        //
        // This helper closure is used to calculate the value of the
        // corresponding bit.
        let bit = |x: u8, c: u8| {
            if x % 8 == c {
                if x < 8 { Some(0) } else { Some(1 << c) }
            } else {
                None
            }
        };
        Some(
            bit(a, 0)?
                | bit(b, 1)?
                | bit(c, 2)?
                | bit(d, 3)?
                | bit(e, 4)?
                | bit(f, 5)?
                | bit(g, 6)?
                | bit(h, 7)?,
        )
    }

    fn xmi_imm(&mut self, imm: u32) -> XmmMemImm {
        XmmMemImm::unwrap_new(RegMemImm::imm(imm))
    }

    fn insert_i8x16_lane_hole(&mut self, hole_idx: u8) -> VCodeConstant {
        let mask = -1i128 as u128;
        self.emit_u128_le_const(mask ^ (0xff << (hole_idx * 8)))
    }

    fn writable_invalid_gpr(&mut self) -> WritableGpr {
        let reg = Gpr::new(self.invalid_reg()).unwrap();
        WritableGpr::from_reg(reg)
    }

    fn box_synthetic_amode(&mut self, amode: &SyntheticAmode) -> BoxSyntheticAmode {
        Box::new(amode.clone())
    }

    ////////////////////////////////////////////////////////////////////////////
    ///// External assembler methods.
    ////////////////////////////////////////////////////////////////////////////

    fn is_imm8(&mut self, src: &GprMemImm) -> Option<u8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i8::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_imm8_xmm(&mut self, src: &XmmMemImm) -> Option<u8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i8::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_simm8(&mut self, src: &GprMemImm) -> Option<i8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(i8::try_from(simm32.cast_signed()).ok()?),
            _ => None,
        }
    }

    fn is_imm16(&mut self, src: &GprMemImm) -> Option<u16> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i16::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_simm16(&mut self, src: &GprMemImm) -> Option<i16> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(i16::try_from(simm32.cast_signed()).ok()?),
            _ => None,
        }
    }

    fn is_imm32(&mut self, src: &GprMemImm) -> Option<u32> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(simm32),
            _ => None,
        }
    }

    fn is_simm32(&mut self, src: &GprMemImm) -> Option<i32> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(simm32 as i32),
            _ => None,
        }
    }

    fn is_gpr(&mut self, src: &GprMemImm) -> Option<Gpr> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => Gpr::new(reg),
            _ => None,
        }
    }

    fn is_xmm(&mut self, src: &XmmMem) -> Option<Xmm> {
        match src.clone().to_reg_mem() {
            RegMem::Reg { reg } => Xmm::new(reg),
            _ => None,
        }
    }

    fn is_gpr_mem(&mut self, src: &GprMemImm) -> Option<GprMem> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => GprMem::new(RegMem::Reg { reg }),
            RegMemImm::Mem { addr } => GprMem::new(RegMem::Mem { addr }),
            _ => None,
        }
    }

    fn is_xmm_mem(&mut self, src: &XmmMemImm) -> Option<XmmMem> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => XmmMem::new(RegMem::Reg { reg }),
            RegMemImm::Mem { addr } => XmmMem::new(RegMem::Mem { addr }),
            _ => None,
        }
    }

    fn is_mem(&mut self, src: &XmmMem) -> Option<SyntheticAmode> {
        match src.clone().to_reg_mem() {
            RegMem::Reg { .. } => None,
            RegMem::Mem { addr } => Some(addr),
        }
    }

    // Custom constructors for `mulx` which only calculate the high half of the
    // result, meaning that the same output operand is used in both destination
    // registers. This is in contrast to the assembler-generated version of this
    // instruction, which generates two distinct temporary registers for output
    // and calculates both the high and low halves of the result.

    fn x64_mulxl_rvm_hi(&mut self, src1: &GprMem, src2: Gpr) -> Gpr {
        let ret = self.temp_writable_gpr();
        let src1 = self.convert_gpr_mem_to_assembler_read_gpr_mem(src1);
        let inst = asm::inst::mulxl_rvm::new(ret, ret, src1, src2);
        self.emit(&MInst::External { inst: inst.into() });
        ret.to_reg()
    }

    fn x64_mulxq_rvm_hi(&mut self, src1: &GprMem, src2: Gpr) -> Gpr {
        let ret = self.temp_writable_gpr();
        let src1 = self.convert_gpr_mem_to_assembler_read_gpr_mem(src1);
        let inst = asm::inst::mulxq_rvm::new(ret, ret, src1, src2);
        self.emit(&MInst::External { inst: inst.into() });
        ret.to_reg()
    }

    fn bt_imm(&mut self, val: u64) -> Option<u8> {
        if val.count_ones() == 1 {
            Some(u8::try_from(val.trailing_zeros()).unwrap())
        } else {
            None
        }
    }
}

impl IsleContext<'_, '_, MInst, X64Backend> {
    fn load_xmm_unaligned(&mut self, addr: SyntheticAmode) -> Xmm {
        let tmp = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
        self.lower_ctx.emit(MInst::External {
            inst: asm::inst::movdqu_a::new(
                Writable::from_reg(Xmm::unwrap_new(tmp.to_reg())),
                asm::XmmMem::Mem(addr.into()),
            )
            .into(),
        });
        Xmm::unwrap_new(tmp.to_reg())
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_to_assembler_read_write_gpr(&mut self, read: Gpr) -> asm::Gpr<PairedGpr> {
        let write = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let write = WritableGpr::from_writable_reg(write).unwrap();
        asm::Gpr::new(PairedGpr { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_to_assembler_fixed_read_write_gpr<const E: u8>(
        &mut self,
        read: Gpr,
    ) -> asm::Fixed<PairedGpr, E> {
        let write = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let write = WritableGpr::from_writable_reg(write).unwrap();
        asm::Fixed(PairedGpr { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_to_assembler_read_write_xmm(&mut self, read: Xmm) -> asm::Xmm<PairedXmm> {
        let write = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
        let write = WritableXmm::from_writable_reg(write).unwrap();
        asm::Xmm::new(PairedXmm { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_read_gpr_mem(&self, read: &GprMem) -> asm::GprMem<Gpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_read_xmm_mem_aligned(
        &self,
        read: &XmmMemAligned,
    ) -> asm::XmmMem<Xmm, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Xmm::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_read_xmm_mem(&self, read: &XmmMem) -> asm::XmmMem<Xmm, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Xmm::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_write_xmm_mem(
        &self,
        write: &XmmMem,
    ) -> asm::XmmMem<Writable<Xmm>, Gpr> {
        match write.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Writable::from_reg(Xmm::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_write_xmm_mem_aligned(
        &self,
        write: &XmmMemAligned,
    ) -> asm::XmmMem<Writable<Xmm>, Gpr> {
        match write.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Writable::from_reg(Xmm::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_read_write_gpr_mem(
        &mut self,
        read: &GprMem,
    ) -> asm::GprMem<PairedGpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(
                *self
                    .convert_gpr_to_assembler_read_write_gpr(Gpr::new(reg).unwrap())
                    .as_ref(),
            ),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_write_gpr_mem(
        &mut self,
        read: &GprMem,
    ) -> asm::GprMem<WritableGpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(WritableGpr::from_reg(Gpr::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_amode_to_assembler_amode(&mut self, amode: &SyntheticAmode) -> asm::Amode<Gpr> {
        amode.clone().into()
    }
}

// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
// need to fix up the bits that migrate from one half of the lane to the
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
// right by 0 (no movement), we want to retain all the bits so we mask with
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
// we mask with `0x7f`; etc.

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_ISHL_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_USHR_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
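
// A minimal illustrative sketch, not part of the original file: it reproduces
// the 8x16-shift-via-16x8-shift-plus-mask trick described in the comment above
// in plain scalar code, using only `std`/`alloc` and the `I8X16_ISHL_MASKS`
// table defined in this module. The module and test names are hypothetical.
#[cfg(test)]
mod i8x16_shift_mask_sketch {
    use alloc::vec::Vec;

    #[test]
    fn ishl_via_16x8_shift_and_mask() {
        let lanes: [u8; 16] = [
            0x01, 0x80, 0xff, 0x7f, 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x0f, 0xaa,
            0x55, 0x00,
        ];
        for amt in 0..8usize {
            // Reference result: shift every byte lane independently.
            let expected: Vec<u8> = lanes.iter().map(|&b| b << amt).collect();

            // Emulated result: shift 16-bit lanes, then AND with the mask row
            // for `amt` to clear the bits that leaked across byte boundaries.
            let mask = &super::I8X16_ISHL_MASKS[amt * 16..amt * 16 + 16];
            let mut actual = Vec::new();
            for pair in lanes.chunks_exact(2) {
                let wide = u16::from_le_bytes([pair[0], pair[1]]) << amt;
                actual.extend_from_slice(&wide.to_le_bytes());
            }
            for (byte, m) in actual.iter_mut().zip(mask) {
                *byte &= *m;
            }
            assert_eq!(expected, actual, "shift amount {amt}");
        }
    }
}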
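
// Another illustrative sketch, not part of the original file: the `pshufd_*`,
// `shufps_*`, and `pshuflw`/`pshufhw` helpers above all pack four 2-bit lane
// selectors into one immediate byte as `a | (b << 2) | (c << 4) | (d << 6)`.
// This scalar model of `pshufd` (hypothetical names, standard library only)
// shows how such an immediate drives lane selection.
#[cfg(test)]
mod pshufd_imm_sketch {
    /// Scalar model of `pshufd`: lane `i` of the result is lane
    /// `(imm >> (2 * i)) & 3` of the source.
    fn pshufd_model(src: [u32; 4], imm: u8) -> [u32; 4] {
        core::array::from_fn(|i| src[usize::from((imm >> (2 * i)) & 3)])
    }

    #[test]
    fn imm_encodes_lane_selectors() {
        let src = [10, 20, 30, 40];
        let (a, b, c, d) = (3u8, 1u8, 0u8, 2u8);
        // Same packing as `pshufd_lhs_imm` above.
        let imm = a | (b << 2) | (c << 4) | (d << 6);
        assert_eq!(pshufd_model(src, imm), [src[3], src[1], src[0], src[2]]);
    }
}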