Path: blob/main/cranelift/codegen/src/isa/x64/lower/isle.rs
//! ISLE integration glue code for x64 lowering.

// Pull in the ISLE generated code.
pub(crate) mod generated_code;
use crate::{ir::AtomicRmwOp, ir::types};
use generated_code::{AssemblerOutputs, Context, MInst, RegisterClass};

// Types that the generated ISLE code uses via `use super::*`.
use super::external::{CraneliftRegisters, PairedGpr, PairedXmm, isle_assembler_methods};
use super::{MergeableLoadSize, is_int_or_ref_ty, is_mergeable_load, lower_to_amode};
use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::immediates::*;
use crate::ir::types::*;
use crate::ir::{
    BlockCall, Inst, InstructionData, LibCall, MemFlags, Opcode, TrapCode, Value, ValueList,
};
use crate::isa::x64::X64Backend;
use crate::isa::x64::inst::{ReturnCallInfo, args::*, regs};
use crate::isa::x64::lower::{InsnInput, emit_vm_call};
use crate::machinst::isle::*;
use crate::machinst::{
    ArgPair, CallArgList, CallInfo, CallRetList, InstOutput, MachInst, VCodeConstant,
    VCodeConstantData,
};
use alloc::vec::Vec;
use cranelift_assembler_x64 as asm;
use regalloc2::PReg;
use std::boxed::Box;

/// Type representing out-of-line data for calls. This type is optional because
/// the call instruction is also used by Winch to emit calls, but the
/// `Box<CallInfo>` field is not used there; it's only used by Cranelift. By
/// making it optional, we reduce the number of heap allocations in Winch.
type BoxCallInfo = Box<CallInfo<ExternalName>>;
type BoxCallIndInfo = Box<CallInfo<RegMem>>;
type BoxReturnCallInfo = Box<ReturnCallInfo<ExternalName>>;
type BoxReturnCallIndInfo = Box<ReturnCallInfo<Reg>>;
type VecArgPair = Vec<ArgPair>;
type BoxSyntheticAmode = Box<SyntheticAmode>;

/// When interacting with the external assembler (see `external.rs`), we
/// need to fix the types we'll use.
type AssemblerInst = asm::Inst<CraneliftRegisters>;

pub struct SinkableLoad {
    inst: Inst,
    addr_input: InsnInput,
    offset: i32,
}

/// The main entry point for lowering with ISLE.
pub(crate) fn lower(
    lower_ctx: &mut Lower<MInst>,
    backend: &X64Backend,
    inst: Inst,
) -> Option<InstOutput> {
    // TODO: reuse the ISLE context across lowerings so we can reuse its
    // internal heap allocations.
    let mut isle_ctx = IsleContext { lower_ctx, backend };
    generated_code::constructor_lower(&mut isle_ctx, inst)
}

pub(crate) fn lower_branch(
    lower_ctx: &mut Lower<MInst>,
    backend: &X64Backend,
    branch: Inst,
    targets: &[MachLabel],
) -> Option<()> {
    // TODO: reuse the ISLE context across lowerings so we can reuse its
    // internal heap allocations.
    let mut isle_ctx = IsleContext { lower_ctx, backend };
    generated_code::constructor_lower_branch(&mut isle_ctx, branch, &targets)
}

impl Context for IsleContext<'_, '_, MInst, X64Backend> {
    isle_lower_prelude_methods!();
    isle_assembler_methods!();

    fn gen_call_info(
        &mut self,
        sig: Sig,
        dest: ExternalName,
        uses: CallArgList,
        defs: CallRetList,
        try_call_info: Option<TryCallInfo>,
    ) -> BoxCallInfo {
        let stack_ret_space = self.lower_ctx.sigs()[sig].sized_stack_ret_space();
        let stack_arg_space = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);

        Box::new(
            self.lower_ctx
                .gen_call_info(sig, dest, uses, defs, try_call_info),
        )
    }

    fn gen_call_ind_info(
        &mut self,
        sig: Sig,
        dest: &RegMem,
        uses: CallArgList,
        defs: CallRetList,
        try_call_info: Option<TryCallInfo>,
    ) -> BoxCallIndInfo {
        let stack_ret_space = self.lower_ctx.sigs()[sig].sized_stack_ret_space();
        let stack_arg_space = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space);

        Box::new(
            self.lower_ctx
                .gen_call_info(sig, dest.clone(), uses, defs, try_call_info),
        )
    }

    fn gen_return_call_info(
        &mut self,
        sig: Sig,
        dest: ExternalName,
        uses: CallArgList,
    ) -> BoxReturnCallInfo {
        let new_stack_arg_size = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_tail_args_size(new_stack_arg_size);

        Box::new(ReturnCallInfo {
            dest,
            uses,
            tmp: self.lower_ctx.temp_writable_gpr(),
            new_stack_arg_size,
        })
    }

    fn gen_return_call_ind_info(
        &mut self,
        sig: Sig,
        dest: Reg,
        uses: CallArgList,
    ) -> BoxReturnCallIndInfo {
        let new_stack_arg_size = self.lower_ctx.sigs()[sig].sized_stack_arg_space();
        self.lower_ctx
            .abi_mut()
            .accumulate_tail_args_size(new_stack_arg_size);

        Box::new(ReturnCallInfo {
            dest,
            uses,
            tmp: self.lower_ctx.temp_writable_gpr(),
            new_stack_arg_size,
        })
    }

    #[inline]
    fn operand_size_of_type_32_64(&mut self, ty: Type) -> OperandSize {
        if ty.bits() == 64 {
            OperandSize::Size64
        } else {
            OperandSize::Size32
        }
    }

    #[inline]
    fn raw_operand_size_of_type(&mut self, ty: Type) -> OperandSize {
        OperandSize::from_ty(ty)
    }

    fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
        if let Some(imm) = self.i64_from_iconst(val) {
            if let Ok(imm) = i32::try_from(imm) {
                return RegMemImm::Imm {
                    simm32: imm.cast_unsigned(),
                };
            }
        }

        self.put_in_reg_mem(val).into()
    }

    fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm {
        if let Some(imm) = self.i64_from_iconst(val) {
            if let Ok(imm) = i32::try_from(imm) {
                return XmmMemImm::unwrap_new(RegMemImm::Imm {
                    simm32: imm.cast_unsigned(),
                });
            }
        }

        let res = match self.put_in_xmm_mem(val).to_reg_mem() {
            RegMem::Reg { reg } => RegMemImm::Reg { reg },
            RegMem::Mem { addr } => RegMemImm::Mem { addr },
        };

        XmmMemImm::unwrap_new(res)
    }

    fn put_in_xmm_mem(&mut self, val: Value) -> XmmMem {
        let inputs = self.lower_ctx.get_value_as_source_or_const(val);

        if let Some(c) = inputs.constant {
            // A load from the constant pool is better than a rematerialization into a register,
            // because it reduces register pressure.
            //
            // NOTE: this is where behavior differs from `put_in_reg_mem`, as we always force
            // constants to be 16 bytes when a constant will be used in place of an xmm register.
            let vcode_constant = self.emit_u128_le_const(c as u128);
            return XmmMem::unwrap_new(RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant)));
        }

        XmmMem::unwrap_new(self.put_in_reg_mem(val))
    }

    fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
        let inputs = self.lower_ctx.get_value_as_source_or_const(val);

        if let Some(c) = inputs.constant {
            // A load from the constant pool is better than a
            // rematerialization into a register, because it reduces
            // register pressure.
            let vcode_constant = self.emit_u64_le_const(c);
            return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant));
        }

        if let Some(load) = self.sinkable_load(val) {
            return RegMem::Mem {
                addr: self.sink_load(&load),
            };
        }

        RegMem::reg(self.put_in_reg(val))
    }

    #[inline]
    fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
        imm.encode()
    }

    #[inline]
    fn encode_round_imm(&mut self, imm: &RoundImm) -> u8 {
        imm.encode()
    }

    #[inline]
    fn use_avx(&mut self) -> bool {
        self.backend.x64_flags.use_avx()
    }

    #[inline]
    fn use_avx2(&mut self) -> bool {
        self.backend.x64_flags.use_avx2()
    }

    #[inline]
    fn use_avx512vl(&mut self) -> bool {
        self.backend.x64_flags.use_avx512vl()
    }

    #[inline]
    fn use_avx512dq(&mut self) -> bool {
        self.backend.x64_flags.use_avx512dq()
    }

    #[inline]
    fn use_avx512f(&mut self) -> bool {
        self.backend.x64_flags.use_avx512f()
    }

    #[inline]
    fn use_avx512bitalg(&mut self) -> bool {
        self.backend.x64_flags.use_avx512bitalg()
    }

    #[inline]
    fn use_avx512vbmi(&mut self) -> bool {
        self.backend.x64_flags.use_avx512vbmi()
    }

    #[inline]
    fn use_lzcnt(&mut self) -> bool {
        self.backend.x64_flags.use_lzcnt()
    }

    #[inline]
    fn use_bmi1(&mut self) -> bool {
        self.backend.x64_flags.use_bmi1()
    }

    #[inline]
    fn use_bmi2(&mut self) -> bool {
        self.backend.x64_flags.use_bmi2()
    }

    #[inline]
    fn use_popcnt(&mut self) -> bool {
        self.backend.x64_flags.use_popcnt()
    }

    #[inline]
    fn use_fma(&mut self) -> bool {
        self.backend.x64_flags.use_fma()
    }

    #[inline]
    fn use_sse3(&mut self) -> bool {
        self.backend.x64_flags.use_sse3()
    }

    #[inline]
    fn use_ssse3(&mut self) -> bool {
        self.backend.x64_flags.use_ssse3()
    }

    #[inline]
    fn use_sse41(&mut self) -> bool {
        self.backend.x64_flags.use_sse41()
    }

    #[inline]
    fn use_sse42(&mut self) -> bool {
        self.backend.x64_flags.use_sse42()
    }

    #[inline]
    fn use_cmpxchg16b(&mut self) -> bool {
        self.backend.x64_flags.use_cmpxchg16b()
    }

    #[inline]
    fn shift_mask(&mut self, ty: Type) -> u8 {
        debug_assert!(ty.lane_bits().is_power_of_two());

        (ty.lane_bits() - 1) as u8
    }

    fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u8 {
        (val.bits() as u8) & self.shift_mask(ty)
    }
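
    // Worked example: for a 64-bit lane type the mask above is 63 (0b11_1111),
    // so a constant shift amount of 68 lowers to `68 & 63 == 4`, which matches
    // Cranelift's semantics of taking shift amounts modulo the lane width.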

    #[inline]
    fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
        let imm = self.i64_from_iconst(val)?;
        Some(GprMemImm::unwrap_new(RegMemImm::Imm {
            simm32: i32::try_from(imm).ok()?.cast_unsigned(),
        }))
    }

    fn sinkable_load(&mut self, val: Value) -> Option<SinkableLoad> {
        if let Some(inst) = self.is_sinkable_inst(val) {
            if let Some((addr_input, offset)) =
                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Min32)
            {
                return Some(SinkableLoad {
                    inst,
                    addr_input,
                    offset,
                });
            }
        }
        None
    }

    fn sinkable_load_exact(&mut self, val: Value) -> Option<SinkableLoad> {
        if let Some(inst) = self.is_sinkable_inst(val) {
            if let Some((addr_input, offset)) =
                is_mergeable_load(self.lower_ctx, inst, MergeableLoadSize::Exact)
            {
                return Some(SinkableLoad {
                    inst,
                    addr_input,
                    offset,
                });
            }
        }
        None
    }

    fn sink_load(&mut self, load: &SinkableLoad) -> SyntheticAmode {
        self.lower_ctx.sink_inst(load.inst);
        let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset);
        SyntheticAmode::Real(addr)
    }

    #[inline]
    fn ext_mode(&mut self, from_bits: u16, to_bits: u16) -> ExtMode {
        ExtMode::new(from_bits, to_bits).unwrap()
    }

    fn emit(&mut self, inst: &MInst) -> Unit {
        self.lower_ctx.emit(inst.clone());
    }

    #[inline]
    fn sse_insertps_lane_imm(&mut self, lane: u8) -> u8 {
        // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
        // shifted into bits 5:6).
        0b00_00_00_00 | lane << 4
    }
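
    // Worked example: for `lane == 2` the immediate above is `2 << 4 == 0x20`,
    // i.e. source element 0 is inserted into destination lane 2 and no lanes
    // are zeroed.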

    #[inline]
    fn synthetic_amode_to_reg_mem(&mut self, addr: &SyntheticAmode) -> RegMem {
        RegMem::mem(addr.clone())
    }

    #[inline]
    fn amode_to_synthetic_amode(&mut self, amode: &Amode) -> SyntheticAmode {
        amode.clone().into()
    }

    #[inline]
    fn const_to_synthetic_amode(&mut self, c: VCodeConstant) -> SyntheticAmode {
        SyntheticAmode::ConstantOffset(c)
    }

    #[inline]
    fn writable_gpr_to_reg(&mut self, r: WritableGpr) -> WritableReg {
        r.to_writable_reg()
    }

    #[inline]
    fn writable_xmm_to_reg(&mut self, r: WritableXmm) -> WritableReg {
        r.to_writable_reg()
    }

    fn ishl_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
        // When the shift amount is known, we can statically (i.e. at compile
        // time) determine the mask to use and only emit that.
        debug_assert!(amt < 8);
        let mask_offset = amt as usize * 16;
        let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
            &I8X16_ISHL_MASKS[mask_offset..mask_offset + 16],
        ));
        SyntheticAmode::ConstantOffset(mask_constant)
    }

    fn ishl_i8x16_mask_table(&mut self) -> SyntheticAmode {
        let mask_table = self
            .lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&I8X16_ISHL_MASKS));
        SyntheticAmode::ConstantOffset(mask_table)
    }

    fn ushr_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
        // When the shift amount is known, we can statically (i.e. at compile
        // time) determine the mask to use and only emit that.
        debug_assert!(amt < 8);
        let mask_offset = amt as usize * 16;
        let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
            &I8X16_USHR_MASKS[mask_offset..mask_offset + 16],
        ));
        SyntheticAmode::ConstantOffset(mask_constant)
    }

    fn ushr_i8x16_mask_table(&mut self) -> SyntheticAmode {
        let mask_table = self
            .lower_ctx
            .use_constant(VCodeConstantData::WellKnown(&I8X16_USHR_MASKS));
        SyntheticAmode::ConstantOffset(mask_table)
    }
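
    // Each row of the mask tables at the bottom of this file is 16 bytes, so a
    // known shift amount of e.g. 3 selects bytes `[48, 64)`: the `0xf8` row for
    // `ishl` and the `0x1f` row for `ushr`.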

    #[inline]
    fn writable_reg_to_xmm(&mut self, r: WritableReg) -> WritableXmm {
        Writable::from_reg(Xmm::unwrap_new(r.to_reg()))
    }

    #[inline]
    fn writable_xmm_to_xmm(&mut self, r: WritableXmm) -> Xmm {
        r.to_reg()
    }

    #[inline]
    fn writable_gpr_to_gpr(&mut self, r: WritableGpr) -> Gpr {
        r.to_reg()
    }

    #[inline]
    fn gpr_to_reg(&mut self, r: Gpr) -> Reg {
        r.into()
    }

    #[inline]
    fn xmm_to_reg(&mut self, r: Xmm) -> Reg {
        r.into()
    }

    #[inline]
    fn xmm_to_xmm_mem_imm(&mut self, r: Xmm) -> XmmMemImm {
        r.into()
    }

    #[inline]
    fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm {
        XmmMemImm::unwrap_new(r.clone().to_reg_mem().into())
    }

    #[inline]
    fn temp_writable_gpr(&mut self) -> WritableGpr {
        self.lower_ctx.temp_writable_gpr()
    }

    #[inline]
    fn temp_writable_xmm(&mut self) -> WritableXmm {
        self.lower_ctx.temp_writable_xmm()
    }

    #[inline]
    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
        RegMemImm::Reg { reg }
    }

    #[inline]
    fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
        XmmMem::unwrap_new(rm.clone())
    }

    #[inline]
    fn gpr_mem_imm_new(&mut self, rmi: &RegMemImm) -> GprMemImm {
        GprMemImm::unwrap_new(rmi.clone())
    }

    #[inline]
    fn xmm_mem_imm_new(&mut self, rmi: &RegMemImm) -> XmmMemImm {
        XmmMemImm::unwrap_new(rmi.clone())
    }

    #[inline]
    fn xmm_to_xmm_mem(&mut self, r: Xmm) -> XmmMem {
        r.into()
    }

    #[inline]
    fn xmm_mem_to_reg_mem(&mut self, xm: &XmmMem) -> RegMem {
        xm.clone().into()
    }

    #[inline]
    fn gpr_mem_to_reg_mem(&mut self, gm: &GprMem) -> RegMem {
        gm.clone().into()
    }

    #[inline]
    fn xmm_new(&mut self, r: Reg) -> Xmm {
        Xmm::unwrap_new(r)
    }

    #[inline]
    fn gpr_new(&mut self, r: Reg) -> Gpr {
        Gpr::unwrap_new(r)
    }

    #[inline]
    fn reg_mem_to_gpr_mem(&mut self, rm: &RegMem) -> GprMem {
        GprMem::unwrap_new(rm.clone())
    }

    #[inline]
    fn reg_to_gpr_mem(&mut self, r: Reg) -> GprMem {
        GprMem::unwrap_new(RegMem::reg(r))
    }

    #[inline]
    fn gpr_to_gpr_mem(&mut self, gpr: Gpr) -> GprMem {
        GprMem::from(gpr)
    }

    #[inline]
    fn gpr_to_gpr_mem_imm(&mut self, gpr: Gpr) -> GprMemImm {
        GprMemImm::from(gpr)
    }

    #[inline]
    fn type_register_class(&mut self, ty: Type) -> Option<RegisterClass> {
        if is_int_or_ref_ty(ty) || ty == I128 {
            Some(RegisterClass::Gpr {
                single_register: ty != I128,
            })
        } else if ty.is_float() || (ty.is_vector() && ty.bits() <= 128) {
            Some(RegisterClass::Xmm)
        } else {
            None
        }
    }

    #[inline]
    fn ty_int_bool_or_ref(&mut self, ty: Type) -> Option<()> {
        match ty {
            types::I8 | types::I16 | types::I32 | types::I64 => Some(()),
            _ => None,
        }
    }

    #[inline]
    fn intcc_to_cc(&mut self, intcc: &IntCC) -> CC {
        CC::from_intcc(*intcc)
    }

    #[inline]
    fn cc_invert(&mut self, cc: &CC) -> CC {
        cc.invert()
    }

    #[inline]
    fn cc_nz_or_z(&mut self, cc: &CC) -> Option<CC> {
        match cc {
            CC::Z => Some(*cc),
            CC::NZ => Some(*cc),
            _ => None,
        }
    }

    #[inline]
    fn sum_extend_fits_in_32_bits(
        &mut self,
        extend_from_ty: Type,
        constant_value: Imm64,
        offset: Offset32,
    ) -> Option<u32> {
        let offset: i64 = offset.into();
        let constant_value: u64 = constant_value.bits() as u64;
        // If necessary, zero extend `constant_value` up to 64 bits.
        let shift = 64 - extend_from_ty.bits();
        let zero_extended_constant_value = (constant_value << shift) >> shift;
        // Sum up the two operands.
        let sum = offset.wrapping_add(zero_extended_constant_value as i64);
        // Check that the sum will fit in 32-bits.
        if sum == ((sum << 32) >> 32) {
            Some(sum as u32)
        } else {
            None
        }
    }
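
    // Worked example: an I8 constant of -1 zero-extends to 255, so with an
    // offset of 16 the sum is 271, which fits in 32 bits and is returned as
    // `Some(271)`.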

    #[inline]
    fn amode_offset(&mut self, addr: &Amode, offset: i32) -> Amode {
        addr.offset(offset)
    }

    #[inline]
    fn zero_offset(&mut self) -> Offset32 {
        Offset32::new(0)
    }

    #[inline]
    fn preg_rbp(&mut self) -> PReg {
        regs::rbp().to_real_reg().unwrap().into()
    }

    #[inline]
    fn preg_rsp(&mut self) -> PReg {
        regs::rsp().to_real_reg().unwrap().into()
    }

    #[inline]
    fn preg_pinned(&mut self) -> PReg {
        regs::pinned_reg().to_real_reg().unwrap().into()
    }

    fn libcall_1(&mut self, libcall: &LibCall, a: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    fn libcall_2(&mut self, libcall: &LibCall, a: Reg, b: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a), ValueRegs::one(b)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    fn libcall_3(&mut self, libcall: &LibCall, a: Reg, b: Reg, c: Reg) -> Reg {
        let outputs = emit_vm_call(
            self.lower_ctx,
            &self.backend.flags,
            &self.backend.triple,
            *libcall,
            &[ValueRegs::one(a), ValueRegs::one(b), ValueRegs::one(c)],
        )
        .expect("Failed to emit LibCall");

        debug_assert_eq!(outputs.len(), 1);

        outputs[0].only_reg().unwrap()
    }

    #[inline]
    fn vconst_all_ones_or_all_zeros(&mut self, constant: Constant) -> Option<()> {
        let const_data = self.lower_ctx.get_constant_data(constant);
        if const_data.iter().all(|&b| b == 0 || b == 0xFF) {
            return Some(());
        }
        None
    }

    #[inline]
    fn shuffle_0_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
            .map(|b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn shuffle_0_15_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn shuffle_16_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask
            .iter()
            .map(|&b| b.wrapping_sub(16))
            .map(|b| if b > 15 { 0b10000000 } else { b })
            .collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    #[inline]
    fn perm_from_mask_with_zeros(
        &mut self,
        mask: &VecMask,
    ) -> Option<(VCodeConstant, VCodeConstant)> {
        if !mask.iter().any(|&b| b > 31) {
            return None;
        }

        let zeros = mask
            .iter()
            .map(|&b| if b > 31 { 0x00 } else { 0xff })
            .collect();

        Some((
            self.perm_from_mask(mask),
            self.lower_ctx
                .use_constant(VCodeConstantData::Generated(zeros)),
        ))
    }

    #[inline]
    fn perm_from_mask(&mut self, mask: &VecMask) -> VCodeConstant {
        let mask = mask.iter().cloned().collect();
        self.lower_ctx
            .use_constant(VCodeConstantData::Generated(mask))
    }

    fn xmm_mem_to_xmm_mem_aligned(&mut self, arg: &XmmMem) -> XmmMemAligned {
        match XmmMemAligned::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMem::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }

    fn xmm_mem_imm_to_xmm_mem_aligned_imm(&mut self, arg: &XmmMemImm) -> XmmMemAlignedImm {
        match XmmMemAlignedImm::new(arg.clone().into()) {
            Some(aligned) => aligned,
            None => match arg.clone().into() {
                RegMemImm::Mem { addr } => self.load_xmm_unaligned(addr).into(),
                _ => unreachable!(),
            },
        }
    }

    fn pshufd_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }
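
    // Worked example: a 32-bit shuffle selecting lanes `(3, 2, 1, 0)` from the
    // left operand encodes as `3 | (2 << 2) | (1 << 4) | (0 << 6) == 0x1b`,
    // the "reverse the four lanes" `pshufd` immediate.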

    fn pshufd_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        // When selecting from the right-hand-side, subtract these all by 4
        // which will bail out if anything is less than 4. Afterwards the check
        // is the same as `pshufd_lhs_imm` above.
        let a = a.checked_sub(4)?;
        let b = b.checked_sub(4)?;
        let c = c.checked_sub(4)?;
        let d = d.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn shufps_imm(&mut self, imm: Immediate) -> Option<u8> {
        // The `shufps` instruction selects the first two elements from the
        // first vector and the second two elements from the second vector, so
        // offset the third/fourth selectors by 4 and then make sure everything
        // fits in 32-bits.
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        let c = c.checked_sub(4)?;
        let d = d.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn shufps_rev_imm(&mut self, imm: Immediate) -> Option<u8> {
        // This is almost the same as `shufps_imm` except the elements that are
        // subtracted are reversed. This handles the case where the `shufps`
        // instruction can be emitted if the order of the operands is swapped.
        let (a, b, c, d) = self.shuffle32_from_imm(imm)?;
        let a = a.checked_sub(4)?;
        let b = b.checked_sub(4)?;
        if a < 4 && b < 4 && c < 4 && d < 4 {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshuflw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Similar to `shufps` except this operates over 16-bit values so four
        // of them must be fixed and the other four must be in-range to encode
        // in the immediate.
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshuflw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let a = a.checked_sub(8)?;
        let b = b.checked_sub(8)?;
        let c = c.checked_sub(8)?;
        let d = d.checked_sub(8)?;
        let e = e.checked_sub(8)?;
        let f = f.checked_sub(8)?;
        let g = g.checked_sub(8)?;
        let h = h.checked_sub(8)?;
        if a < 4 && b < 4 && c < 4 && d < 4 && [e, f, g, h] == [4, 5, 6, 7] {
            Some(a | (b << 2) | (c << 4) | (d << 6))
        } else {
            None
        }
    }

    fn pshufhw_lhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Similar to `pshuflw` except that the first four operands must be
        // fixed and the second four are offset by an extra 4 and tested to
        // make sure they're all in the range [4, 8).
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let e = e.checked_sub(4)?;
        let f = f.checked_sub(4)?;
        let g = g.checked_sub(4)?;
        let h = h.checked_sub(4)?;
        if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] {
            Some(e | (f << 2) | (g << 4) | (h << 6))
        } else {
            None
        }
    }

    fn pshufhw_rhs_imm(&mut self, imm: Immediate) -> Option<u8> {
        // Note that everything here is offset by at least 8 and the upper
        // bits are offset by 12 to test they're in the range of [12, 16).
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
        let a = a.checked_sub(8)?;
        let b = b.checked_sub(8)?;
        let c = c.checked_sub(8)?;
        let d = d.checked_sub(8)?;
        let e = e.checked_sub(12)?;
        let f = f.checked_sub(12)?;
        let g = g.checked_sub(12)?;
        let h = h.checked_sub(12)?;
        if e < 4 && f < 4 && g < 4 && h < 4 && [a, b, c, d] == [0, 1, 2, 3] {
            Some(e | (f << 2) | (g << 4) | (h << 6))
        } else {
            None
        }
    }

    fn palignr_imm_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
        let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();

        if bytes.windows(2).all(|a| a[0] + 1 == a[1]) {
            Some(bytes[0])
        } else {
            None
        }
    }
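
    // Worked example: a byte shuffle of `[4, 5, 6, ..., 19]` selects 16
    // consecutive bytes starting at offset 4 of the concatenated inputs, so it
    // can be lowered to `palignr` with an immediate of 4.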

    fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
        // First make sure that the shuffle immediate is selecting 16-bit lanes.
        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;

        // Next build up an 8-bit mask from each of the bits of the selected
        // lanes above. This instruction can only be used when each lane
        // selector chooses from the corresponding lane in either of the two
        // operands, meaning the Nth lane selection must satisfy `lane % 8 ==
        // N`.
        //
        // This helper closure is used to calculate the value of the
        // corresponding bit.
        let bit = |x: u8, c: u8| {
            if x % 8 == c {
                if x < 8 { Some(0) } else { Some(1 << c) }
            } else {
                None
            }
        };
        Some(
            bit(a, 0)?
                | bit(b, 1)?
                | bit(c, 2)?
                | bit(d, 3)?
                | bit(e, 4)?
                | bit(f, 5)?
                | bit(g, 6)?
                | bit(h, 7)?,
        )
    }

    fn xmi_imm(&mut self, imm: u32) -> XmmMemImm {
        XmmMemImm::unwrap_new(RegMemImm::imm(imm))
    }

    fn insert_i8x16_lane_hole(&mut self, hole_idx: u8) -> VCodeConstant {
        let mask = -1i128 as u128;
        self.emit_u128_le_const(mask ^ (0xff << (hole_idx * 8)))
    }

    fn writable_invalid_gpr(&mut self) -> WritableGpr {
        let reg = Gpr::new(self.invalid_reg()).unwrap();
        WritableGpr::from_reg(reg)
    }

    fn box_synthetic_amode(&mut self, amode: &SyntheticAmode) -> BoxSyntheticAmode {
        Box::new(amode.clone())
    }

    ////////////////////////////////////////////////////////////////////////////
    ///// External assembler methods.
    ////////////////////////////////////////////////////////////////////////////

    fn is_imm8(&mut self, src: &GprMemImm) -> Option<u8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i8::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_imm8_xmm(&mut self, src: &XmmMemImm) -> Option<u8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i8::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_simm8(&mut self, src: &GprMemImm) -> Option<i8> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(i8::try_from(simm32.cast_signed()).ok()?),
            _ => None,
        }
    }

    fn is_imm16(&mut self, src: &GprMemImm) -> Option<u16> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => {
                Some(i16::try_from(simm32.cast_signed()).ok()?.cast_unsigned())
            }
            _ => None,
        }
    }

    fn is_simm16(&mut self, src: &GprMemImm) -> Option<i16> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(i16::try_from(simm32.cast_signed()).ok()?),
            _ => None,
        }
    }

    fn is_imm32(&mut self, src: &GprMemImm) -> Option<u32> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(simm32),
            _ => None,
        }
    }

    fn is_simm32(&mut self, src: &GprMemImm) -> Option<i32> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Imm { simm32 } => Some(simm32 as i32),
            _ => None,
        }
    }

    fn is_gpr(&mut self, src: &GprMemImm) -> Option<Gpr> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => Gpr::new(reg),
            _ => None,
        }
    }

    fn is_xmm(&mut self, src: &XmmMem) -> Option<Xmm> {
        match src.clone().to_reg_mem() {
            RegMem::Reg { reg } => Xmm::new(reg),
            _ => None,
        }
    }
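
    // Worked example: an immediate of -8 is stored as `simm32 == 0xffff_fff8`,
    // so `is_simm8` returns `Some(-8)` and `is_imm8` returns `Some(0xf8)`,
    // while an immediate of 200 does not fit in `i8` and both return `None`.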

    fn is_gpr_mem(&mut self, src: &GprMemImm) -> Option<GprMem> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => GprMem::new(RegMem::Reg { reg }),
            RegMemImm::Mem { addr } => GprMem::new(RegMem::Mem { addr }),
            _ => None,
        }
    }

    fn is_xmm_mem(&mut self, src: &XmmMemImm) -> Option<XmmMem> {
        match src.clone().to_reg_mem_imm() {
            RegMemImm::Reg { reg } => XmmMem::new(RegMem::Reg { reg }),
            RegMemImm::Mem { addr } => XmmMem::new(RegMem::Mem { addr }),
            _ => None,
        }
    }

    fn is_mem(&mut self, src: &XmmMem) -> Option<SyntheticAmode> {
        match src.clone().to_reg_mem() {
            RegMem::Reg { .. } => None,
            RegMem::Mem { addr } => Some(addr),
        }
    }

    // Custom constructors for `mulx` which only calculate the high half of the
    // result, meaning that the same output operand is used for both destination
    // registers. This is in contrast to the assembler-generated version of this
    // instruction, which allocates two distinct temporary registers for the
    // outputs and calculates both the high and low halves of the result.

    fn x64_mulxl_rvm_hi(&mut self, src1: &GprMem, src2: Gpr) -> Gpr {
        let ret = self.temp_writable_gpr();
        let src1 = self.convert_gpr_mem_to_assembler_read_gpr_mem(src1);
        let inst = asm::inst::mulxl_rvm::new(ret, ret, src1, src2);
        self.emit(&MInst::External { inst: inst.into() });
        ret.to_reg()
    }

    fn x64_mulxq_rvm_hi(&mut self, src1: &GprMem, src2: Gpr) -> Gpr {
        let ret = self.temp_writable_gpr();
        let src1 = self.convert_gpr_mem_to_assembler_read_gpr_mem(src1);
        let inst = asm::inst::mulxq_rvm::new(ret, ret, src1, src2);
        self.emit(&MInst::External { inst: inst.into() });
        ret.to_reg()
    }

    fn bt_imm(&mut self, val: u64) -> Option<u8> {
        if val.count_ones() == 1 {
            Some(u8::try_from(val.trailing_zeros()).unwrap())
        } else {
            None
        }
    }
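
    // Worked example: `bt_imm(0x8000_0000)` sees a single set bit and returns
    // `Some(31)`, the bit index to test, while `bt_imm(0b110)` has two bits
    // set and returns `None`.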
}

impl IsleContext<'_, '_, MInst, X64Backend> {
    fn load_xmm_unaligned(&mut self, addr: SyntheticAmode) -> Xmm {
        let tmp = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
        self.lower_ctx.emit(MInst::External {
            inst: asm::inst::movdqu_a::new(
                Writable::from_reg(Xmm::unwrap_new(tmp.to_reg())),
                asm::XmmMem::Mem(addr.into()),
            )
            .into(),
        });
        Xmm::unwrap_new(tmp.to_reg())
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_to_assembler_read_write_gpr(&mut self, read: Gpr) -> asm::Gpr<PairedGpr> {
        let write = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let write = WritableGpr::from_writable_reg(write).unwrap();
        asm::Gpr::new(PairedGpr { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_to_assembler_fixed_read_write_gpr<const E: u8>(
        &mut self,
        read: Gpr,
    ) -> asm::Fixed<PairedGpr, E> {
        let write = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
        let write = WritableGpr::from_writable_reg(write).unwrap();
        asm::Fixed(PairedGpr { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_to_assembler_read_write_xmm(&mut self, read: Xmm) -> asm::Xmm<PairedXmm> {
        let write = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
        let write = WritableXmm::from_writable_reg(write).unwrap();
        asm::Xmm::new(PairedXmm { read, write })
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_read_gpr_mem(&self, read: &GprMem) -> asm::GprMem<Gpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(Gpr::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_read_xmm_mem_aligned(
        &self,
        read: &XmmMemAligned,
    ) -> asm::XmmMem<Xmm, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Xmm::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_read_xmm_mem(&self, read: &XmmMem) -> asm::XmmMem<Xmm, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Xmm::new(reg).unwrap()),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_write_xmm_mem(
        &self,
        write: &XmmMem,
    ) -> asm::XmmMem<Writable<Xmm>, Gpr> {
        match write.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Writable::from_reg(Xmm::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_xmm_mem_to_assembler_write_xmm_mem_aligned(
        &self,
        write: &XmmMemAligned,
    ) -> asm::XmmMem<Writable<Xmm>, Gpr> {
        match write.clone().into() {
            RegMem::Reg { reg } => asm::XmmMem::Xmm(Writable::from_reg(Xmm::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::XmmMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_read_write_gpr_mem(
        &mut self,
        read: &GprMem,
    ) -> asm::GprMem<PairedGpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(
                *self
                    .convert_gpr_to_assembler_read_write_gpr(Gpr::new(reg).unwrap())
                    .as_ref(),
            ),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_gpr_mem_to_assembler_write_gpr_mem(
        &mut self,
        read: &GprMem,
    ) -> asm::GprMem<WritableGpr, Gpr> {
        match read.clone().into() {
            RegMem::Reg { reg } => asm::GprMem::Gpr(WritableGpr::from_reg(Gpr::new(reg).unwrap())),
            RegMem::Mem { addr } => asm::GprMem::Mem(addr.into()),
        }
    }

    /// Helper used by code generated by the `cranelift-assembler-x64` crate.
    fn convert_amode_to_assembler_amode(&mut self, amode: &SyntheticAmode) -> asm::Amode<Gpr> {
        amode.clone().into()
    }
}

// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
// need to fix up the bits that migrate from one half of the lane to the
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
// right by 0 (no movement), we want to retain all the bits so we mask with
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
// we mask with `0x7f`; etc.

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_ISHL_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_USHR_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
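
// For example, lowering `ushr.i8x16` by a constant 2 does the shift at 16-bit
// granularity and then loads row 2 of `I8X16_USHR_MASKS` (all `0x3f`) via
// `ushr_i8x16_mask_for_const` to clear the two bits that migrated in from the
// neighboring byte.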